def _preprocess(self, text, lang):
    dic_query = {}
    s_tags = XmlUtils.extract_tags(text)
    if not s_tags:
        dic_query['query'] = text
    else:
        dic_query['query'] = XmlUtils.strip_tags(text)  # Strip tags to do the match
    dic_query['tokenizer'] = TMUtilsMatching.pre_process(dic_query['query'], self.src_lang, 'tokenizer', {})
    dic_query['pos'] = TMUtilsMatching.pre_process(dic_query['tokenizer'], lang, 'pos_tagger', {})
    dic_query['universal'] = TMUtilsMatching.segment_2_universal(dic_query['tokenizer'].lower(), dic_query['pos'], lang)  # universal_text[0]
    dic_query['universal'] = dic_query['pos']
    regex_class = TMRegexMatch(self.src_lang, self.tgt_lang)  # Class to improve fuzzy match
    dic_query['query_re'] = TMUtilsMatching.pre_process(dic_query['tokenizer'], self.src_lang, 'reg_exp', regex_class.re_pp)
    return dic_query
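# Illustrative sketch only (not part of the original module): the rough shape of the dictionary
# built by _preprocess for a query such as "I have 2 cats". The exact value formats are
# assumptions, except that 'pos' is a space-separated tag string (it is consumed later with
# .split(' ')) and 'universal' is overwritten with 'pos' as written above; the placeholder in
# 'query_re' is hypothetical.
_example_dic_query = {
    'query': 'I have 2 cats',
    'tokenizer': 'I have 2 cats',        # tokenized, space-separated text
    'pos': 'PRON VERB NUM NOUN',         # space-separated posTag string (tag names hypothetical)
    'universal': 'PRON VERB NUM NOUN',   # same value as 'pos' after the overwrite above
    'query_re': 'I have NUM cats',       # hypothetical placeholder produced by the 'reg_exp' step
}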
def check_query_parameters(self):
    if 'pos' not in self.query_dic:  # Apply pos and universal on the query --> only the first time
        if 'tokenizer' not in self.query_dic:  # The first transformation is posTag --> nothing else was applied yet
            query_out_tags = XmlUtils.replace_tags(self.query_dic['query'])
            self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
        self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
    return self.query_dic['query'], self.query_dic['tokenizer'], self.query_dic['pos']
def _preprocess(self):
    self.query_dic['query'] = self.query
    if re.search("<.*>", self.query):
        # Normalize tags --> "Yo tengo un <b>gato</b>." --> "Yo tengo un <T1>gato</T1>."
        self.query_dic['query_tags'] = TMUtilsMatching.pre_process(self.query, (self.src_lang, self.tgt_lang), 'tags', {})
        self.query_dic['query'] = self.query_dic['query_tags']  # The query now carries the normalized tags <T1>gato</T1>
    if 'regex' in self.pipe:
        self.query_dic['query_re'] = TMUtilsMatching.pre_process(self.query_dic['query'], self.src_lang, 'reg_exp', self.match['regex'].re_pp)
    else:
        self.query_dic['query_re'] = self.query_dic['query']
    self.query_dic['query_re_reduce'] = TMRegexMatch.simplified_name(self.query_dic['query_re'])
    return self.query_dic
def _deals_output(self, segment, editD, trans_segments, status_tokenizer, status):
    if self.out == 'moses':  # Moses output must be tokenized
        if not status_tokenizer:  # Tokenize the output
            segment.source_text = TMUtilsMatching.pre_process(segment.source_text, self.src_lang, 'tokenizer', {})
            segment.target_text = TMUtilsMatching.pre_process(segment.target_text, self.tgt_lang, 'tokenizer', {})
        trans_segments.append((segment, editD))
        return trans_segments, 'break'
    else:
        if status_tokenizer:  # TM output must be untokenized
            segment.target_text = TMUtilsMatching.pre_process(segment.target_text.split(' '), self.tgt_lang, 'untokenizer', {})
            segment.source_text = TMUtilsMatching.pre_process(segment.source_text.split(' '), self.src_lang, 'untokenizer', {})
        trans_segments.append((segment, editD))
        if status == 'translate':
            status = 'break'
        else:
            status = 'continue'
        # if editD == 100:  # Enable this check to reduce matching time
        #     status = 'break'
        logging.info("Final Output (Query -- Source -- Target): {} {} {}".format(
            safe_str(self.query_dic['query'] + ' -- '), safe_str(segment.source_text + ' -- '), safe_str(segment.target_text)))
        return trans_segments, status
def clause_chunk(self, text):
    dicSegment_Rules = {}
    # Check whether the posTags need to be converted to the universal tagset
    if self.class_lang == 'generic_geral' and self.lang.upper() not in TMUtilsMatching.pre_process(' ', self.lang.upper(), 'get_lang_universalPOS', {}):
        text_Universal = TMUtilsMatching.pre_process([[[word, pos] for word, pos in text]], self.lang, 'universal_pos_tagger', {})
        if not text_Universal:  # If there is any problem with the universal posTag
            return [Tree('S', text)]
        text = [(word, pos) for word, pos in text_Universal]
    # Run each rule
    for r in self.order:
        if r == 'initial':
            lSentences = [text]  # --> Initial list of segments to be processed
        else:
            chunkO = RegexpChunkParser(self.rules[r].get_rule_obj(), chunk_label='splitPoint', root_label='S')  # Create the chunk object
            # Chunk the segments --> apply each rule recursively
            lChunk_Segments = lSentences
            len_actual = 0  # --> Controls the number of splits
            len_previous = len(lSentences)
            while len_actual != len_previous:
                len_previous = len(lChunk_Segments)
                lChunk_Segments = self._recursive_rule(lChunk_Segments, chunkO)
                len_actual = len(lChunk_Segments)
            dicSegment_Rules[r] = lChunk_Segments
            lSentences = lChunk_Segments  # --> Load all chunks obtained by this rule
    self.timer.print()
    return dicSegment_Rules['last']
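# Minimal self-contained sketch (not part of the original module) of the fixpoint loop used
# above: a split function is re-applied until the number of segments stops changing.
# 'toy_split' is a hypothetical stand-in for one chunking rule applied by _recursive_rule.
def _fixpoint_split_demo(segments):
    def toy_split(segs):
        out = []
        for s in segs:
            # Split at the first comma only, mimicking a single rule application
            head, sep, tail = s.partition(',')
            out.extend([head, tail] if sep else [s])
        return [s.strip() for s in out if s.strip()]

    len_previous, len_actual = len(segments), -1
    while len_actual != len_previous:
        len_previous = len(segments)
        segments = toy_split(segments)
        len_actual = len(segments)
    return segments

# _fixpoint_split_demo(['I came, I saw, I conquered']) --> ['I came', 'I saw', 'I conquered']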
def _validate_pipe(self, pipe):
    match_process = {
        'regex': None,
        'posTag': None,
        'tags': TMTags()
    }
    try:
        match_process['regex'] = TMRegexMatch(self.src_lang, self.tgt_lang)
        logging.info("Loading regex for matching")
    except ValueError:
        if 'regex' in pipe:
            pipe.pop(pipe.index('regex'))
        logging.info("Unsupported regex for matching")
    query_out_tags = XmlUtils.replace_tags(self.query)
    try:
        if 'tokenizer' not in self.query_dic:
            self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
        logging.info("Loading Tokenizer for {}".format(self.src_lang))
        try:
            if 'pos' not in self.query_dic:
                self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
            match_process['posTag'] = TMPosMatch(self.src_lang, self.tgt_lang)
            logging.info("Loading posTag for matching")
        except Exception:
            if 'posTag' in pipe:
                pipe.pop(pipe.index('posTag'))
            logging.info("Unsupported posTag for matching")
    except Exception:
        if 'posTag' in pipe:
            pipe.pop(pipe.index('posTag'))
        logging.info("Unsupported Tokenizer for {}".format(self.src_lang))
    return match_process, pipe
def _prepare_target_text(self, query, segment, translation, source_lang, target_lang):
    segment.source_text = query
    segment.domain = []
    segment.file_name = []
    if re.search("</?[^<>]+/?>", query) is not None:  # If there are tags in the query
        tgt_tags = TMUtilsMatching.transfer_tags(segment.source_text, translation, (source_lang, target_lang))
        segment.target_text = TMUtilsMatching.pre_process(tgt_tags.split(' '), target_lang, 'untokenizer', {})
    else:
        segment.target_text = translation.strip('\n')
    logging.info("Translate less minimum_match : {} {}".format(segment.source_text + ' -- ', translation))
    return segment
def _match_rank(self, best_segments):
    self.timer.start("rank segments")
    editD_score = []
    if 'query_tags' in self.query_dic:
        # Simplify tags: "Yo tengo un <T1>gato</T1>." --> "Yo tengo un T gato T."
        query = TMUtilsMatching.reduce_tags(self.query_dic['query_tags'])
    else:
        query = self.query_dic['query']
    for i in range(0, len(best_segments)):
        segment = best_segments[i]
        # Simplify tags in the TM source
        if re.search("</?T[0-9]*/?>", segment[0].source_text):
            src_text = TMUtilsMatching.reduce_tags(segment[0].source_text)
        else:
            src_text = segment[0].source_text
        # Apply the regular expressions and simplify their names
        if 'regex' in self.pipe:
            src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
        else:
            src_re = src_text
        src_re_reduce = TMRegexMatch.simplified_name(src_re)
        best_segments[i] = (segment[0], segment[1], src_re, src_re_reduce)
        # Edit distance computed on the tag-simplified forms (previously TMUtilsMatching._edit_distance(query, src_text))
        editD_score.append(self._tm_edit_distance(query, src_text, self.query_dic['query_re_reduce'], src_re_reduce))
    self.timer.stop("rank segments")
    return sorted(zip(best_segments, editD_score), key=operator.itemgetter(1), reverse=True)
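# Minimal self-contained sketch (not part of the original module) of the ranking pattern used
# in _match_rank: score every candidate against the query and sort best-first. difflib is a
# hypothetical stand-in for the project's _tm_edit_distance scorer.
import difflib
import operator

def _rank_demo(query, candidates):
    scores = [round(100 * difflib.SequenceMatcher(None, query, c).ratio(), 2) for c in candidates]
    # Returns (candidate, score) pairs, highest score first, like the sorted(zip(...)) call above
    return sorted(zip(candidates, scores), key=operator.itemgetter(1), reverse=True)

# _rank_demo('I have a cat', ['I have a dog', 'I had a cat', 'Cats are nice'])
# --> the closest candidate ('I had a cat') is ranked first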
def _match(self):
    # Create dictionaries with the query info (posTag and universal)
    if self.split_type == 'sentence':
        list_info_query = [{'tokenizer': self.list_query[j]} for j in range(0, len(self.list_query))]
    else:
        list_info_query = [{'tokenizer': self.list_query[j], 'pos': self.list_pos[j]} for j in range(0, len(self.list_query))]
    # Query Elasticsearch --> out='moses' to return only one segment per sub-query
    l_best_segments = self.tmdb_api.query(
        [TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in self.list_query],
        list_info_query, (self.src_lang, self.tgt_lang),
        pipe=['regex', 'tags', 'posTag'], out='moses', limit=5, domains=None,
        min_match=80, concordance=False, aut_trans=False, exact_length=False)
    join_source = ''
    join_target = ''
    total_match = 0
    for i in range(0, len(l_best_segments)):
        if l_best_segments[i]:
            segment, match = l_best_segments[i][0]
            join_source = join_source + ' ' + segment.source_text
            join_target = join_target + ' ' + segment.target_text
        else:
            join_source = join_source + ' ' + self.list_query[i]
            join_target = join_target + ' ' + self.list_query[i]
            match = 0
        total_match = total_match + match
        if self.split_type == 'phrase':
            if self.list_marks:
                if self.list_marks[0]:
                    mark = self.list_marks.pop(0)[0]
                    join_source = join_source + ' ' + mark
                    join_target = join_target + ' ' + mark
    total_match = total_match / len(self.list_query)
    # print(join_source + ' ---- ' + join_target + ' ---- ' + str(total_match))
    return join_source, join_target, int(math.floor(total_match))
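# Worked example (sketch, not part of the original module) of the score aggregation above:
# each sub-query contributes its best fuzzy-match score, sub-queries with no hit count as 0,
# and the final score is the floor of the plain average over all sub-queries.
import math

_sub_scores = [92, 0, 85]                             # hypothetical per-sub-query match scores
_total_match = sum(_sub_scores) / len(_sub_scores)    # (92 + 0 + 85) / 3 = 59.0
_final_match = int(math.floor(_total_match))          # --> 59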
def execute_segment(self, segment, src_re, src_re_reduce, ini_editD, align_features, equal):
    logging.info("Applied match PIPE")
    tgt_text = segment.target_text
    src_text = segment.source_text
    status = ''
    editD = ini_editD
    status_tokenizer = False
    if equal:
        if self.query == src_text:
            return segment, editD, 'find', equal, status_tokenizer
        else:
            equal = False
    if not equal:
        for op in self.pipe:  # Operations indicated by parameter
            if op == 'regex':
                if self.query_dic['query'] != self.query_dic['query_re']:  # If the query contains regex matches
                    # and not TMMatching.check_upper_equal(self.query_dic['query'], self.query_dic['query_re'])
                    logging.info("Applied Regex")
                    self.timer.start("_regx_match")
                    # ************************** Compare query_re with src_re --> simplified
                    match = ini_editD
                    if src_re != src_text:
                        if src_re_reduce.lower() == self.query_dic['query_re_reduce'].lower():
                            # Equal after simplifying the regular expressions and lowercasing
                            match = 100  # Perfect match
                        tgt_text, src_text = self._regex_transform(segment.source_text, segment.target_text)
                        ini_editD = self._tm_edit_distance(self.query_dic['query'], src_text, self.query_dic['query_re_reduce'], src_re_reduce)  # match
                        logging.info("After applied Regex Segment: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                    if match == 100:
                        status = 'find'
                    self.timer.stop("_regx_match")
            if op == 'tags':
                logging.info("Delete Tags")
                self.timer.start("_tags_match")
                src_text, tgt_text, status, reduce, ini_editD = self._match_tags(src_text, src_re_reduce, tgt_text, status, ini_editD)
                logging.info("After applied Tags: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                self.timer.stop("_tags_match")
            if op == 'posTag':
                self.timer.start("fuzzy_match")
                upper = False
                if segment.source_pos is not None and segment.target_pos is not None:  # This part needs the posTag annotation
                    squery, tok_query, pos_query = self.check_query_parameters()
                    logging.info("Apply posTag matching")
                    self.timer.start("fuzzy_preprocess")
                    if not status_tokenizer:  # Tokenize source and target
                        tgt_text = TMUtilsMatching.pre_process(tgt_text, self.tgt_lang, 'tokenizer', {})  # Tokenize tm_tgt
                        src_text = TMUtilsMatching.pre_process(src_text, self.src_lang, 'tokenizer', {})  # Tokenize tm_src
                        self.query_dic['query_re_reduce_tok'] = TMUtilsMatching.pre_process(self.query_dic['query_re_reduce'], self.src_lang, 'tokenizer', {})  # Tokenize the simplified query
                        status_tokenizer = True
                    if 'universal' not in self.query_dic:
                        self.query_dic['universal'] = TMUtilsMatching.segment_2_universal(tok_query.lower(), pos_query, self.src_lang)
                    src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), segment.source_pos, self.src_lang)  # [word, pos] of the tm_src segment
                    tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), segment.target_pos, self.tgt_lang)  # [word, pos] of the tm_tgt segment
                    self.timer.stop("fuzzy_preprocess")
                    if isinstance(self.query_dic['universal'], list) and isinstance(src_word_pos, list) and isinstance(tgt_word_pos, list):
                        logging.info("Check unmatched word --> posTag")
                        if TMUtilsMatching.len_compare(pos_query.split(' '), segment.source_pos.split(' ')) is True and (tok_query != src_text):
                            logging.info("Query and source have the same length or only one difference")
                            self.timer.start("search unmatch")
                            tgt_un_match, tgt_position, operation, src_un_match, src_position, pos_tag = self._combine_feature_match(tok_query, src_word_pos, tgt_word_pos, align_features)
                            self.timer.stop("search unmatch")
                            logging.info("Unmatched word and operation: {} {} {}".format(safe_str(src_un_match), safe_str(operation), safe_str(ini_editD)))
                            self.timer.start("create target unmatch")
                            if src_un_match is not None:
                                # Create the new src
                                src_text, upper = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source', upper, pos_tag)
                                # Improve the edit distance
                                src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
                                src_re_reduce = TMRegexMatch.simplified_name(src_re)
                                penalize_match = self._improve_match(src_un_match, operation)
                                ini_editD = self._tm_edit_distance(tok_query.lower(), src_text.lower(), self.query_dic['query_re_reduce_tok'].lower(), src_re_reduce.lower()) - penalize_match  # match
                            # Create the new tgt
                            if tgt_un_match is not None:
                                tgt_text, upper = self._create_target_expression(tgt_text, tgt_position, operation, tgt_un_match, 'target', upper, pos_tag)  # tgt_word,
                            self.timer.stop("create target unmatch")
                    logging.info("After applied posTag: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                self.timer.stop("fuzzy_match")
            # Check whether some transformation found a match or asked to break
            if ini_editD > editD:
                editD = ini_editD
            if status == 'find' or status == 'break':
                segment.source_text = src_text
                segment.target_text = tgt_text
                return segment, editD, status, equal, status_tokenizer
    if editD >= self.min_match:
        segment.source_text = src_text
        segment.target_text = tgt_text
        status = 'find'
    else:
        # Call the split rules
        if 'split' in self.pipe and not self.trans_segments:  # Apply split if a posTagger exists for the source language  # and self.query_dic['pos']
            src_text = None
            tgt_text = None
            editSplit = 0
            # Split into sentences
            list_sentences = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'split_sentences', {})
            logging.info("Split by Sentences : {} ".format(list_sentences))
            # Check the sentence-level split first
            if len(list_sentences) > 1:
                split_match = TMSplitMatch(
                    [TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in list_sentences],
                    [], self.src_lang, self.tgt_lang, 'sentence', self.machine_translation, self.domain)
                src_text, tgt_text, editSplit = split_match._match()
                # print('*****Only sentences *****')
                # print(src_text)
                # print(tgt_text)
                # print(editSplit)
            if editSplit >= self.min_match:  # Check if the split method returned segments from ActivaTM
                segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit
            else:
                # Split into smaller phrases
                # Check if split rules exist for this specific language pair
                lang_class = G_CONFIG.get_split_rules(self.src_lang, self.tgt_lang)
                if lang_class:
                    logging.info("Split Query by Phrase")
                    all_split, all_marks = self._splitByPhrase(lang_class, list_sentences)
                    # Check if any split rule was applied
                    if len(all_split) > 1:
                        split_match = TMSplitMatch(all_split, all_marks, self.src_lang, self.tgt_lang, 'phrase', self.machine_translation, self.domain)
                        src_text, tgt_text, editSplit = split_match._match()
                        if editSplit >= self.min_match:  # Check if the split method returned segments from ActivaTM
                            segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit
            if editD >= self.min_match:
                status = 'find'
                status_tokenizer = True
        else:
            if not self.trans_segments:
                # If no match was found, prepare the segment for automatic translation.
                # If automatic translation is not available either, [] is returned.
                # logging.info("Prepare Automatic Translation : ")
                self.trans_segments.append((segment, editD))
            status = 'break'  # If a segment is already on the list, break the loop; there is no translation
    return segment, editD, status, equal, status_tokenizer
def execute(self, threshold, l_best_segments, match_process, align_features, concordance):  # , output
    self.timer.start("preprocess")
    query_dic = self._preprocess(self.query, self.src_lang)  # Tokenize, posTag and universal query string
    self.timer.stop("preprocess")
    if concordance:
        return self._match_rank_concordance(l_best_segments)
    else:
        rank_segments = self._match_rank(l_best_segments, threshold)
    trans_segments = []
    # Check whether the retrieved segments are a 100% match or need transformations
    for segment in rank_segments:
        # segment = segment[0]
        if segment.source_text == self.query:  # 100% match --> return the match considering the domain
            ter = 100
            if self.query.isupper():
                segment.source_text = segment.source_text.upper()
            if self.query.islower():
                segment.source_text = segment.source_text.lower()
            # trans_segments.append((segment, ter))
        else:
            # Pre-process source and target
            tgt_text = TMUtilsMatching.pre_process(segment.target_text, self.tgt_lang, 'tokenizer', {})  # Tokenize tm_tgt
            src_text = TMUtilsMatching.pre_process(segment.source_text, self.src_lang, 'tokenizer', {})  # Tokenize tm_src
            if 'regex' in match_process:
                if query_dic['tokenizer'] == query_dic['query_re']:
                    ter = TMUtilsMatching._ter_score(query_dic['tokenizer'], src_text)  # The regex step did not change the query
                else:
                    self.timer.start("_regx_match")
                    tgt_text, src_text, ter = self._regx_match(query_dic, src_text, tgt_text)  # , segment.source_pos, segment.target_pos
                    self.timer.stop("_regx_match")
                    logging.info("Applied Regex Segment: {} {} {}".format(tgt_text, src_text, str(ter)))
            else:
                ter = TMUtilsMatching._ter_score(query_dic['tokenizer'], src_text)  # Regex was not requested as a parameter
            if ter < threshold:
                logging.info("TER less threshold: {} ".format(str(ter)))
                continue
            if 'posTag' in match_process and ter != 100:
                # Check segments with only one difference
                if segment.source_pos is not None and segment.target_pos is not None:  # This part needs the posTag annotation
                    self.timer.start("fuzzy_match")
                    # target_word (to D, R or I), target_position, operation (R, I or D), src_un_match (sometimes carries source or query information)
                    tgt_word, tgt_position, operation, src_un_match, src_position = self._combine_feature_match(query_dic, tgt_text, src_text, segment.source_pos, segment.target_pos, align_features)
                    logging.info("Un_match: {} {} ".format(tgt_word, operation))
                    if src_un_match is not None:
                        src_text = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source')  # src_un_match,
                        # src_text = src_text.split(' ')
                        # if operation == 'R':
                        #     src_text[int(src_position.split(' _ ')[1])] = tgt_word
                        # if operation == 'I':
                        #     new_src_text = src_text[:int(src_position)] + [src_un_match] + src_text[int(src_position):]
                        #     src_text = new_src_text
                        # if operation == 'D':
                        #     src_text.pop(int(src_position))
                        # src_text = ' '.join(src_text)
                    if tgt_word is not None:
                        tgt_text = self._create_target_expression(tgt_text, tgt_position, operation, src_un_match, 'target')  # tgt_word,
                    self.timer.stop("fuzzy_match")
            segment.source_text = TMUtilsMatching.pre_process(src_text.split(' '), self.src_lang, 'untokenizer', {})
            segment.target_text = TMUtilsMatching.pre_process(tgt_text.split(' '), self.tgt_lang, 'untokenizer', {})
            logging.info("Target segment: {}".format(segment.target_text))
            if self.query.isupper():
                segment.source_text = segment.source_text.upper()
                segment.target_text = segment.target_text.upper()
            if self.query.islower():
                segment.source_text = segment.source_text.lower()
                segment.target_text = segment.target_text.lower()
        trans_segments.append((segment, ter))
    return trans_segments
# Tokenizer
tok = TMTokenizer(lang.upper()).tokenizer
pos = TMPosTagger(lang.upper())
for eline in file.readlines():
    tok_sentences = tok.process(eline)
    print(tok_sentences)
    pos_sentence = [element for word, element in pos.tag_segments([tok_sentences])[0]]
    # Split in several steps
    list_sentences = TMUtilsMatching.pre_process(tok_sentences, args.source, 'split_sentences', {})
    # print('+++++++++++++++++')
    # print(list_sentences)
    list_word_pos = []
    if list_sentences:
        i = 0
        for each_sent in list_sentences:
            # Create the word_pos pairs
            len_e = len(each_sent.split())
            list_word_pos.append([(w, p) for w, p in zip(each_sent.split(), pos_sentence[i:i + len_e])])
            i = i + len_e
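# Minimal self-contained sketch (not part of the original script) of the alignment logic above:
# the flat list of posTags produced for the whole line is sliced back onto each sentence split,
# pairing every token with its tag. Data below is hypothetical.
def _word_pos_demo(sentences, pos_sentence):
    list_word_pos = []
    i = 0
    for each_sent in sentences:
        len_e = len(each_sent.split())
        list_word_pos.append(list(zip(each_sent.split(), pos_sentence[i:i + len_e])))
        i += len_e
    return list_word_pos

# _word_pos_demo(['I sleep .', 'You run .'], ['PRON', 'VERB', 'PUNCT', 'PRON', 'VERB', 'PUNCT'])
# --> [[('I', 'PRON'), ('sleep', 'VERB'), ('.', 'PUNCT')],
#      [('You', 'PRON'), ('run', 'VERB'), ('.', 'PUNCT')]]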