def _regx_match(self, query_dic, src_text, tgt_text):
    logging.info("Applied Regex")
    fuzzy_alg = TMRegexMatch(self.src_lang, self.tgt_lang)  # Class used to improve the fuzzy match
    return fuzzy_alg.process(query_dic, src_text, tgt_text)
def token_count(self, text, lang):
    lang = lang.split('-')[0].upper()
    # Lazily load the regex preprocessor for this language; fall back to the
    # language code itself as a sentinel when it is not supported.
    if lang not in self.regex:
        try:
            self.regex[lang] = TMRegExpPreprocessor(lang)
            logging.info("Loading Regex for {}".format(lang))
        except Exception:
            logging.info("Unsupported Regex for {}".format(lang))
            self.regex[lang] = lang
    # Same lazy loading for the tokenizer.
    if lang not in self.tokenizers:
        try:
            self.tokenizers[lang] = TMTokenizer(lang)
            logging.info("Loading Tokenizer for {}".format(lang))
        except Exception:
            self.tokenizers[lang] = lang
            logging.info("Unsupported Tokenizer for {}".format(lang))
    if self.regex[lang] != lang:
        text = TMRegexMatch.simplified_name(self.regex[lang].process(text))
    if self.tokenizers[lang] != lang:
        token_cnt = len((self.tokenizers[lang].tokenizer.process(text)).split(' '))
    else:
        # No tokenizer available: count whitespace-separated tokens.
        token_cnt = len(text.split(' ')) if ' ' in text else 1
    return token_cnt
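# Usage sketch (hypothetical caller, not part of this module): token_count caches one
# regex preprocessor and one tokenizer per language and falls back to whitespace
# splitting when neither is available.
#
#   matcher.token_count('Translate this sentence, please.', 'en-GB')  # tokenized count
#   matcher.token_count('UnsupportedLanguageText', 'xx-XX')           # whitespace fallback -> 1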
def _preprocess(self, text, lang):
    dic_query = {}
    s_tags = XmlUtils.extract_tags(text)
    if not s_tags:
        dic_query['query'] = text
    else:
        dic_query['query'] = XmlUtils.strip_tags(text)  # Strip the tags before matching
    dic_query['tokenizer'] = TMUtilsMatching.pre_process(dic_query['query'], self.src_lang, 'tokenizer', {})
    dic_query['pos'] = TMUtilsMatching.pre_process(dic_query['tokenizer'], lang, 'pos_tagger', {})
    dic_query['universal'] = TMUtilsMatching.segment_2_universal(dic_query['tokenizer'].lower(), dic_query['pos'], lang)
    dic_query['universal'] = dic_query['pos']  # Note: the universal form above is overwritten with the raw POS output
    regex_class = TMRegexMatch(self.src_lang, self.tgt_lang)  # Helper class used to improve fuzzy matching
    dic_query['query_re'] = TMUtilsMatching.pre_process(dic_query['tokenizer'], self.src_lang, 'reg_exp', regex_class.re_pp)
    return dic_query
def _preprocess(self):
    self.query_dic['query'] = self.query
    if re.search("<.*>", self.query):
        # Make the tags uniform: "Yo tengo un <b>gato</b>." --> "Yo tengo un <T1>gato</T1>."
        self.query_dic['query_tags'] = TMUtilsMatching.pre_process(self.query, (self.src_lang, self.tgt_lang), 'tags', {})
        self.query_dic['query'] = self.query_dic['query_tags']  # The query now carries the uniform tags <T1>gato</T1>
    if 'regex' in self.pipe:
        self.query_dic['query_re'] = TMUtilsMatching.pre_process(self.query_dic['query'], self.src_lang, 'reg_exp', self.match['regex'].re_pp)
    else:
        self.query_dic['query_re'] = self.query_dic['query']
    self.query_dic['query_re_reduce'] = TMRegexMatch.simplified_name(self.query_dic['query_re'])
    return self.query_dic
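# Sketch of the resulting query_dic, assuming a query with inline tags (illustrative
# values; the exact placeholder forms come from TMUtilsMatching and TMRegexMatch):
#
#   self.query = 'Yo tengo un <b>gato</b>.'
#   query_dic['query_tags']       -> 'Yo tengo un <T1>gato</T1>.'  (uniform tags)
#   query_dic['query_re']         -> the same text with regex patterns replaced by placeholders
#   query_dic['query_re_reduce']  -> the placeholders collapsed by TMRegexMatch.simplified_name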
def _validate_pipe(self, pipe):
    match_process = {
        'regex': None,
        'posTag': None,
        'tags': TMTags()
    }
    try:
        match_process['regex'] = TMRegexMatch(self.src_lang, self.tgt_lang)
        logging.info("Loading regex for matching")
    except ValueError:
        if 'regex' in pipe:
            pipe.remove('regex')
        logging.info("Unsupported regex for matching")
    query_out_tags = XmlUtils.replace_tags(self.query)
    try:
        if 'tokenizer' not in self.query_dic:
            self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
            logging.info("Loading Tokenizer for {}".format(self.src_lang))
        try:
            if 'pos' not in self.query_dic:
                self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
            match_process['posTag'] = TMPosMatch(self.src_lang, self.tgt_lang)
            logging.info("Loading posTag for matching")
        except Exception:
            if 'posTag' in pipe:
                pipe.remove('posTag')
            logging.info("Unsupported posTag for matching")
    except Exception:
        # Without a tokenizer the posTag step cannot run either.
        if 'posTag' in pipe:
            pipe.remove('posTag')
        logging.info("Unsupported Tokenizer for {}".format(self.src_lang))
    return match_process, pipe
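# Behaviour sketch (hedged): _validate_pipe prunes unsupported steps from the pipe,
# so a request such as ['regex', 'tags', 'posTag'] for a source language without a
# POS tagger comes back as ['regex', 'tags'], together with the instantiated helpers:
#
#   match_process, pipe = self._validate_pipe(['regex', 'tags', 'posTag'])
#   # match_process -> {'regex': TMRegexMatch(...), 'posTag': None, 'tags': TMTags()}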
def _match_rank(self, best_segments):
    self.timer.start("rank segments")
    editD_score = []
    if 'query_tags' in self.query_dic:
        # Simplify the tags: "Yo tengo un <T1>gato</T1>." --> "Yo tengo un T gato T."
        query = TMUtilsMatching.reduce_tags(self.query_dic['query_tags'])
    else:
        query = self.query_dic['query']
    for i, segment in enumerate(best_segments):
        # Simplify the tags in the TM source
        if re.search("</?T[0-9]*/?>", segment[0].source_text):
            src_text = TMUtilsMatching.reduce_tags(segment[0].source_text)
        else:
            src_text = segment[0].source_text
        # Apply the regular expressions and simplify the placeholders
        if 'regex' in self.pipe:
            src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
        else:
            src_re = src_text
        src_re_reduce = TMRegexMatch.simplified_name(src_re)
        best_segments[i] = (segment[0], segment[1], src_re, src_re_reduce)
        # Edit distance computed on the texts with simplified tags
        editD_score.append(self._tm_edit_distance(query, src_text, self.query_dic['query_re_reduce'], src_re_reduce))
    self.timer.stop("rank segments")
    return sorted(zip(best_segments, editD_score), key=operator.itemgetter(1), reverse=True)
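# Ranking sketch (illustrative values): every candidate is scored with
# _tm_edit_distance and the (segment, score) pairs are sorted best first:
#
#   editD_score = [72, 95]
#   sorted(zip(best_segments, editD_score), key=operator.itemgetter(1), reverse=True)
#   # -> [(candidate_b, 95), (candidate_a, 72)]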
def execute_segment(self, segment, src_re, src_re_reduce, ini_editD, align_features, equal):
    logging.info("Applied match PIPE")
    tgt_text = segment.target_text
    src_text = segment.source_text
    status = ''
    editD = ini_editD
    status_tokenizer = False
    if equal:
        if self.query == src_text:
            return segment, editD, 'find', equal, status_tokenizer
        else:
            equal = False
    if not equal:
        for op in self.pipe:  # Operations are selected by parameters
            if op == 'regex':
                if self.query_dic['query'] != self.query_dic['query_re']:  # The query contains regex patterns
                    logging.info("Applied Regex")
                    self.timer.start("_regx_match")
                    # Compare query_re with src_re on their simplified forms
                    match = ini_editD
                    if src_re != src_text:
                        if src_re_reduce.lower() == self.query_dic['query_re_reduce'].lower():
                            match = 100  # Perfect match on the simplified regular expressions, in lowercase
                        tgt_text, src_text = self._regex_transform(segment.source_text, segment.target_text)
                        ini_editD = self._tm_edit_distance(self.query_dic['query'], src_text, self.query_dic['query_re_reduce'], src_re_reduce)
                    logging.info("After applied Regex Segment: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                    if match == 100:
                        status = 'find'
                    self.timer.stop("_regx_match")
            if op == 'tags':
                logging.info("Delete Tags")
                self.timer.start("_tags_match")
                src_text, tgt_text, status, reduce, ini_editD = self._match_tags(src_text, src_re_reduce, tgt_text, status, ini_editD)
                logging.info("After applied Tags: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                self.timer.stop("_tags_match")
            if op == 'posTag':
                self.timer.start("fuzzy_match")
                upper = False
                if segment.source_pos is not None and segment.target_pos is not None:  # This step needs the POS tagger annotation
                    squery, tok_query, pos_query = self.check_query_parameters()
                    logging.info("Apply posTag matching")
                    self.timer.start("fuzzy_preprocess")
                    if not status_tokenizer:
                        # Tokenize the source, the target and the simplified query
                        tgt_text = TMUtilsMatching.pre_process(tgt_text, self.tgt_lang, 'tokenizer', {})
                        src_text = TMUtilsMatching.pre_process(src_text, self.src_lang, 'tokenizer', {})
                        self.query_dic['query_re_reduce_tok'] = TMUtilsMatching.pre_process(self.query_dic['query_re_reduce'], self.src_lang, 'tokenizer', {})
                        status_tokenizer = True
                    if 'universal' not in self.query_dic:
                        self.query_dic['universal'] = TMUtilsMatching.segment_2_universal(tok_query.lower(), pos_query, self.src_lang)
                    src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), segment.source_pos, self.src_lang)  # [word, pos] of the tm_src segment
                    tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), segment.target_pos, self.tgt_lang)  # [word, pos] of the tm_tgt segment
                    self.timer.stop("fuzzy_preprocess")
                    if isinstance(self.query_dic['universal'], list) and isinstance(src_word_pos, list) and isinstance(tgt_word_pos, list):
                        logging.info("Check unmatched word --> PosTag")
                        if TMUtilsMatching.len_compare(pos_query.split(' '), segment.source_pos.split(' ')) is True and (tok_query != src_text):
                            logging.info("Query and source have the same length or only one difference")
                            self.timer.start("search unmatch")
                            tgt_un_match, tgt_position, operation, src_un_match, src_position, pos_tag = self._combine_feature_match(tok_query, src_word_pos, tgt_word_pos, align_features)
                            self.timer.stop("search unmatch")
                            logging.info("Unmatch word and operation: {} {} {}".format(safe_str(src_un_match), safe_str(operation), safe_str(ini_editD)))
                            self.timer.start("create target unmatch")
                            if src_un_match is not None:
                                # Create the new source and improve the edit distance
                                src_text, upper = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source', upper, pos_tag)
                                src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
                                src_re_reduce = TMRegexMatch.simplified_name(src_re)
                                penalize_match = self._improve_match(src_un_match, operation)
                                ini_editD = self._tm_edit_distance(tok_query.lower(), src_text.lower(), self.query_dic['query_re_reduce_tok'].lower(), src_re_reduce.lower()) - penalize_match
                            if tgt_un_match is not None:
                                # Create the new target
                                tgt_text, upper = self._create_target_expression(tgt_text, tgt_position, operation, tgt_un_match, 'target', upper, pos_tag)
                            self.timer.stop("create target unmatch")
                    logging.info("After applied posTag: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                self.timer.stop("fuzzy_match")
            # Check whether some transformation found the match or stopped the pipe
            if ini_editD > editD:
                editD = ini_editD
            if status == 'find' or status == 'break':
                segment.source_text = src_text
                segment.target_text = tgt_text
                return segment, editD, status, equal, status_tokenizer
        if editD >= self.min_match:
            segment.source_text = src_text
            segment.target_text = tgt_text
            status = 'find'
        else:
            # Call the split rules
            if 'split' in self.pipe and not self.trans_segments:  # Apply split only if a POS tagger exists for the source language
                src_text = None
                tgt_text = None
                editSplit = 0
                # Split into sentences first
                list_sentences = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'split_sentences', {})
                logging.info("split by Sentences : {} ".format(list_sentences))
                if len(list_sentences) > 1:
                    split_match = TMSplitMatch([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in list_sentences], [], self.src_lang, self.tgt_lang, 'sentence', self.machine_translation, self.domain)
                    src_text, tgt_text, editSplit = split_match._match()
                    if editSplit >= self.min_match:  # The split method returned segments from ActivaTM
                        segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit
                else:
                    # Split into smaller phrases, if split rules exist for this language pair
                    lang_class = G_CONFIG.get_split_rules(self.src_lang, self.tgt_lang)
                    if lang_class:
                        logging.info("Split Query by Phrase")
                        all_split, all_marks = self._splitByPhrase(lang_class, list_sentences)
                        if len(all_split) > 1:  # Some split rule was applied
                            split_match = TMSplitMatch(all_split, all_marks, self.src_lang, self.tgt_lang, 'phrase', self.machine_translation, self.domain)
                            src_text, tgt_text, editSplit = split_match._match()
                            if editSplit >= self.min_match:  # The split method returned segments from ActivaTM
                                segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit
                if editD >= self.min_match:
                    status = 'find'
                    status_tokenizer = True
            else:
                if not self.trans_segments:
                    # No match was found: prepare the segment for automatic translation.
                    # If there is no automatic translation either, the caller returns [].
                    self.trans_segments.append((segment, editD))
                    status = 'break'  # A segment is queued: the caller breaks its loop; there is no translation yet
    return segment, editD, status, equal, status_tokenizer
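# Return contract (as implemented above): execute_segment always yields
#   (segment, editD, status, equal, status_tokenizer)
# where status is 'find' when editD >= self.min_match (directly or via the split rules),
# 'break' when the segment was queued in self.trans_segments for machine translation,
# and '' when no operation in the pipe improved the match enough.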