def _preprocess(self, text, lang):
    dic_query = {}
    s_tags = XmlUtils.extract_tags(text)
    if not s_tags:
        dic_query['query'] = text
    else:
        dic_query['query'] = XmlUtils.strip_tags(text)  # Strip tags to do the match
    dic_query['tokenizer'] = TMUtilsMatching.pre_process(dic_query['query'], self.src_lang, 'tokenizer', {})
    dic_query['pos'] = TMUtilsMatching.pre_process(dic_query['tokenizer'], lang, 'pos_tagger', {})
    dic_query['universal'] = TMUtilsMatching.segment_2_universal(dic_query['tokenizer'].lower(), dic_query['pos'], lang)  # universal_text[0]
    dic_query['universal'] = dic_query['pos']
    regex_class = TMRegexMatch(self.src_lang, self.tgt_lang)  # Class to improve fuzzy match
    dic_query['query_re'] = TMUtilsMatching.pre_process(dic_query['tokenizer'], self.src_lang, 'reg_exp', regex_class.re_pp)
    return dic_query
def check_query_parameters(self):
    if 'pos' not in self.query_dic:  # Apply pos and universal tagging on the query --> only the first time
        if 'tokenizer' not in self.query_dic:  # The first transformation is posTag --> no other was applied yet
            query_out_tags = XmlUtils.replace_tags(self.query_dic['query'])
            self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
        self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
    return self.query_dic['query'], self.query_dic['tokenizer'], self.query_dic['pos']
def _align_source_target(self, un_match, un_pos, position, tgt_text, tgt_pos, align_features):
    tgt_dic = {}  # list of pairs of words
    tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), tgt_pos, self.tgt_lang)
    for i in range(0, len(tgt_word_pos)):
        value_similarity = 0
        for f in align_features:
            if f == 'word_ter':  # TER between words
                value_similarity = value_similarity + TMUtilsMatching.ter_distance(un_match, tgt_word_pos[i][0])
            if f == 'posTag':  # Boolean posTag agreement
                value_similarity = value_similarity + TMUtilsMatching.pos_bool(un_pos, tgt_word_pos[i][1])
            if f == 'position':  # Word position
                value_similarity = value_similarity + TMUtilsMatching.position_distance(position, i)
            # if f == 'frequency':  # frequency of pairs of words
            #     value_similarity = value_similarity + self.target_importance(un_word, tgt_word_pos[i][0], segment, best_segments)
        # The dictionary maps (target word, its position in the target sentence) to the similarity score
        tgt_dic[(tgt_word_pos[i][0], i)] = value_similarity
    tgt_align = sorted(tgt_dic.items(), key=lambda item: item[1], reverse=True)[0]  # Select the highest score
    return tgt_align[0][0], tgt_align[0][1]  # Return the word with the highest score and its position
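# Illustrative, self-contained sketch (not part of the class) of the scoring idea used in
# _align_source_target: every candidate target word receives one sub-score per alignment
# feature and the candidate with the highest total wins. The feature scorers below are
# simplified stand-ins for TMUtilsMatching.ter_distance, pos_bool and position_distance;
# only the argmax selection mirrors the method above.
def _pick_best_candidate(un_match_word, un_match_pos, un_match_position, tgt_word_pos):
    def word_score(a, b):      # crude character-overlap similarity stand-in
        return 1.0 if a == b else len(set(a) & set(b)) / max(len(set(a) | set(b)), 1)
    def pos_score(a, b):       # boolean posTag agreement
        return 1.0 if a == b else 0.0
    def position_score(i, j):  # closer positions score higher
        return 1.0 / (1 + abs(i - j))
    scores = {}
    for j, (word, pos) in enumerate(tgt_word_pos):
        scores[(word, j)] = (word_score(un_match_word, word)
                             + pos_score(un_match_pos, pos)
                             + position_score(un_match_position, j))
    (best_word, best_position), _ = max(scores.items(), key=lambda item: item[1])
    return best_word, best_position

# Example: the unmatched query word 'gatos'/NOUN at position 3 aligns to 'cats' in the target.
# print(_pick_best_candidate('gatos', 'NOUN', 3, [('the', 'DET'), ('cats', 'NOUN'), ('sleep', 'VERB')]))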
def process(self, query_dic, tgt_text, src_text, src_pos, tgt_pos, align_features):
    tgt_word = None
    tgt_position = None
    operation = None
    un_match = None
    src_position = None
    src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), src_pos, self.src_lang)  # [word, pos] of the tm_src segment
    query_universal = []
    # Check if the segments are equal or have only one difference (posTag)
    query_tok = query_dic['tokenizer'].lower()
    for i in range(0, len(query_dic['universal'].split(' '))):
        query_universal.append([query_tok.split(' ')[i], query_dic['universal'].split(' ')[i]])
    logging.info("Differences between PosTag: {} ".format(TMUtilsMatching.len_compare(query_universal, src_word_pos)))
    if TMUtilsMatching.len_compare(query_universal, src_word_pos) is True and (query_dic['tokenizer'] != src_text):
        # Obtain the unmatched word and its features
        if len(query_universal) == len(src_word_pos):
            operation = 'R'
            # Load the mismatch between query and src --> un_match = un_match_q _ un_match_s
            un_match, un_pos, src_position = TMPosMatch._get_src_unmatch(query_universal, src_word_pos)  # Replace (query and src)
            if un_match is not None:
                tgt_word, tgt_position = self._align_source_target(un_match.split('_')[1], un_pos.split('_')[1], src_position.split('_')[1], tgt_text, tgt_pos, align_features)
                tgt_word = un_match.split('_')[0]
        elif len(query_universal) > len(src_word_pos):  # Insert a new word in the target
            operation = 'I'
            un_match, un_pos, src_position = TMPosMatch._get_src_unmatch(query_universal, src_word_pos)  # Insert --> return word from query
            tgt_word = un_match
            tgt_position = src_position
        else:  # Delete a word in the target
            operation = 'D'
            un_match, un_pos, src_position = TMPosMatch._get_src_unmatch(src_word_pos, query_universal)  # Delete --> return word from src
            if un_match is not None:
                tgt_word, tgt_position = self._align_source_target(un_match, un_pos, src_position, tgt_text, tgt_pos, align_features)
    return tgt_word, tgt_position, operation, un_match, src_position
def _align_source_target(self, un_match, un_pos, position, tgt_word_pos, align_features):  # tgt_text, tgt_pos
    related_words = []
    tgt_dic = {}  # list of pairs of words
    equal_posTag = [[position_tgt, word, pos] for position_tgt, [word, pos] in list(enumerate(tgt_word_pos))
                    if pos == un_pos.strip(' ') or pos == 'VERB' or pos == 'NOUN' or pos == 'ADJ']
    if not equal_posTag:
        return None, None
    else:
        if 'glossary' in align_features:
            related_words = self.search_exact_value(un_match, 10)
        for i in range(0, len(equal_posTag)):
            value_similarity = 0
            for f in align_features:
                if f == 'word_ter':  # TER between words
                    value_similarity = value_similarity + (0.25 * TMUtilsMatching.un_match_distance(un_match, equal_posTag[i][1]))
                if f == 'posTag':  # Boolean posTag agreement
                    value_similarity = value_similarity + (0.25 * TMUtilsMatching.pos_bool(un_pos, equal_posTag[i][2]))
                if f == 'position':  # Word position
                    value_similarity = value_similarity + (0.25 * TMUtilsMatching.position_distance(position, equal_posTag[i][0]))
                if f == 'glossary':  # search the word on elasticTM
                    is_related = 1 if equal_posTag[i][1] in related_words else 0
                    value_similarity = value_similarity + (0.25 * is_related)
            # The dictionary maps (target word, its position in the target sentence) to the similarity score
            tgt_dic[(equal_posTag[i][1], equal_posTag[i][0])] = value_similarity
        tgt_align = sorted(tgt_dic.items(), key=lambda item: item[1], reverse=True)[0]  # Select the highest score
        print(sorted(tgt_dic.items(), key=lambda item: item[1], reverse=True))
        if tgt_align[1] > G_CONFIG.get_src_tgt_threshold():
            return tgt_align[0][0], tgt_align[0][1]
        else:
            return None, None
def _preprocess(self):
    self.query_dic['query'] = self.query
    if re.search("<.*>", self.query):
        # Uniform tags --> e.g. "Yo tengo un <b>gato</b>." --> "Yo tengo un <T1>gato</T1>"
        self.query_dic['query_tags'] = TMUtilsMatching.pre_process(self.query, (self.src_lang, self.tgt_lang), 'tags', {})
        self.query_dic['query'] = self.query_dic['query_tags']  # query now has the tags <T1>gato</T1>
    if 'regex' in self.pipe:
        self.query_dic['query_re'] = TMUtilsMatching.pre_process(self.query_dic['query'], self.src_lang, 'reg_exp', self.match['regex'].re_pp)
    else:
        self.query_dic['query_re'] = self.query_dic['query']
    self.query_dic['query_re_reduce'] = TMRegexMatch.simplified_name(self.query_dic['query_re'])
    return self.query_dic
def _match_tags(query, src_text, tgt_text):
    match = 0
    query_strip_tags = TMUtilsMatching.strip_tags(query)  # Strip tags from the query
    src_text_strip_tags = TMUtilsMatching.strip_tags(src_text).strip()  # Strip tags from src
    tgt_text_strip_tags = TMUtilsMatching.strip_tags(tgt_text).strip()  # Strip tags from tgt
    if query_strip_tags == src_text_strip_tags:  # query and src_tm are equal
        match = 100
    return query_strip_tags, src_text_strip_tags, tgt_text_strip_tags, match
def style_string(self, src_text, tgt_text, status_tokenizer):
    # Check upper and lower case
    if src_text and tgt_text:
        src_text, tgt_text = self._transform_case(src_text, tgt_text)
    # Transfer XML tags (if needed)
    self.timer.start("transfer_tags")
    if re.search("</?[^<>]+/?>", self.query) is not None:
        # Transfer tags only if the query has them and tgt and src don't
        status_tokenizer = True
        if re.search("</?[^<>]+/?>", src_text) is None:
            src_text = TMUtilsMatching.transfer_tags(self.query, src_text, (self.src_lang, self.tgt_lang))
        if re.search("</?[^<>]+/?>", tgt_text) is None:
            tgt_text = TMUtilsMatching.transfer_tags(self.query, tgt_text, (self.src_lang, self.tgt_lang))
    self.timer.stop("transfer_tags")
    return src_text, tgt_text, status_tokenizer
def __init__(self):
    self.order = ['comma', 'conjunction', 'compose_sub', 'last']  # 'subordinate' --> last = subordinate
    self.sw = TMUtilsMatching.check_stopwords('EN')  # TMTextProcessors.stop_words('english')
    self.ut = TMTextProcessors.univ_pos_tagger('EN')  # TMUniversalPosTag('EN')
    self.rules = {  # pattern ---> left ---> right
        'conjunction': RulesPattern('<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
                                    '<.*>*<V.*><.*>*',
                                    '<CC><.*>*<V.*><.*>*'),
        'last': RulesPattern('<.*>*<V.*><.*>*<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
                             '<.*>*<V.*><.*>*',
                             '<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*'),
        'compose_sub': RulesPattern('<.*>*<V.*><.*>*<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
                                    '<.*>*<V.*><.*>*',
                                    '<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*'),  # --> wh_words <V> <NP|PP>*
        'comma': RulesPattern('<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
                              '<.*>*<V.*><.*>*',
                              '<\,|\;|\:|\-><.*>*<V.*><.*>*'),
    }
def __init__(self, list_query, list_marks, src_lang, tgt_lang, split_type, aut_trans, domain):
    self.src_lang = src_lang
    self.tgt_lang = tgt_lang
    self.aut_trans = aut_trans
    self.domain = domain
    self.tmdb_api = TMDbApi.TMDbApi.TMDbApi()
    self.split_type = split_type
    if self.split_type == 'sentence':
        self.list_query = list_query
    if self.split_type == 'phrase':
        if not TMUtilsMatching.empty_list(list_marks):
            # Search for the translation of the marks in the target language
            self.list_marks = self.tgt_list_marks(list_marks)  # e.g. [[], [], ('and', 'CC')]
        else:
            self.list_marks = list_marks
        '''
        Example of list_query (already split and POS-tagged):
        [[('-', ':'), ('A', 'DT'), ('framework', 'NN'), ('for', 'IN'), ('the', 'DT'), ('measurement', 'NN'), ('of', 'IN'), ('greenhouse', 'NN'), ('gas', 'NN'), ('concentrations', 'NNS'), ('is', 'VBZ'), ('in', 'IN'), ('place', 'NN')],
         [('to', 'TO'), ('understand', 'VV'), ('their', 'PP$'), ('sources', 'NNS')],
         [('sinks', 'NNS'), ('requires', 'VVZ'), ('measuring', 'VVG'), ('transport', 'NN'), ('and', 'CC'), ('flux', 'NN'), ('in', 'IN'), ('both', 'CC'), ('the', 'DT'), ('horizontal', 'JJ'), ('and', 'CC'), ('vertical', 'JJ'), ('.', 'SENT')]]
        '''
        self.list_query = [' '.join([word for word, post in part]) for part in list_query]
        self.list_pos = [' '.join([pos for word, pos in part]) for part in list_query]
        logging.info("After Split Each parts: {} {}".format(self.list_query, self.list_pos))
def _improve_match(self, query_info, operation):
    query_word = query_info.split(' _ ')
    if operation == 'R':
        # Estimate the edit distance between the two words
        return TMUtilsMatching._edit_distance(query_word[0], query_word[1]) / 2
    else:
        # The edit distance equals the total number of characters added to or deleted from the string
        return len(query_word[0]) / 2
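# Worked example of the penalty returned above (values follow directly from the formulas):
#   replacement ('R') with query_info = 'cat _ cats'  -->  edit_distance('cat', 'cats') / 2 = 1 / 2 = 0.5
#   insertion or deletion of 'cat'                    -->  len('cat') / 2 = 1.5
# The penalty is later subtracted from the recomputed edit-distance score of the patched segment.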
def __init__(self):
    self.order = ['comma', 'conjunction', 'compose_sub', 'last']  # subordinate
    self.sw = TMUtilsMatching.check_stopwords('ES')  # TMTextProcessors.stop_words('spanish')
    self.ut = TMTextProcessors.univ_pos_tagger('ES')  # TMUniversalPosTag('ES')
    self.rules = {
        'conjunction': RulesPattern('<.*>*<V.*><.*>*<CC|CCNEG|CCAD><.*>*<V.*><.*>*',
                                    '<.*>*<V.*><.*>*',
                                    '<CC|CCNEG|CCAD><.*>*<V.*><.*>*'),
        'comma': RulesPattern('<.*>*<V.*><.*>*<CM|COLON|DASH|SEMICOLON><.*>*<V.*><.*>*',
                              '<.*>*<V.*><.*>*',
                              '<CM|COLON|DASH|SEMICOLON><.*>*<V.*><.*>*'),
        'compose_sub': RulesPattern('<.*>*<V.*><.*>*<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL><CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*',
                                    '<.*>*<V.*><.*>*',
                                    '<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL><CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*'),  # --> subordinate
        'last': RulesPattern('<.*>*<V.*><.*>*<CQUE|CSUBF|CSUBI|CSUBX><.*>*<V.*><.*>*',
                             '<.*>*<V.*><.*>*',
                             '<CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*'),  # --> subordinate
    }
def _deals_output(self, segment, editD, trans_segments, status_tokenizer, status):
    if self.out == 'moses':  # Moses output is tokenized
        if status_tokenizer == False:  # tokenize the output
            segment.source_text = TMUtilsMatching.pre_process(segment.source_text, self.src_lang, 'tokenizer', {})
            segment.target_text = TMUtilsMatching.pre_process(segment.target_text, self.tgt_lang, 'tokenizer', {})
        trans_segments.append((segment, editD))
        return trans_segments, 'break'
    else:
        if status_tokenizer == True:  # TM output is untokenized
            segment.target_text = TMUtilsMatching.pre_process(segment.target_text.split(' '), self.tgt_lang, 'untokenizer', {})
            segment.source_text = TMUtilsMatching.pre_process(segment.source_text.split(' '), self.src_lang, 'untokenizer', {})
        trans_segments.append((segment, editD))
        if status == 'translate':
            status = 'break'
        else:
            status = 'continue'
        # if editD == 100:  # Add this to improve matching time
        #     status = 'break'
        logging.info("Final Output (Query -- Source -- Target): {} {} {}".format(safe_str(self.query_dic['query'] + ' -- '), safe_str(segment.source_text + ' -- '), safe_str(segment.target_text)))
        return trans_segments, status
def _prepare_target_text(self, query, segment, translation, source_lang, target_lang):
    segment.source_text = query
    segment.domain = []
    segment.file_name = []
    if re.search("</?[^<>]+/?>", query) is not None:  # If there are tags in the query
        tgt_tags = TMUtilsMatching.transfer_tags(segment.source_text, translation, (source_lang, target_lang))
        segment.target_text = TMUtilsMatching.pre_process(tgt_tags.split(' '), target_lang, 'untokenizer', {})
    else:
        segment.target_text = translation.strip('\n')
    logging.info("Translate less than minimum_match: {} {}".format(segment.source_text + ' -- ', translation))
    return segment
def clause_chunk(self, text):
    dicSegment_Rules = {}
    # Check whether we need to convert to the universal posTag set
    if self.class_lang == 'generic_geral' and self.lang.upper() not in TMUtilsMatching.pre_process(' ', self.lang.upper(), 'get_lang_universalPOS', {}):
        text_Universal = TMUtilsMatching.pre_process([[[word, pos] for word, pos in text]], self.lang, 'universal_pos_tagger', {})
        if not text_Universal:  # If there is some problem with the universal posTag
            return [Tree('S', text)]
        text = [(word, pos) for word, pos in text_Universal]
    # Run each rule
    for r in self.order:
        if r == 'initial':
            lSentences = [text]  # --> Initial list of segments to process
        else:
            chunkO = RegexpChunkParser(self.rules[r].get_rule_obj(), chunk_label='splitPoint', root_label='S')  # Create the chunk object
            # Chunk the segments --> call each rule recursively
            lChunk_Segments = lSentences
            len_actual = 0  # --> Controls the number of splits
            len_previous = len(lSentences)
            while len_actual != len_previous:
                len_previous = len(lChunk_Segments)
                lChunk_Segments = self._recursive_rule(lChunk_Segments, chunkO)
                len_actual = len(lChunk_Segments)
            dicSegment_Rules[r] = lChunk_Segments
            lSentences = lChunk_Segments  # --> Load all chunks produced by this rule
    self.timer.print()
    return dicSegment_Rules['last']
def process(self, query_dic, src_text, tgt_text):  # , src_pos, tgt_pos
    # Pre-process and apply regex to tm_src
    src_re = self.re_pp[self.src_lang].process(src_text)
    if src_text != src_re:  # Regex was applied on src
        tgt_re = self.re_pp[self.tgt_lang].process(tgt_text)
        # Transform the target into the query
        ter = TMUtilsMatching._ter_score(query_dic['query_re'], src_re)
        # Extract pattern (find and replace) values
        src_query_f, src_query_r = TMRegexMatch._extract_find_replace(query_dic['tokenizer'].split(' '), query_dic['query_re'].split(' '))
        tgt_query_f = src_query_f.copy()
        tgt_query_r = src_query_r.copy()
        src_f, src_r = TMRegexMatch._extract_find_replace(src_text.split(' '), src_re.split(' '))
        ter = ter - len(src_f)
        src_text = TMRegexMatch._replace_values(src_query_f, src_query_r, src_re.split(' '), src_f, src_r)
        tgt_f, tgt_r = TMRegexMatch._extract_find_replace(tgt_text.split(' '), tgt_re.split(' '))
        tgt_text = TMRegexMatch._replace_values(tgt_query_f, tgt_query_r, tgt_re.split(' '), tgt_f, tgt_r)
    else:
        ter = TMUtilsMatching._ter_score(query_dic['tokenizer'], src_text)  # Regex wasn't applied
    return tgt_text, src_text, ter  # , src_pos, tgt_pos
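# Self-contained sketch (assumed behaviour, not the actual _extract_find_replace /
# _replace_values calls) of why the regex step helps fuzzy matching: query and TM segment
# are reduced to the same placeholder form, and after the match the literal value from the
# query is copied into the retrieved TM text. The number pattern and helpers are invented
# for this illustration only.
import re

NUM = re.compile(r'\d+')

def simplify(tokens):
    return ['N' if NUM.fullmatch(t) else t for t in tokens]

def transfer_values(query_tokens, tm_tokens):
    q_values = [t for t in query_tokens if NUM.fullmatch(t)]
    out, k = [], 0
    for t in tm_tokens:
        if NUM.fullmatch(t) and k < len(q_values):
            out.append(q_values[k]); k += 1   # replace the TM value with the query value
        else:
            out.append(t)
    return out

query = 'order 42 units'.split()
tm_src = 'order 17 units'.split()
assert simplify(query) == simplify(tm_src)        # both reduce to 'order N units'
print(' '.join(transfer_values(query, tm_src)))   # -> 'order 42 units'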
def _match_rank_concordance(self, best_segments):  # , output
    self.timer.start("ter")
    l_ter_score = [TMUtilsMatching._ter_score(self.query, segment[0].source_text) for segment in best_segments]
    self.timer.stop("ter")
    l_best_sort = sorted(zip(best_segments, l_ter_score), key=operator.itemgetter(1), reverse=True)
    return [(segment[0][0], segment[1]) for segment in l_best_sort]
def _match_tags(self, src_text, src_re_reduce, tgt_text, status, ini_editD):
    reduce = False
    out_tags_query, src_text, tgt_text, match = TMTags._match_tags(self.query_dic['query'], src_text, tgt_text)
    if match == 100:
        status = 'find'
    else:
        match = self._tm_edit_distance(out_tags_query, src_text, TMUtilsMatching.strip_tags(self.query_dic['query_re_reduce']).strip(), TMUtilsMatching.strip_tags(src_re_reduce).strip())
    if self.query_dic['query'] != out_tags_query:
        self.query_dic['query'] = out_tags_query
        reduce = True
    if match >= ini_editD:
        ini_editD = match
    return src_text, tgt_text, status, reduce, ini_editD
def _only_word_sequence(text, lang):  # Receives the original sequence
    only_word = []
    only_st = []
    l_src_st = TMUtilsMatching.check_stopwords(lang)
    for match in re.finditer(r'[a-zA-Z0-9\u4e00-\u9fff\u3040-\u309Fー\u30A0-\u30FF]+', text):  # Get all the words and numbers
        if l_src_st:  # For some languages we don't have a stop-word list
            if match.group() in l_src_st:
                only_st.append(match.group())
            else:
                only_st.append('P')
                only_word.append(match.group())
    return only_word, only_st
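# Quick illustration of the split performed above: the regex keeps Latin words, digits and
# CJK/kana runs; stop words are kept verbatim in only_st while content words are masked as
# 'P' there and collected in only_word. The stop-word set below is a toy assumption standing
# in for TMUtilsMatching.check_stopwords.
import re

text = 'The 3 cats sleep on the mat'
stopwords = {'The', 'the', 'on'}   # assumed toy list
only_word, only_st = [], []
for match in re.finditer(r'[a-zA-Z0-9\u4e00-\u9fff\u3040-\u309Fー\u30A0-\u30FF]+', text):
    if match.group() in stopwords:
        only_st.append(match.group())
    else:
        only_st.append('P')
        only_word.append(match.group())
print(only_word)  # ['3', 'cats', 'sleep', 'mat']            (content words only)
print(only_st)    # ['The', 'P', 'P', 'P', 'on', 'the', 'P'] (stop words kept, content masked)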
def _validate_pipe(self, pipe):
    match_process = {
        'regex': None,
        'posTag': None,
        'tags': TMTags()
    }
    try:
        match_process['regex'] = TMRegexMatch(self.src_lang, self.tgt_lang)
        logging.info("Loading regex for matching")
    except ValueError:
        if 'regex' in pipe:
            pipe.pop(pipe.index('regex'))
        logging.info("Unsupported regex for matching")
    query_out_tags = XmlUtils.replace_tags(self.query)
    try:
        if 'tokenizer' not in self.query_dic:
            self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
        logging.info("Loading Tokenizer for {}".format(self.src_lang))
        try:
            if 'pos' not in self.query_dic:
                self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
            match_process['posTag'] = TMPosMatch(self.src_lang, self.tgt_lang)
            logging.info("Loading posTag for matching")
        except Exception as e:
            if 'posTag' in pipe:
                pipe.pop(pipe.index('posTag'))
            logging.info("Unsupported posTag for matching")
    except Exception as e:
        if 'posTag' in pipe:
            pipe.pop(pipe.index('posTag'))
        logging.info("Unsupported Tokenizer for {}".format(self.src_lang))
    return match_process, pipe
def _match_rank(self, best_segments):
    self.timer.start("rank segments")
    editD_score = []
    if 'query_tags' in self.query_dic:
        # Simplify tags: "Yo tengo un <T1>gato</T1>." --> "Yo tengo un T gato T."
        query = TMUtilsMatching.reduce_tags(self.query_dic['query_tags'])
    else:
        query = self.query_dic['query']
    for i in range(0, len(best_segments)):
        segment = best_segments[i]
        # Simplify tags in the tm source
        if re.search("</?T[0-9]*/?>", segment[0].source_text):
            src_text = TMUtilsMatching.reduce_tags(segment[0].source_text)  # Simplify tags in tm source and target
        else:
            src_text = segment[0].source_text
        # Apply regex and simplify
        if 'regex' in self.pipe:
            src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
        else:
            src_re = src_text
        src_re_reduce = TMRegexMatch.simplified_name(src_re)
        best_segments[i] = (segment[0], segment[1], src_re, src_re_reduce)
        editD_score.append(self._tm_edit_distance(query, src_text, self.query_dic['query_re_reduce'], src_re_reduce))  # Edit distance with simplified tags
    self.timer.stop("rank segments")
    return sorted(zip(best_segments, editD_score), key=operator.itemgetter(1), reverse=True)
def _match_rank(self, best_segments, threshold):  # , output
    segments = []
    self.timer.start("ter")
    l_ter_score = [TMUtilsMatching._edit_distance(self.query, segment[0].source_text) for segment in best_segments]
    self.timer.stop("ter")
    l_best_sort = sorted(zip(best_segments, l_ter_score), key=operator.itemgetter(1), reverse=True)
    for segment, ter in l_best_sort:
        # TM output --> only keep segments whose score is within 10 points of the threshold
        if ter >= threshold - 10:
            segments.append(segment[0])
        else:
            break
    return segments
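# Minimal sketch of the ranking step above: score each candidate against the query, sort in
# descending order, and keep everything within 10 points of the threshold. The scores are
# hypothetical numbers standing in for TMUtilsMatching._edit_distance.
candidates = [('seg A', 95), ('seg B', 82), ('seg C', 61)]   # (segment, score)
threshold = 75
ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
kept = [seg for seg, score in ranked if score >= threshold - 10]
print(kept)  # ['seg A', 'seg B']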
def __init__(self):
    self.order = ['comma', 'last']  # 'conjunction'
    self.sw = TMUtilsMatching.check_stopwords('EN')  # TMTextProcessors.stop_words('english')
    self.ut = TMTextProcessors.univ_pos_tagger('EN')  # TMUniversalPosTag('EN')
    self.rules = {  # pattern ---> left ---> right
        'comma': RulesPattern('<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
                              '<.*>*<V.*><.*>*',
                              '<CC><.*>*<V.*><.*>*'),
        'last': RulesPattern('<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
                             '<.*>*<V.*><.*>*',
                             '<\,|\;|\:|\-><.*>*<V.*><.*>*'),
    }
def __init__(self):
    self.order = ['comma', 'last']  # last = conjunction
    self.sw = TMUtilsMatching.check_stopwords('FR')  # stop words
    self.ut = TMTextProcessors.univ_pos_tagger('FR')  # TMUniversalPosTag('FR')
    self.rules = {
        'last': RulesPattern('<.*>*<V.*><.*>*<KON><.*>*<V.*><.*>*',
                             '<.*>*<V.*><.*>*',
                             '<KON><.*>*<V.*><.*>*'),
        'comma': RulesPattern('<.*>*<V.*><.*>*<PUN|SENT><.*>*<V.*><.*>*',  # punctuation marks
                              '<.*>*<V.*><.*>*',
                              '<PUN|SENT><.*>*<V.*><.*>*'),
    }
def __init__(self, lang):
    self.order = ['comma', 'last']  # subordinate
    self.sw = TMUtilsMatching.check_stopwords(lang)  # stop words for the language
    self.ut = TMTextProcessors.univ_pos_tagger(lang)  # TMUniversalPosTag(lang)
    self.rules = {
        'last': RulesPattern('<.*>*<VERB><.*>*<CONJ|SCONJ><.*>*<VERB><.*>*',  # conjunctions
                             '<.*>*<VERB><.*>*',
                             '<CONJ><.*>*<VERB><.*>*'),
        'comma': RulesPattern('<.*>*<VERB><.*>*<\.><.*>*<VERB><.*>*',  # comma
                              '<.*>*<VERB><.*>*',
                              '<\.><.*>*<VERB><.*>*'),
    }
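# Hedged, self-contained illustration of what a splitting rule does: mark the span that starts
# at a conjunction and contains a second verb, so the sentence can be cut there. It uses
# nltk.RegexpParser with a grammar string as a simplified stand-in for the RulesPattern /
# RegexpChunkParser objects built in the classes above; the sentence and universal tag set
# are invented for the example.
import nltk

tagged = [('I', 'PRON'), ('eat', 'VERB'), ('rice', 'NOUN'),
          ('and', 'CONJ'), ('she', 'PRON'), ('drinks', 'VERB'), ('tea', 'NOUN')]
grammar = 'splitPoint: {<CONJ|SCONJ><.*>*<VERB><.*>*}'
tree = nltk.RegexpParser(grammar, root_label='S').parse(tagged)
print(tree)
# Prints something like:
# (S I/PRON eat/VERB rice/NOUN (splitPoint and/CONJ she/PRON drinks/VERB tea/NOUN))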
def _match(self):
    # Create a dictionary with the query info (posTag and universal)
    if self.split_type == 'sentence':
        list_info_query = [{'tokenizer': self.list_query[j]} for j in range(0, len(self.list_query))]
    else:
        list_info_query = [{'tokenizer': self.list_query[j], 'pos': self.list_pos[j]} for j in range(0, len(self.list_query))]
    # Query Elasticsearch --> out=moses to return only one segment
    l_best_segments = self.tmdb_api.query([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in self.list_query],
                                          list_info_query, (self.src_lang, self.tgt_lang), pipe=['regex', 'tags', 'posTag'],
                                          out='moses', limit=5, domains=None, min_match=80, concordance=False,
                                          aut_trans=False, exact_length=False)
    join_source = ''
    join_target = ''
    total_match = 0
    for i in range(0, len(l_best_segments)):
        if l_best_segments[i]:
            segment, match = l_best_segments[i][0]
            join_source = join_source + ' ' + segment.source_text
            join_target = join_target + ' ' + segment.target_text
        else:
            join_source = join_source + ' ' + self.list_query[i]
            join_target = join_target + ' ' + self.list_query[i]
            match = 0
        total_match = total_match + match
        if self.split_type == 'phrase':
            if self.list_marks:
                if self.list_marks[0]:
                    mark = self.list_marks.pop(0)[0]
                    join_source = join_source + ' ' + mark
                    join_target = join_target + ' ' + mark
    total_match = total_match / len(self.list_query)
    return join_source, join_target, int(math.floor(total_match))
def execute(self, threshold, l_best_segments, match_process, align_features, concordance):  # , output
    self.timer.start("preprocess")
    query_dic = self._preprocess(self.query, self.src_lang)  # Tokenize, posTag and universal query string
    self.timer.stop("preprocess")
    if concordance:
        return self._match_rank_concordance(l_best_segments)
    else:
        rank_segments = self._match_rank(l_best_segments, threshold)
        trans_segments = []
        # Check if the retrieved segments are a 100% match or apply transformations
        for segment in rank_segments:
            if segment.source_text == self.query:  # 100% match --> Return the match considering the domain
                ter = 100
                if self.query.isupper():
                    segment.source_text = segment.source_text.upper()
                if self.query.islower():
                    segment.source_text = segment.source_text.lower()
            else:
                # Pre-process source and target
                tgt_text = TMUtilsMatching.pre_process(segment.target_text, self.tgt_lang, 'tokenizer', {})  # Tokenize tm_tgt
                src_text = TMUtilsMatching.pre_process(segment.source_text, self.src_lang, 'tokenizer', {})  # Tokenize tm_src
                if 'regex' in match_process:
                    if query_dic['tokenizer'] == query_dic['query_re']:
                        ter = TMUtilsMatching._ter_score(query_dic['tokenizer'], src_text)  # Regex wasn't applied on the query
                    else:
                        self.timer.start("_regx_match")
                        tgt_text, src_text, ter = self._regx_match(query_dic, src_text, tgt_text)
                        self.timer.stop("_regx_match")
                        logging.info("Applied Regex Segment: {} {} {}".format(tgt_text, src_text, str(ter)))
                else:
                    ter = TMUtilsMatching._ter_score(query_dic['tokenizer'], src_text)  # Regex wasn't passed as a parameter
                if ter < threshold:
                    logging.info("TER less than threshold: {} ".format(str(ter)))
                    continue
                if 'posTag' in match_process and ter != 100:
                    # Check segments with only one difference
                    if segment.source_pos is not None and segment.target_pos is not None:  # This part needs the pos tagger annotation
                        self.timer.start("fuzzy_match")
                        # target word (to D, R or I), target position, operation (R, I or D), src_un_match (sometimes holds source or query information)
                        tgt_word, tgt_position, operation, src_un_match, src_position = self._combine_feature_match(query_dic, tgt_text, src_text, segment.source_pos, segment.target_pos, align_features)
                        logging.info("Un_match: {} {} ".format(tgt_word, operation))
                        if src_un_match is not None:
                            src_text = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source')
                        if tgt_word is not None:
                            tgt_text = self._create_target_expression(tgt_text, tgt_position, operation, src_un_match, 'target')
                        self.timer.stop("fuzzy_match")
                segment.source_text = TMUtilsMatching.pre_process(src_text.split(' '), self.src_lang, 'untokenizer', {})
                segment.target_text = TMUtilsMatching.pre_process(tgt_text.split(' '), self.tgt_lang, 'untokenizer', {})
                logging.info("Target segment: {}".format(segment.target_text))
                if self.query.isupper():
                    segment.source_text = segment.source_text.upper()
                    segment.target_text = segment.target_text.upper()
                if self.query.islower():
                    segment.source_text = segment.source_text.lower()
                    segment.target_text = segment.target_text.lower()
            trans_segments.append((segment, ter))
        return trans_segments
def _tm_edit_distance(self, q_text, s_text, q_simplified, s_simplified):
    # Corner case - matching an artificial empty segment -> give the minimal score
    if q_text and not s_text.strip():
        return 1
    # Always reduce the tags so each one counts as a single element
    # 1) ********** Obtain word and stop-word sequences
    q_onlyW, q_st_word = TMMatching._only_word_sequence(q_text, self.src_lang)
    s_onlyW, s_st_word = TMMatching._only_word_sequence(s_text, self.src_lang)
    if not q_onlyW and not q_st_word:  # e.g. languages without a stop-word list, such as 'zh'
        editD = 100 - TMUtilsMatching._edit_distance(q_text, s_text)
    else:
        # Normal edit distance over content words only, without punctuation marks and stop words
        nchar_diff = TMUtilsMatching._edit_distance(' '.join(q_onlyW), ' '.join(s_onlyW))  # Consider all the words, without any substitution
        nchar_len = len(' '.join(q_onlyW)) + len(' '.join(s_onlyW))
        if nchar_len == 0:
            nchar_len = 1
        char_diff = (2 * nchar_diff) / nchar_len  # total of characters
        # 2) ********* Simplified --> Convert words to a letter and keep only punctuation marks
        # Ex. '- 3.67 housing units constructed under the $ # home % ownership saving scheme in the Hanano/ and (Hamdaniya districts;'
        #     --> '- N N N N N N $ # N % N N N N N N/ N (N N;'
        q_replaceW, q_onlyS = TMMatching._symbol_sequence(q_simplified)  # Original query
        s_replaceW, s_onlyS = TMMatching._symbol_sequence(s_simplified)  # Original tm_src
        if len(s_onlyS) == 0 and len(q_onlyS) == 0:  # There are no symbols
            n_symbol_diff = 0
        else:
            n_symbol_diff = TMUtilsMatching._edit_distance(q_replaceW, s_replaceW)
        len_symbols = len(q_replaceW.split(' ')) + len(s_replaceW.split(' '))
        if len_symbols == 0:
            len_symbols = 1
        symbol_diff = (2 * n_symbol_diff) / len_symbols
        # 3) ********* Query words that do not appear in the source
        nword_diff = set(q_onlyW).difference(s_onlyW)  # Replace regular expression by only one word
        onlyW_len = len(q_onlyW)
        if onlyW_len == 0:
            onlyW_len = 1
        word_diff = len(nword_diff) / onlyW_len  # only query words
        # 4) ********* Stop words
        stop_words = True
        if len(q_st_word) == 0 and len(s_st_word) == 0:  # There are no stop words, or this language doesn't have a stop-word list
            stop_words = False
        if stop_words:
            n_st_diff = TMUtilsMatching._edit_distance(' '.join(q_st_word), ' '.join(s_st_word))
            len_stop_word = len(' '.join(q_st_word)) + len(' '.join(s_st_word))
            stop_word_diff = (2 * n_st_diff) / len_stop_word
            editD = (1 - ((0.70 * char_diff) + (0.10 * word_diff) + (0.10 * symbol_diff) + (0.10 * stop_word_diff))) * 100
        else:
            editD = (1 - ((0.70 * char_diff) + (0.15 * word_diff) + (0.15 * symbol_diff))) * 100
    if editD < 0:
        editD = 10
    return int(math.floor(editD))
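# Worked example of the final weighting above, using hypothetical sub-scores:
#   char_diff = 0.20, word_diff = 0.50, symbol_diff = 0.0, stop_word_diff = 0.10
#   editD = (1 - (0.70*0.20 + 0.10*0.50 + 0.10*0.0 + 0.10*0.10)) * 100
#         = (1 - 0.20) * 100 = 80
# When neither side has stop words, the 0.70/0.15/0.15 weighting is used instead.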
# Tokenizer
tok = TMTokenizer(lang.upper()).tokenizer
pos = TMPosTagger(lang.upper())
for eline in file.readlines():
    tok_sentences = tok.process(eline)
    print(tok_sentences)
    pos_sentence = [element for word, element in pos.tag_segments([tok_sentences])[0]]
    # Split in several steps
    list_sentences = TMUtilsMatching.pre_process(tok_sentences, args.source, 'split_sentences', {})
    list_word_pos = []
    if list_sentences:
        i = 0
        for each_sent in list_sentences:
            # Create word_pos pairs
            len_e = len(each_sent.split())
            list_word_pos.append([(w, p) for w, p in zip(each_sent.split(), pos_sentence[i:i + len_e])])
            i = i + len_e
def execute_segment(self, segment, src_re, src_re_reduce, ini_editD, align_features, equal):
    logging.info("Applied match PIPE")
    tgt_text = segment.target_text
    src_text = segment.source_text
    status = ''
    editD = ini_editD
    status_tokenizer = False
    if equal:
        if self.query == src_text:
            return segment, editD, 'find', equal, status_tokenizer
        else:
            equal = False
    if not equal:
        for op in self.pipe:  # Operations indicated by the parameters
            if op == 'regex':
                if self.query_dic['query'] != self.query_dic['query_re']:  # If the query has regex
                    logging.info("Applied Regex")
                    self.timer.start("_regx_match")
                    # ************************** Compare query_re with src_re --> simplified
                    match = ini_editD
                    if src_re != src_text:
                        if src_re_reduce.lower() == self.query_dic['query_re_reduce'].lower():  # With simplified regular expression and in lowercase
                            match = 100  # Perfect match
                        tgt_text, src_text = self._regex_transform(segment.source_text, segment.target_text)
                        ini_editD = self._tm_edit_distance(self.query_dic['query'], src_text, self.query_dic['query_re_reduce'], src_re_reduce)
                        logging.info("After applied Regex Segment: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                    if match == 100:
                        status = 'find'
                    self.timer.stop("_regx_match")
            if op == 'tags':
                logging.info("Delete Tags")
                self.timer.start("_tags_match")
                src_text, tgt_text, status, reduce, ini_editD = self._match_tags(src_text, src_re_reduce, tgt_text, status, ini_editD)
                logging.info("After applied Tags: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                self.timer.stop("_tags_match")
            if op == 'posTag':
                self.timer.start("fuzzy_match")
                upper = False
                if segment.source_pos is not None and segment.target_pos is not None:  # This part needs the pos tagger annotation
                    squery, tok_query, pos_query = self.check_query_parameters()
                    logging.info("Apply posTag matching")
                    self.timer.start("fuzzy_preprocess")
                    if status_tokenizer == False:  # Tokenize source and target
                        tgt_text = TMUtilsMatching.pre_process(tgt_text, self.tgt_lang, 'tokenizer', {})  # Tokenize tm_tgt
                        src_text = TMUtilsMatching.pre_process(src_text, self.src_lang, 'tokenizer', {})  # Tokenize tm_src
                        self.query_dic['query_re_reduce_tok'] = TMUtilsMatching.pre_process(self.query_dic['query_re_reduce'], self.src_lang, 'tokenizer', {})  # Tokenize the simplified query
                        status_tokenizer = True
                    if 'universal' not in self.query_dic:
                        self.query_dic['universal'] = TMUtilsMatching.segment_2_universal(tok_query.lower(), pos_query, self.src_lang)
                    src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), segment.source_pos, self.src_lang)  # [word, pos] of the tm_src segment
                    tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), segment.target_pos, self.tgt_lang)  # [word, pos] of the tm_tgt segment
                    self.timer.stop("fuzzy_preprocess")
                    if isinstance(self.query_dic['universal'], list) and isinstance(src_word_pos, list) and isinstance(tgt_word_pos, list):
                        logging.info("Check unmatched word --> PosTag")
                        if TMUtilsMatching.len_compare(pos_query.split(' '), segment.source_pos.split(' ')) is True and (tok_query != src_text):
                            logging.info("Query and source have the same length or only one difference")
                            self.timer.start("search unmatch")
                            tgt_un_match, tgt_position, operation, src_un_match, src_position, pos_tag = self._combine_feature_match(tok_query, src_word_pos, tgt_word_pos, align_features)
                            self.timer.stop("search unmatch")
logging.info("Unmatch word and operation: {} {}".format(safe_str(src_un_match), safe_str(operation), safe_str(ini_editD))) self.timer.start("create target unmatch") if src_un_match is not None: # Create new src src_text, upper = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source', upper, pos_tag) # Improve edit distance src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp) src_re_reduce = TMRegexMatch.simplified_name(src_re) penalize_match = self._improve_match(src_un_match, operation) ini_editD = self._tm_edit_distance(tok_query.lower(), src_text.lower(), self.query_dic['query_re_reduce_tok'].lower(), src_re_reduce.lower()) - penalize_match # match # Create new tgt if tgt_un_match is not None: tgt_text, upper = self._create_target_expression(tgt_text, tgt_position, operation, tgt_un_match, 'target', upper, pos_tag) # tgt_word, self.timer.stop("create target unmatch") logging.info("After applied posTag: {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD))) self.timer.stop("fuzzy_match") # Check if find or break some transformation if ini_editD > editD: editD = ini_editD if status == 'find' or status == 'break': segment.source_text = src_text segment.target_text = tgt_text return segment, editD, status, equal, status_tokenizer if editD >= self.min_match: segment.source_text = src_text segment.target_text = tgt_text status = 'find' else: #Call split rules if 'split' in self.pipe and not self.trans_segments: # Applied split if exist posTagger for source language and self.query_dic['pos'] src_text = None tgt_text = None editSplit = 0 # Split by sentences. list_sentences = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'split_sentences', {}) logging.info("split by Sentences : {} ".format(list_sentences)) # Check sentence first if len(list_sentences) > 1: split_match = TMSplitMatch([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in list_sentences], [], self.src_lang, self.tgt_lang, 'sentence', self.machine_translation, self.domain) src_text, tgt_text, editSplit = split_match._match() #print('*****Only sentences *****') #print(src_text) #print(tgt_text) #print(editSplit) if editSplit >= self.min_match: # Check if split method return segments from ActivaTM segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit else: # Split in small phrase # Check if exist split for an especific pairs of languages lang_class = G_CONFIG.get_split_rules(self.src_lang, self.tgt_lang) if lang_class: logging.info("Split Query by Phrase") all_split, all_marks = self._splitByPhrase(lang_class, list_sentences) # Check if any split rule was applied if len(all_split) > 1: # print(list_query_split) split_match = TMSplitMatch(all_split, all_marks, self.src_lang, self.tgt_lang, 'phrase', self.machine_translation, self.domain) src_text, tgt_text, editSplit = split_match._match() if editSplit >= self.min_match: #Check if split method return segments from ActivaTM segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit if editD >= self.min_match: status = 'find' status_tokenizer = True else: if not self.trans_segments: #If doesn't found any match, prepare segment to automatic translation. 
If there aren't automatic translation, then return [] #logging.info("Prepare Automatic Translation : ") self.trans_segments.append((segment, editD)) status = 'break' # If exist segment on the list, break the for and there aren't translation return segment, editD, status, equal, status_tokenizer