def execute_segment(self, segment, src_re, src_re_reduce, ini_editD, align_features, equal):
    logging.info("Applying match PIPE")
    tgt_text = segment.target_text
    src_text = segment.source_text
    status = ''
    editD = ini_editD
    status_tokenizer = False
    if equal:
        if self.query == src_text:
            return segment, editD, 'find', equal, status_tokenizer
        else:
            equal = False
    if not equal:
        for op in self.pipe:  # Operations to run are given by parameters
            if op == 'regex':
                if self.query_dic['query'] != self.query_dic['query_re']:  # Only if the query contains regex patterns
                    logging.info("Applying regex")
                    self.timer.start("_regx_match")
                    # Compare the simplified query_re with the simplified src_re
                    match = ini_editD
                    if src_re != src_text:
                        if src_re_reduce.lower() == self.query_dic['query_re_reduce'].lower():
                            # Perfect match on the simplified, lowercased regular expressions
                            match = 100
                    tgt_text, src_text = self._regex_transform(segment.source_text, segment.target_text)
                    ini_editD = self._tm_edit_distance(self.query_dic['query'], src_text, self.query_dic['query_re_reduce'], src_re_reduce)
                    logging.info("After regex, segment: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                    if match == 100:
                        status = 'find'
                    self.timer.stop("_regx_match")
            if op == 'tags':
                logging.info("Deleting tags")
                self.timer.start("_tags_match")
                src_text, tgt_text, status, _reduce, ini_editD = self._match_tags(src_text, src_re_reduce, tgt_text, status, ini_editD)
                logging.info("After tag deletion: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                self.timer.stop("_tags_match")
            if op == 'posTag':
                self.timer.start("fuzzy_match")
                upper = False
                if segment.source_pos is not None and segment.target_pos is not None:
                    # This step needs the POS tagger annotation
                    squery, tok_query, pos_query = self.check_query_parameters()
                    logging.info("Applying posTag matching")
                    self.timer.start("fuzzy_preprocess")
                    if not status_tokenizer:
                        # Tokenize source, target and the simplified query
                        tgt_text = TMUtilsMatching.pre_process(tgt_text, self.tgt_lang, 'tokenizer', {})
                        src_text = TMUtilsMatching.pre_process(src_text, self.src_lang, 'tokenizer', {})
                        self.query_dic['query_re_reduce_tok'] = TMUtilsMatching.pre_process(self.query_dic['query_re_reduce'], self.src_lang, 'tokenizer', {})
                        status_tokenizer = True
                    if 'universal' not in self.query_dic:
                        self.query_dic['universal'] = TMUtilsMatching.segment_2_universal(tok_query.lower(), pos_query, self.src_lang)
                    src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), segment.source_pos, self.src_lang)  # [word, pos] of the TM source segment
                    tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), segment.target_pos, self.tgt_lang)  # [word, pos] of the TM target segment
                    self.timer.stop("fuzzy_preprocess")
                    if isinstance(self.query_dic['universal'], list) and isinstance(src_word_pos, list) and isinstance(tgt_word_pos, list):
                        logging.info("Checking unmatched word --> posTag")
                        if TMUtilsMatching.len_compare(pos_query.split(' '), segment.source_pos.split(' ')) is True and (tok_query != src_text):
                            logging.info("Query and source have the same length or differ by only one word")
                            self.timer.start("search unmatch")
                            tgt_un_match, tgt_position, operation, src_un_match, src_position, pos_tag = self._combine_feature_match(tok_query, src_word_pos, tgt_word_pos, align_features)
                            self.timer.stop("search unmatch")
                            logging.info("Unmatched word, operation and score: {} {} {}".format(safe_str(src_un_match), safe_str(operation), safe_str(ini_editD)))
                            self.timer.start("create target unmatch")
                            if src_un_match is not None:
                                # Create the new source and recompute the edit distance
                                src_text, upper = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source', upper, pos_tag)
                                src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
                                src_re_reduce = TMRegexMatch.simplified_name(src_re)
                                penalize_match = self._improve_match(src_un_match, operation)
                                ini_editD = self._tm_edit_distance(tok_query.lower(), src_text.lower(), self.query_dic['query_re_reduce_tok'].lower(), src_re_reduce.lower()) - penalize_match
                            if tgt_un_match is not None:
                                # Create the new target
                                tgt_text, upper = self._create_target_expression(tgt_text, tgt_position, operation, tgt_un_match, 'target', upper, pos_tag)
                            self.timer.stop("create target unmatch")
                logging.info("After posTag: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                self.timer.stop("fuzzy_match")
            # Keep the best score; stop if a transformation found or broke the match
            if ini_editD > editD:
                editD = ini_editD
            if status in ('find', 'break'):
                segment.source_text = src_text
                segment.target_text = tgt_text
                return segment, editD, status, equal, status_tokenizer
    if editD >= self.min_match:
        segment.source_text = src_text
        segment.target_text = tgt_text
        status = 'find'
    else:
        # Fall back to the split rules
        if 'split' in self.pipe and not self.trans_segments:
            # Applied only if a posTagger exists for the source language
            src_text = None
            tgt_text = None
            editSplit = 0
            # Split into sentences first
            list_sentences = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'split_sentences', {})
            logging.info("Split by sentences: {}".format(list_sentences))
            if len(list_sentences) > 1:
                split_match = TMSplitMatch([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in list_sentences], [], self.src_lang, self.tgt_lang, 'sentence', self.machine_translation, self.domain)
                src_text, tgt_text, editSplit = split_match._match()
                if editSplit >= self.min_match:  # The split method returned segments from ActivaTM
                    segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit
            else:
                # Split into smaller phrases, if split rules exist for this language pair
                lang_class = G_CONFIG.get_split_rules(self.src_lang, self.tgt_lang)
                if lang_class:
                    logging.info("Splitting query by phrase")
                    all_split, all_marks = self._splitByPhrase(lang_class, list_sentences)
                    if len(all_split) > 1:  # At least one split rule was applied
                        split_match = TMSplitMatch(all_split, all_marks, self.src_lang, self.tgt_lang, 'phrase', self.machine_translation, self.domain)
                        src_text, tgt_text, editSplit = split_match._match()
                        if editSplit >= self.min_match:  # The split method returned segments from ActivaTM
                            segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit
            if editD >= self.min_match:
                status = 'find'
                status_tokenizer = True
        else:
            if not self.trans_segments:
                # No match was found: queue the segment for automatic translation.
                # If automatic translation is unavailable as well, [] is returned.
                self.trans_segments.append((segment, editD))
                status = 'break'  # A queued segment breaks the loop; there is no translation
    return segment, editD, status, equal, status_tokenizer
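# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the class): execute_segment boils down to
# "run each transformation in self.pipe, keep the best edit-distance score,
# and exit early on 'find' or 'break'". The helper below only mirrors that
# control flow; run_op and min_match are hypothetical stand-ins for the
# per-operation logic and self.min_match.
#
#   def run_pipe(pipe, run_op, ini_score, min_match):
#       best, status = ini_score, ''
#       for op in pipe:                      # e.g. ['regex', 'tags', 'posTag']
#           score, status = run_op(op)       # one transformation step
#           best = max(best, score)          # keep the best edit distance
#           if status in ('find', 'break'):  # early exit, as in execute_segment
#               return best, status
#       return best, 'find' if best >= min_match else ''
# ---------------------------------------------------------------------------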
            print(' '.join(map(str, split_source[j])))
            print(' '.join(map(str, split_target[value_target[j]])))
            print('\n')
    print('Generated segments ' + str(count))


if __name__ == "__main__":
    args = parse_args()
    lang = args.source
    in_file = codecs.open(args.file, 'r')
    lang_class = G_CONFIG.get_split_rules(args.source, args.target)
    if lang_class:
        # Build the split task and the tokenizer/POS tagger for the source language
        splitTask = TMSplit(lang_class, args.source)
        tok = TMTokenizer(lang.upper()).tokenizer
        pos = TMPosTagger(lang.upper())
        for eline in in_file.readlines():
            tok_sentences = tok.process(eline)
            print(tok_sentences)
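# Usage sketch (assumed CLI; the flag names mirror the argparse fields used
# above, and the script name is hypothetical):
#
#   python tm_split.py --source en --target es --file segments.txt
#
# Each line of --file is tokenized and printed; split rules are only loaded
# when G_CONFIG defines them for the given language pair.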