def tags2indexes(self, s_tags, s_pos_with_tags, s_pos, t_pos):
    print("tags2indexes: S_TAGS: {}, s_pos_with_tags: {}, S_POS: {}, T_POS: {}".format(s_tags, s_pos_with_tags, s_pos, t_pos))
    tag2index = dict()
    # For each tag (T1, T2, etc.), remove the other tags and run the prediction algorithm, based on IOB tags
    for tag in s_tags:
        tag_name = self.tag2name(tag)
        if XmlUtils.is_self_closing_tag(tag):
            # Self-closing tags are handled by a separate model
            print("Self-closing tag: {}".format(tag))
            s_iob = self.tag2iob_self_closing(s_pos_with_tags, tag)
            # TODO:
            t_iob = self.predict(s_iob, s_pos, t_pos)
            start_index, end_index = self.iob2indexes(t_iob, self_closing=True)
            tag2index['<{}/>'.format(tag_name)] = start_index
        elif XmlUtils.is_opening_tag(tag):
            print("Opening tag: {}".format(tag))
            s_iob = self.tag2iob(s_pos_with_tags, tag)
            t_iob = self.predict(s_iob, s_pos, t_pos)
            start_index, end_index = self.iob2indexes(t_iob)
            # Store the mapping from tag to predicted target indexes
            tag2index['<{}>'.format(tag_name)] = start_index
            tag2index['</{}>'.format(tag_name)] = end_index
        else:
            # Closing tag: handled together with its opening tag, nothing to do here
            pass
    return tag2index
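
# The iob2indexes helper used above is not shown in this file. A minimal sketch of the idea,
# assuming the predicted target IOB sequence uses plain 'B'/'I'/'O' labels over the target tokens
# (the function name and labels here are illustrative, not the project's actual implementation):
def _iob2indexes_sketch(iob_labels, self_closing=False):
    """Return (start, end) token indexes of the first B/I span, or (None, None) if no span was predicted."""
    start, end = None, None
    for i, label in enumerate(iob_labels):
        if label == 'B' and start is None:
            start = i       # opening tag goes right before this token
            end = i + 1     # closing tag goes right after it (extended by following 'I' labels)
        elif label == 'I' and start is not None:
            end = i + 1
        elif label == 'O' and start is not None:
            break           # span has ended
    if self_closing:
        return start, start  # a self-closing tag only needs a single insertion point
    return start, end

# Example: _iob2indexes_sketch(['O', 'B', 'I', 'O']) -> (1, 3),
# i.e. the opening tag goes before token 1 and the closing tag after token 2.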
def _preprocess(self, text, lang):
    dic_query = {}
    s_tags = XmlUtils.extract_tags(text)
    if not s_tags:
        dic_query['query'] = text
    else:
        dic_query['query'] = XmlUtils.strip_tags(text)  # Strip tags to do the match
    dic_query['tokenizer'] = TMUtilsMatching.pre_process(dic_query['query'], self.src_lang, 'tokenizer', {})
    dic_query['pos'] = TMUtilsMatching.pre_process(dic_query['tokenizer'], lang, 'pos_tagger', {})
    dic_query['universal'] = TMUtilsMatching.segment_2_universal(dic_query['tokenizer'].lower(), dic_query['pos'], lang)  # universal_text[0]
    dic_query['universal'] = dic_query['pos']
    regex_class = TMRegexMatch(self.src_lang, self.tgt_lang)  # Class to improve fuzzy matching
    dic_query['query_re'] = TMUtilsMatching.pre_process(dic_query['tokenizer'], self.src_lang, 'reg_exp', regex_class.re_pp)
    return dic_query
def machine_translate(self, tm_engine, source_lang, target_lang, in_segments, min_match):
    mt_texts = []
    mt_flags = []
    # Build the list of texts to machine translate
    for query, (segments, match_check) in in_segments:
        mt_flags.append(match_check)
        if not match_check:
            mt_texts.append(XmlUtils.strip_tags(query))
    # No text suitable for MT - return the input segments (False = not MT)
    if not mt_texts:
        return [(segments, False) for query, (segments, match_check) in in_segments]
    # Actual MT translation
    translated_texts = tm_engine.translate(mt_texts)
    # Fill the output with either the machine translation or the original segments
    out_segments = []
    for ttext, (query, (segments, match_check)) in zip(translated_texts, in_segments):
        if not segments:
            out_segments_per_q = []
        elif not match_check:
            out_segments_per_q = (
                [(self._prepare_target_text(query, segments[0][0], ttext, source_lang, target_lang), min_match)]
                if translated_texts else [],
                True
            )  # True = MT
        else:
            out_segments_per_q = (segments, False)  # False = not MT
        out_segments.append(out_segments_per_q)
    return out_segments
def process(self, text):
    # Load the tokenizer model according to the language
    nltk_model = self.models.get(self.language).split('/')[2].split('.')[0]
    text = ' '.join(self.tokenizer.word_tokenize(text, nltk_model))
    if re.search(TOK_PATTERN, text):  # Check if the text has tags
        text = XmlUtils.join_tags(text, JOIN_PATTERN)
    return text
def check_query_parameters(self):
    if 'pos' not in self.query_dic:
        # Apply POS tagging and universal tags to the query --> only the first time
        if 'tokenizer' not in self.query_dic:
            # The first transformation is posTag --> no other transformation was applied yet
            query_out_tags = XmlUtils.replace_tags(self.query_dic['query'])
            self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
        self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
    return self.query_dic['query'], self.query_dic['tokenizer'], self.query_dic['pos']
def __call__(self, index, segments_iter):
    # The import must be inside the function to avoid serializing all POS tagger dependencies
    # for parallel execution
    sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', '..'))
    sys.path = [p for p in sys.path if p]
    from TMPosTagger.TMPosTagger import TMPosTagger

    # Cache all segments. Though it might be expensive in terms of memory, we need to gather all
    # texts for the POS tagger batch and then store back the batch of POS-tagged results. The batch
    # should be small enough by splitting into a sufficiently large number of Spark jobs.
    segments = [s for s in segments_iter]
    # Initialize POS taggers for the source and target languages
    pos_taggers = [TMPosTagger(lang.split('-')[0], universal=self.is_universal) for lang in self.langs]
    # Invoke POS taggers for source and target segments
    src_texts = pos_taggers[0].tag_segments([XmlUtils.replace_tags(s.source_text) for s in segments])
    tgt_texts = pos_taggers[1].tag_segments([XmlUtils.replace_tags(s.target_text) for s in segments])

    # Store POS tags with XML tags as training data. TODO: make it optional
    f = open(tempfile.gettempdir() + "/pos_tags-{}-{}.txt".format(TMUtils.date2str(datetime.datetime.now()), index), 'w')
    iobs = open(tempfile.gettempdir() + "/iob_tags-{}-{}.txt".format(TMUtils.date2str(datetime.datetime.now()), index), 'w')
    for s, stext, ttext in zip(segments, src_texts, tgt_texts):
        s.source_pos = self.tags2string(stext)
        s.target_pos = self.tags2string(ttext)
        # Write POS tags (+ XML tags) to a text file to be used as training data
        if re.match(XmlUtils.TAG_PATTERN, s.source_text):
            f.write("{}\n{}\n\n".format(self.tags2string_xml_tags(s.source_text, stext),
                                        self.tags2string_xml_tags(s.target_text, ttext)))
            for src_iob, tgt_iob in zip(self.tags2string_iob_tags(s.source_text, stext),
                                        self.tags2string_iob_tags(s.target_text, ttext)):
                iobs.write("{}\n{}\n\n".format(src_iob, tgt_iob))
    f.close()
    iobs.close()
    return segments
def query(self, qparams):
    # Drop tags from the query
    q_out_tags = [(q, XmlUtils.strip_tags(q)) for q in qparams.qlist]
    if not qparams.qinfo:
        qparams.qinfo = [dict() for q in qparams.qlist]
    out_segments = []  # list of lists of tuples: (segment, ter)
    if qparams.concordance:
        dic_filter = [{'target_language': qparams.target_lang}]
    else:
        # Extract the query length. The total token count is not passed in;
        # the function calculates the value for each query --> target_lang
        dic_filter = self._filter_by_query(q_out_tags, qparams.source_lang, qparams.target_lang, '-', qparams.exact_length)
    if qparams.aut_trans:
        list_to_translate = []
    # Query the source ES index for the text
    self.timer.start("monoling_query")
    for q, qinfo, response in zip(qparams.qlist, qparams.qinfo,
                                  self.ml_index.mquery(qparams.source_lang, qparams.limit,
                                                       [q_o_tags for q, q_o_tags in q_out_tags],
                                                       filter=[f for f in dic_filter])):
        self.timer.stop("monoling_query")
        # Create a new result list for the current query
        out_segments.append((q, self._query(q, qinfo, response, qparams)))
    if qparams.aut_trans:
        logging.info("Machine Translation")
        last_output = []
        if not out_segments:
            for query in qparams.qlist:
                segment = TMTranslationUnit()
                segment.source_text = query
                out_segments += [(query, ([(segment, 0)], False))]
        tm_engine = TMAutomaticTranslation.get_engine(qparams.source_lang, qparams.target_lang, qparams.domains)
        for i in range(0, len(out_segments), self.TRANSLATE_BATCH_SIZE):
            # for each_query in self.execute_machine_translation(tm_engine, qparams.source_lang, qparams.target_lang, out_segments[i:i + self.TRANSLATE_BATCH_SIZE], qparams.min_match):
            for each_query in self.machine_translate(tm_engine, qparams.source_lang, qparams.target_lang,
                                                     out_segments[i:i + self.TRANSLATE_BATCH_SIZE], qparams.min_match):
                last_output.append(each_query)
    else:
        last_output = [(segments, False) for query, (segments, match_check) in out_segments]
    self.timer.stop("match_time_query")
    return last_output
def process(self, text):
    # Possibly better to transform the input text into ' ' + text + '\n'
    tokenizer = subprocess.Popen(self.args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    tok_sents, tok_exc = tokenizer.communicate(input=text.encode('utf8'))
    tokenizer.wait()
    text = tok_sents.decode("utf-8").strip('\n')
    if re.search(TOK_PATTERN, text):  # Check if the text has tags
        text = XmlUtils.join_tags(text, JOIN_PATTERN)
    return text
def process(self, text):
    # Check if there are any tags at all
    if not re.search("<.*>", text):
        return text
    # Keep the original text and its stripped version
    org_text = text
    text, stext = XmlUtils.fix_tags(text)
    try:
        # print("ORG TEXT: {}, PARSING: {}".format(org_text, text))
        text = XmlUtils.rename_tags(text)
        for e in self.parser.error_log:
            # Check for certain errors which might create problems in the TM and, in that case, remove all tags at once
            if e.type_name == 'ERR_TAG_NAME_MISMATCH' or e.type_name == 'ERR_TAG_NOT_FINISHED':
                logging.warning("Failed to parse segment text into XML: '{}' reason: {}. Removing tags instead".format(org_text, e))
                return stext
    except Exception as ex:
        logging.warning("Failed to rename tags in {}, reason: {}. Removing tags instead: {}".format(org_text, ex, stext))
        return stext
    return text
def tags2string_xml_tags(self, text, text_pos):
    pos_str = self.tags2string(text_pos)
    # If no XML tags are found, just return the concatenated POS tags
    tags = XmlUtils.extract_tags(text)
    if not tags:
        return pos_str
    pos = []
    for word_pos in text_pos:
        # Concatenate POS tags and XML tags into one string
        if word_pos[0] == XmlUtils.TAG_PLACEHOLDER:
            pos.append(tags.pop(0))
        elif len(word_pos) < 2:
            continue
        else:
            pos.append(word_pos[1])
    return " ".join(pos)
def __call__(self, s_txt, t_txt):
    s_tags = XmlUtils.extract_tags(s_txt)
    if not s_tags:
        return t_txt
    t_tags = XmlUtils.extract_tags(t_txt)
    # Number of tags is equal - just replace them one by one
    if len(s_tags) == len(t_tags):
        for s_tag, t_tag in zip(s_tags, t_tags):
            t_txt = t_txt.replace(t_tag, s_tag, 1)
        return t_txt
    else:
        s_toks = TMTextProcessors.tokenizer(self.langs[0]).tokenizer.process(
            XmlUtils.replace_tags(XmlUtils.fix_tags(s_txt)[0],
                                  adjacent_space_placeholder=XmlUtils.SPACE_PLACEHOLDER)).split()
        # TODO: s_universal = self._preprocess(s_toks, self.langs[0])
        # Strip all tags from the target text before tokenizing it
        t_toks = TMTextProcessors.tokenizer(self.langs[1]).tokenizer.process(XmlUtils.strip_tags(t_txt)).split()
        # TODO: t_universal = self._preprocess(t_toks, self.langs[1])
        t_toks_new = []
        # Iterate over the tokenized source and target text and apply a simple alignment algorithm (by token).
        # Insert source tags at the aligned places in the target text.
        ti = 0
        for si in range(0, len(s_toks)):
            if s_toks[si] == XmlUtils.TAG_PLACEHOLDER:
                t_toks_new.append(s_tags.pop(0))
            elif s_toks[si] == XmlUtils.SPACE_PLACEHOLDER:
                t_toks_new.append(XmlUtils.SPACE_PLACEHOLDER)
            elif ti < len(t_toks):
                t_toks_new.append(t_toks[ti])
                ti += 1
            else:
                break  # source is longer than target, stop here
        # Append the remaining target tokens
        if ti < len(t_toks):
            t_toks_new += t_toks[ti:]
        # If not all tags have been aligned, just concatenate the remaining ones at the end
        if s_tags:
            t_toks_new += s_tags
        # Join the tokenized text into a string. TODO: implement as a part of the TMTokenizer class (language-dependent)
        # return self.tok[1].join(t_toks_new)
        ttext_with_tags = XmlUtils.join_tags(' '.join(t_toks_new),
                                             '(</?[^<>]+/?>)([^<>]+)(</?[^<>]+/?>)')  # join words with tags: <b> this </b> --> <b>this</b>
        # Handle whitespace adjacent to tags
        ttext_with_tags = re.sub(r'\s+<', '<', ttext_with_tags)
        ttext_with_tags = re.sub(r'>\s+', '>', ttext_with_tags)
        ttext_with_tags = re.sub(XmlUtils.SPACE_PLACEHOLDER, '', ttext_with_tags)
        return ttext_with_tags
def tags2string_iob_tags(self, text, text_pos):
    pos_str = self.tags2string(text_pos)
    # If no XML tags are found, just return the concatenated POS tags
    tags = XmlUtils.extract_tags(text)
    if not tags:
        return pos_str
    pos = []
    for word_pos in text_pos:
        # Concatenate POS tags and XML tags into one string
        if word_pos[0] == XmlUtils.TAG_PLACEHOLDER:
            pos.append(tags.pop(0))
        elif len(word_pos) < 2:
            continue
        else:
            pos.append(word_pos[1])
    iobs = []
    for w in pos:
        if self.is_self_closing_tag(w):
            iob = self.tag2iob(pos, w)
            if iob:
                iobs.append(iob)
    return iobs
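
# The tag2iob helper used above (and in tags2indexes) is not shown in this file. A minimal sketch
# of the paired-tag case, assuming the input is the POS sequence with the tag pair embedded in it
# and the output is one IOB label per non-tag token; the self-closing variant used above would
# differ. The function name and labels here are illustrative, not the project's implementation:
def _tag2iob_sketch(pos_with_tags, tag_name='T1'):
    labels, inside, first = [], False, True
    for item in pos_with_tags:
        if item == '<{}>'.format(tag_name):
            inside, first = True, True
        elif item == '</{}>'.format(tag_name):
            inside = False
        else:
            labels.append(('B' if first else 'I') if inside else 'O')
            if inside:
                first = False
    return labels

# Example: _tag2iob_sketch(['NOUN', 'VERB', '<T1>', 'DET', 'NOUN', '</T1>']) -> ['O', 'O', 'B', 'I']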
def _validate_pipe(self, pipe):
    match_process = {
        'regex': None,
        'posTag': None,
        'tags': TMTags()
    }
    try:
        match_process['regex'] = TMRegexMatch(self.src_lang, self.tgt_lang)
        logging.info("Loading regex for matching")
    except ValueError:
        if 'regex' in pipe:
            pipe.pop(pipe.index('regex'))
        logging.info("Unsupported regex for matching")
    query_out_tags = XmlUtils.replace_tags(self.query)
    try:
        if 'tokenizer' not in self.query_dic:
            self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
        logging.info("Loading tokenizer for {}".format(self.src_lang))
        try:
            if 'pos' not in self.query_dic:
                self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
            match_process['posTag'] = TMPosMatch(self.src_lang, self.tgt_lang)
            logging.info("Loading posTag for matching")
        except Exception as e:
            if 'posTag' in pipe:
                pipe.pop(pipe.index('posTag'))
            logging.info("Unsupported posTag for matching")
    except Exception as e:
        if 'posTag' in pipe:
            pipe.pop(pipe.index('posTag'))
        logging.info("Unsupported tokenizer for {}".format(self.src_lang))
    return match_process, pipe
def __call__(self, s_txt, t_txt):
    # Extract source tags to be transferred: ['<X[1]>', '</X[1]>']
    print("Source text: {}".format(s_txt))
    s_tags = XmlUtils.extract_tags(s_txt)
    print("Source tags: {}".format(s_tags))
    if not s_tags:
        return t_txt
    # Remove any tags from the target
    t_txt = XmlUtils.strip_tags(t_txt)
    # Rename tags to avoid problems in the XML parser:
    # I have <X[1]>a dog</X[1]> ---> I have <T1>a dog</T1>
    s_txt_fixed = XmlUtils.simplify_tags(s_txt)
    s_tags_fixed = XmlUtils.extract_tags(s_txt_fixed)
    print("Fixed source tags: {}".format(s_tags_fixed))
    # Keep a mapping of fixed tags to original tags for the final recovery:
    # tags_map = {'<T1>': '<X[1]>', '</T1>': '</X[1]>'}
    assert len(s_tags_fixed) == len(s_tags)
    tags_map = dict(zip(s_tags_fixed, s_tags))
    print("Tags map: {}".format(tags_map))
    # Run POS tagging (first replacing XML tags with a placeholder in the source text):
    # I have <T1>a dog</T1> --> I have ELASTICTMTAG a dog ELASTICTMTAG
    # --> I/NOUN have/VERB ELASTICTMTAG/NOUN a/DET dog/NOUN ELASTICTMTAG/NOUN
    s_pos = self.pos_taggers[0].tag_segments([XmlUtils.replace_tags(s_txt_fixed)])[0]
    t_pos = self.pos_taggers[1].tag_segments([t_txt])[0]
    # Recover fixed tags:
    # I,NOUN have,VERB ELASTICTMTAG,NOUN a,DET dog,NOUN ELASTICTMTAG,NOUN
    # ---> NOUN VERB <T1> DET NOUN </T1>
    s_pos_with_tags, s_pos = XmlUtils.recover_tags_pos(s_pos, s_tags_fixed)
    print("S_POS_WITH_TAGS: {}, S_POS: {}, T_POS: {}".format(s_pos_with_tags, s_pos, t_pos))
    # For each tag (T1, T2, etc.), remove the other tags and run the prediction algorithm based on IOB tags.
    # The return value is a map of tags to their corresponding indexes in the (tokenized) target text.
    tag2t_index = self.tags2indexes(s_tags_fixed, s_pos_with_tags, s_pos, [t[1] for t in t_pos])
    # Place tags at the predicted indexes in the target text
    t_txt_with_tags = self.place_tags(s_tags_fixed, tag2t_index, tags_map, t_pos)
    if not t_txt_with_tags:
        return None
    # TODO: join using language-specific "joiner" (opposite of tokenizer)
    return " ".join(t_txt_with_tags)
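
# Hedged standalone sketch of the placeholder round-trip described in the comments above:
# XmlUtils.replace_tags / recover_tags_pos are not shown in this file, so the names below
# (PLACEHOLDER, _replace_tags_sketch, _recover_tags_sketch) are hypothetical illustrations only.
import re

PLACEHOLDER = 'ELASTICTMTAG'

def _replace_tags_sketch(text):
    """Replace every XML tag with a plain placeholder token so the POS tagger sees ordinary words."""
    return re.sub(r'</?[^<>]+/?>', ' ' + PLACEHOLDER + ' ', text)

def _recover_tags_sketch(tagged_tokens, tags):
    """Put the original tags back in place of the placeholder tokens of a (word, pos) sequence."""
    out, remaining = [], list(tags)
    for word, pos in tagged_tokens:
        out.append(remaining.pop(0) if word == PLACEHOLDER and remaining else pos)
    return out

# Example round-trip (whitespace not normalized):
# _replace_tags_sketch("I have <T1>a dog</T1>") -> "I have ELASTICTMTAG a dog ELASTICTMTAG"
# _recover_tags_sketch([("I", "NOUN"), ("have", "VERB"), ("ELASTICTMTAG", "NOUN"),
#                       ("a", "DET"), ("dog", "NOUN"), ("ELASTICTMTAG", "NOUN")],
#                      ["<T1>", "</T1>"]) -> ['NOUN', 'VERB', '<T1>', 'DET', 'NOUN', '</T1>']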
def reduce_tags(str_in):
    return XmlUtils.reduce_tags(str_in)
def strip_tags(str_in):
    return re.sub(r"\s\s+", " ", XmlUtils.strip_tags(str_in))
def process(self, sentences):
    text = self.tm_tokenize.segment(sentences).strip('\n')
    if re.search(TOK_PATTERN, text):  # Check if the text has tags
        text = XmlUtils.join_tags(text, JOIN_PATTERN)
    return text
def process(self, text):
    text = ' '.join(self.tokenizer.wordpunct_tokenize(text))
    if re.search(TOK_PATTERN, text):  # Check if the text has tags
        text = XmlUtils.join_tags(text, JOIN_PATTERN)
    return text