Example #1
    def tags2indexes(self, s_tags, s_pos_with_tags, s_pos, t_pos):
        print(
            "tags2indexes: S_TAGS: {}, s_pos_with_tags: {}, S_POS: {}, T_POS: {}"
            .format(s_tags, s_pos_with_tags, s_pos, t_pos))

        tag2index = dict()
        # For each tag (T1, T2 etc.), remove other tags and run prediction algorithm, based on IOB tags
        for tag in s_tags:
            tag_name = self.tag2name(tag)
            # Self-closing tags should be handled with a separate model
            if XmlUtils.is_self_closing_tag(tag):
                print("Self-closing tag: {}".format(tag))
                s_iob = self.tag2iob_self_closing(s_pos_with_tags, tag)
                # TODO:
                t_iob = self.predict(s_iob, s_pos, t_pos)
                start_index, end_index = self.iob2indexes(t_iob,
                                                          self_closing=True)
                tag2index['<{}/>'.format(tag_name)] = start_index
            elif XmlUtils.is_opening_tag(tag):
                print("Opening tag: {}".format(tag))
                s_iob = self.tag2iob(s_pos_with_tags, tag)
                t_iob = self.predict(s_iob, s_pos, t_pos)
                start_index, end_index = self.iob2indexes(t_iob)
                # Store mapping
                tag2index['<{}>'.format(tag_name)] = start_index
                tag2index['</{}>'.format(tag_name)] = end_index
            else:
                # Closing tag (handled together with its opening tag): do nothing
                pass
        return tag2index
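
The predict/iob2indexes pair is not shown above; a minimal sketch of how an IOB sequence could be turned into start/end token indexes (the function name, IOB labels and single-span assumption are illustrative, not the project's actual implementation):

    def iob2indexes_sketch(iob_tags):
        # Find the first span labelled B-*/I-* and return its token indexes
        start = end = None
        for i, label in enumerate(iob_tags):
            if label.startswith('B-'):
                start = end = i
            elif label.startswith('I-') and start is not None:
                end = i
        return start, end

    # ['O', 'B-T1', 'I-T1', 'O'] --> tags wrap tokens 1..2
    print(iob2indexes_sketch(['O', 'B-T1', 'I-T1', 'O']))  # (1, 2)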
Example #2
    def _preprocess(self, text, lang):

        dic_query = {}
        s_tags = XmlUtils.extract_tags(text)
        if not s_tags:
            dic_query['query'] = text
        else:
            dic_query['query'] = XmlUtils.strip_tags(
                text)  # strip tags so the match runs on plain text

        dic_query['tokenizer'] = TMUtilsMatching.pre_process(
            dic_query['query'], self.src_lang, 'tokenizer', {})
        dic_query['pos'] = TMUtilsMatching.pre_process(dic_query['tokenizer'],
                                                       lang, 'pos_tagger', {})
        dic_query['universal'] = TMUtilsMatching.segment_2_universal(
            dic_query['tokenizer'].lower(), dic_query['pos'],
            lang)  # universal_text[0]
        # NOTE: the next line overwrites the universal tags computed above
        # with the plain POS tags
        dic_query['universal'] = dic_query['pos']

        regex_class = TMRegexMatch(
            self.src_lang, self.tgt_lang)  # Class to improve fuzzy match
        dic_query['query_re'] = TMUtilsMatching.pre_process(
            dic_query['tokenizer'], self.src_lang, 'reg_exp',
            regex_class.re_pp)
        return dic_query
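
For reference, dic_query is a plain dict; the shape below is a hypothetical example for an English query (the field values are assumptions for illustration, not real pre_process output):

    dic_query = {
        'query': 'Click the button',       # input text with tags stripped
        'tokenizer': 'Click the button',   # tokenized text
        'pos': 'VERB DET NOUN',            # POS-tagged text
        'universal': 'VERB DET NOUN',      # overwritten with 'pos' (see note above)
        'query_re': 'Click the button',    # after regex preprocessing
    }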
Example #3
File: TMDbApi.py Project: MittagQI/nectm
 def machine_translate(self, tm_engine, source_lang, target_lang,
                       in_segments, min_match):
     mt_texts = []
     mt_flags = []
     # Build list of texts to machine translate
     for query, (segments, match_check) in in_segments:
         mt_flags.append(match_check)
         if not match_check:
             mt_texts.append(XmlUtils.strip_tags(query))
     # No text suitable for MT - return input segments (False = Non-MT)
     if not mt_texts:
         return [(segments, False)
                 for query, (segments, match_check) in in_segments]
     # Actual MT translation
     translated_texts = tm_engine.translate(mt_texts)
     # Fill output by either machine translation or segment
     out_segments = []
     for ttext, (query, (segments,
                         match_check)) in zip(translated_texts,
                                              in_segments):
         if not segments:
             out_segments_per_q = ([], False)  # keep the (segments, flag) tuple contract
         elif not match_check:
             out_segments_per_q = (
                 [(self._prepare_target_text(query, segments[0][0], ttext,
                                             source_lang, target_lang),
                   min_match)] if translated_texts else [], True
             )  # True = MT
         else:
             out_segments_per_q = (segments, False)  # False = not MT
         out_segments.append(out_segments_per_q)
     return out_segments
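
The flag-driven split/merge above can be reduced to a self-contained sketch (the fake engine and data are assumptions for demonstration only):

    def merge_mt_sketch(in_segments, translate):
        # Translate only queries whose match_check flag is False;
        # keep already-matched segments as they are
        to_mt = [q for q, (seg, ok) in in_segments if not ok]
        translated = iter(translate(to_mt))
        return [(seg, False) if ok else ([next(translated)], True)
                for q, (seg, ok) in in_segments]

    fake_engine = lambda texts: [t.upper() for t in texts]
    print(merge_mt_sketch([('hi', (['match'], True)), ('yo', ([], False))],
                          fake_engine))  # [(['match'], False), (['YO'], True)]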
Example #4
 def process(self, text):
     # Load the NLTK tokenizer model matching the configured language
     nltk_model = self.models.get(self.language).split('/')[2].split('.')[0]
     text = ' '.join(self.tokenizer.word_tokenize(text, nltk_model))
     if re.search(TOK_PATTERN, text):  # Check if the text has tags
         text = XmlUtils.join_tags(text, JOIN_PATTERN)
     return text
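
XmlUtils.join_tags and JOIN_PATTERN are project code; a hedged stdlib approximation of the rejoining they appear to perform (the pattern below is an assumption based on the comment in example #11):

    import re

    def join_tags_sketch(text):
        # join words with tags: <b> this </b> --> <b>this</b>
        return re.sub(r'(</?[^<>]+/?>) ([^<>]+) (</?[^<>]+/?>)', r'\1\2\3', text)

    print(join_tags_sketch('keep <b> this </b> together'))  # keep <b>this</b> together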
Example #5
 def check_query_parameters(self):
   if 'pos' not in self.query_dic:  # Apply POS tagging on the query --> only the first time
     if 'tokenizer' not in self.query_dic:  # Tokenize first if no other transformation was applied yet
       query_out_tags = XmlUtils.replace_tags(self.query_dic['query'])
       self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
     self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
   return self.query_dic['query'], self.query_dic['tokenizer'], self.query_dic['pos']
Example #6
    def __call__(self, index, segments_iter):
        # Import should be inside the function to avoid serializing all pos tagger dependencies
        # for parallel execution
        sys.path.append(
            os.path.join(os.path.abspath(os.path.dirname(__file__)), '..',
                         '..'))
        sys.path = [p for p in sys.path if p]
        from TMPosTagger.TMPosTagger import TMPosTagger

        # Cache all segments. This may be expensive in terms of memory, but we
        # need to gather all texts for the POS tagger batch and then store the
        # POS-tagged results back. Each batch stays small enough because the
        # data is split into a sufficiently large number of Spark jobs
        segments = [s for s in segments_iter]
        # Initialize PosTaggers for source and target languages
        pos_taggers = [
            TMPosTagger(lang.split('-')[0], universal=self.is_universal)
            for lang in self.langs
        ]
        # Invoke POS taggers for source and target segments
        src_texts = pos_taggers[0].tag_segments(
            [XmlUtils.replace_tags(s.source_text) for s in segments])
        tgt_texts = pos_taggers[1].tag_segments(
            [XmlUtils.replace_tags(s.target_text) for s in segments])
        # Store POS tags with XML tags as training data. TODO: make it optional
        f = open(
            tempfile.gettempdir() + "/pos_tags-{}-{}.txt".format(
                TMUtils.date2str(datetime.datetime.now()), index), 'w')
        iobs = open(
            tempfile.gettempdir() + "/iob_tags-{}-{}.txt".format(
                TMUtils.date2str(datetime.datetime.now()), index), 'w')
        for s, stext, ttext in zip(segments, src_texts, tgt_texts):
            s.source_pos = self.tags2string(stext)
            s.target_pos = self.tags2string(ttext)
            # Write POS tags (+XML tags) to a text file to be used as training data
            if re.match(XmlUtils.TAG_PATTERN, s.source_text):
                f.write("{}\n{}\n\n".format(
                    self.tags2string_xml_tags(s.source_text, stext),
                    self.tags2string_xml_tags(s.target_text, ttext)))
                for s_iob, t_iob in zip(
                        self.tags2string_iob_tags(s.source_text, stext),
                        self.tags2string_iob_tags(s.target_text, ttext)):
                    iobs.write("{}\n{}\n\n".format(s_iob, t_iob))

        f.close()
        iobs.close()
        return segments
Example #7
File: TMDbApi.py Project: MittagQI/nectm
    def query(self, qparams):
        # Drop tags from query
        q_out_tags = [(q, XmlUtils.strip_tags(q)) for q in qparams.qlist]
        if not qparams.qinfo:
            qparams.qinfo = [dict() for q in qparams.qlist]

        out_segments = []  # list of lists of tuples :(segment, ter)
        if qparams.concordance:
            dic_filter = [{'target_language': qparams.target_lang}]
        else:
            # Extract query length
            dic_filter = self._filter_by_query(
                q_out_tags, qparams.source_lang, qparams.target_lang, '-',
                qparams.exact_length
            )  # The total token count isn't passed; the function computes it per query --> target_lang
        # Query source ES for the text
        self.timer.start("monoling_query")

        for q, qinfo, response in zip(
                qparams.qlist, qparams.qinfo,
                self.ml_index.mquery(qparams.source_lang,
                                     qparams.limit,
                                     [q_o_tags for q, q_o_tags in q_out_tags],
                                     filter=[f for f in dic_filter])):
            self.timer.stop("monoling_query")
            out_segments.append(
                (q, self._query(q, qinfo, response,
                                qparams)))  # create new list for current query

        if qparams.aut_trans:
            logging.info("Machine Translation")
            last_output = []
            if not out_segments:
                for query in qparams.qlist:
                    segment = TMTranslationUnit()
                    segment.source_text = query
                    out_segments += [(query, ([(segment, 0)], False))]
            tm_engine = TMAutomaticTranslation.get_engine(
                qparams.source_lang, qparams.target_lang, qparams.domains)
            for i in range(0, len(out_segments), self.TRANSLATE_BATCH_SIZE):
                #for each_query in self.execute_machine_translation(tm_engine, qparams.source_lang, qparams.target_lang, out_segments[i:i + self.TRANSLATE_BATCH_SIZE], qparams.min_match):
                for each_query in self.machine_translate(
                        tm_engine, qparams.source_lang, qparams.target_lang,
                        out_segments[i:i + self.TRANSLATE_BATCH_SIZE],
                        qparams.min_match):
                    last_output.append(each_query)
        else:
            last_output = [(segments, False)
                           for query, (segments, match_check) in out_segments]
        self.timer.stop("match_time_query")
        return last_output
Example #8
    def process(self, text):
        # It might be better to transform the input text into ' ' + text + '\n'

        tokenizer = subprocess.Popen(self.args,
                                     stdin=subprocess.PIPE,
                                     stdout=subprocess.PIPE)
        tok_sents, tok_exc = tokenizer.communicate(input=text.encode('utf8'))
        # communicate() already waits for the tokenizer process to exit
        text = (tok_sents.decode("utf-8")).strip('\n')

        if re.search(TOK_PATTERN, text):  # Check if the text has tags
            text = XmlUtils.join_tags(text, JOIN_PATTERN)
        return text
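
A self-contained illustration of the Popen/communicate pattern used above (the child command is a stand-in echo process, not the project's real tokenizer):

    import subprocess
    import sys

    # Echo stdin back through a child Python process
    args = [sys.executable, '-c', 'import sys; sys.stdout.write(sys.stdin.read())']
    proc = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    out, _ = proc.communicate(input='some text\n'.encode('utf8'))
    print(out.decode('utf-8').strip('\n'))  # some text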
Example #9
    def process(self, text):
        # Check if there are any tags at all
        if not re.search("<.*>", text): return text
        # Keep original text and its stripped version
        org_text = text
        text, stext = XmlUtils.fix_tags(text)
        try:
            #print("ORG TEXT: {}, PARSING: {}".format(org_text, text))
            text = XmlUtils.rename_tags(text)
            for e in self.parser.error_log:
                # Certain errors can create problems in the TM, so remove all tags at once
                if e.type_name in ('ERR_TAG_NAME_MISMATCH', 'ERR_TAG_NOT_FINISHED'):
                    logging.warning(
                        "Failed to parse segment text into XML: '{}' reason: {}. Removing tags instead"
                        .format(org_text, e))
                    return stext

        except Exception as ex:
            logging.warning(
                "Failed to rename tags in {}, reason: {}. Removing tags instead: {}"
                .format(org_text, ex, stext))
            return stext
        return text
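
For context, lxml collects parse errors in parser.error_log and each entry exposes type_name; a minimal sketch, assuming lxml is installed and using recover=True so parsing continues past errors:

    from lxml import etree

    parser = etree.XMLParser(recover=True)
    etree.fromstring('<a><b>mismatched</a>', parser)
    for e in parser.error_log:
        print(e.type_name)  # e.g. ERR_TAG_NAME_MISMATCH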
Example #10
    def tags2string_xml_tags(self, text, text_pos):
        pos_str = self.tags2string(text_pos)
        # If no XML tags found, just return concatenated POS tags
        tags = XmlUtils.extract_tags(text)
        if not tags: return pos_str
        pos = []

        for word_pos in text_pos:
            # Concatenate POS tags and XML tags into the string
            if word_pos[0] == XmlUtils.TAG_PLACEHOLDER:
                pos.append(tags.pop(0))
            elif len(word_pos) < 2:
                continue
            else:
                pos.append(word_pos[1])

        return " ".join(pos)
Example #11
    def __call__(self, s_txt, t_txt):
        s_tags = XmlUtils.extract_tags(s_txt)
        if not s_tags: return t_txt

        t_tags = XmlUtils.extract_tags(t_txt)
        # Number of tags is equal - just replace one by one
        if len(s_tags) == len(t_tags):
            for s_tag, t_tag in zip(s_tags, t_tags):
                t_txt = t_txt.replace(t_tag, s_tag, 1)
            return t_txt
        else:
            s_toks = TMTextProcessors.tokenizer(self.langs[0]).tokenizer.process(
                XmlUtils.replace_tags(
                    XmlUtils.fix_tags(s_txt)[0],
                    adjacent_space_placeholder=XmlUtils.SPACE_PLACEHOLDER)).split()
            # TODO: s_universal = self._preprocess(s_toks, self.langs[0])
            # Strip all tags from target text before tokenizing it
            t_toks = TMTextProcessors.tokenizer(
                self.langs[1]).tokenizer.process(
                    XmlUtils.strip_tags(t_txt)).split()
            #TODO: t_universal = self._preprocess(t_toks, self.langs[1])
            t_toks_new = []
            # Iterate over tokenized source and target text and apply a simple alignment algorithm (by token).
            # Insert source tags at the aligned places in the target text
            ti = 0
            for si in range(0, len(s_toks)):
                if s_toks[si] == XmlUtils.TAG_PLACEHOLDER:
                    t_toks_new.append(s_tags.pop(0))
                elif s_toks[si] == XmlUtils.SPACE_PLACEHOLDER:
                    t_toks_new.append(XmlUtils.SPACE_PLACEHOLDER)
                elif ti < len(t_toks):
                    t_toks_new.append(t_toks[ti])
                    ti += 1
                else:
                    break  # source is longer than target, stop here
            # Append remaining target tokens
            if ti < len(t_toks): t_toks_new += t_toks[ti:]
            # If not all tags have been aligned, just concatenate remaining ones to the end
            if s_tags: t_toks_new += s_tags
        # Join tokenized text into string. TODO: implement as a part of TMTokenizer class (language-dependent)
        # return self.tok[1].join(t_toks_new)
        ttext_with_tags = XmlUtils.join_tags(
            ' '.join(t_toks_new), '(</?[^<>]+/?>)([^<>]+)(</?[^<>]+/?>)'
        )  # --> join words with tags <b> this </b> --> <b>this</b>
        # Handle whitespaces which are adjacent to tags
        ttext_with_tags = re.sub(r'\s+<', '<', ttext_with_tags)
        ttext_with_tags = re.sub(r'>\s+', '>', ttext_with_tags)
        ttext_with_tags = re.sub(XmlUtils.SPACE_PLACEHOLDER, '',
                                 ttext_with_tags)
        return ttext_with_tags
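
The alignment loop boils down to a positional walk over source tokens; a stdlib sketch with a placeholder constant (the placeholder value and sample data are assumptions):

    TAG_PH = 'ELASTICTMTAG'

    def transfer_tags_sketch(s_toks, t_toks, s_tags):
        out, ti = [], 0
        for tok in s_toks:
            if tok == TAG_PH:            # source position of a tag
                out.append(s_tags.pop(0))
            elif ti < len(t_toks):       # copy the next target token
                out.append(t_toks[ti])
                ti += 1
        out += t_toks[ti:]               # leftover target tokens
        out += s_tags                    # unaligned tags go to the end
        return ' '.join(out)

    print(transfer_tags_sketch(['I', 'have', TAG_PH, 'a', 'dog', TAG_PH],
                               ['ich', 'habe', 'einen', 'Hund'],
                               ['<b>', '</b>']))  # ich habe <b> einen Hund </b>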
Example #12
    def tags2string_iob_tags(self, text, text_pos):
        pos_str = self.tags2string(text_pos)
        # If no XML tags found, just return concatenated POS tags
        tags = XmlUtils.extract_tags(text)
        if not tags: return pos_str
        pos = []

        for word_pos in text_pos:
            # Concatenate POS tags and XML tags into the string
            if word_pos[0] == XmlUtils.TAG_PLACEHOLDER:
                pos.append(tags.pop(0))
            elif len(word_pos) < 2:
                continue
            else:
                pos.append(word_pos[1])

        iobs = []
        for w in pos:
            if self.is_self_closing_tag(w):
                iob = self.tag2iob(pos, w)
                if iob:
                    iobs.append(iob)
        return iobs
Example #13
  def _validate_pipe(self, pipe):
    match_process = {
      'regex': None,
      'posTag': None,
      'tags': TMTags()
    }

    try:
      match_process['regex'] = TMRegexMatch(self.src_lang, self.tgt_lang)
      logging.info("Loading regex for matching")
    except ValueError:
      if 'regex' in pipe:
        pipe.pop(pipe.index('regex'))
        logging.info("Unsupported regex for matching")

    query_out_tags = XmlUtils.replace_tags(self.query)

    try:
      if 'tokenizer' not in self.query_dic:
        self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
      logging.info("Loading Tokenizer for {}".format(self.src_lang))

      try:
        if 'pos' not in self.query_dic:
          self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
        match_process['posTag'] = TMPosMatch(self.src_lang, self.tgt_lang)
        logging.info("Loading regex for matching")
      except Exception as e:
        if 'posTag' in pipe:
          pipe.pop(pipe.index('posTag'))
          logging.info("Unsupported posTag for matching")
    except Exception as e:
      # Without a tokenizer, POS tagging cannot run either
      if 'posTag' in pipe:
        pipe.pop(pipe.index('posTag'))
        logging.info("Unsupported Tokenizer for {}".format(self.src_lang))

    return match_process, pipe
Example #14
    def __call__(self, s_txt, t_txt):
        # Extract source tags to be transferred: ['<X[1]>', '</X[1]>']
        print("Source text: {}".format(s_txt))
        s_tags = XmlUtils.extract_tags(s_txt)
        print("Source tags: {}".format(s_tags))
        if not s_tags: return t_txt
        # Remove any tags from the target
        t_txt = XmlUtils.strip_tags(t_txt)

        # Rename tags to avoid problems in XML parser
        # I have <X[1]>a dog</X[1]> ---> I have <T1>a dog</T1>
        s_txt_fixed = XmlUtils.simplify_tags(s_txt)
        s_tags_fixed = XmlUtils.extract_tags(s_txt_fixed)
        print("Fixed source tags: {}".format(s_tags_fixed))
        # Keep mapping of fixed tags to original tags for the final recovery:
        # tags_map = {'<T1>: '<X[1]>', '</T1>': '</X[1]>'}
        assert len(s_tags_fixed) == len(s_tags)
        tags_map = dict(zip(s_tags_fixed, s_tags))
        print("Tags map: {}".format(tags_map))

        # Run POS tagging (before, replace XML tags with a placeholder in the source text):
        # I chase <T1>a dog</T1> --> I chase ELASTICTMTAG a dog ELASTICTMTAG
        # --> I/NOUN chase/VERB ELASTICTMTAG/NOUN a/DET dog/NOUN ELASTICTMTAG/NOUN
        s_pos = self.pos_taggers[0].tag_segments(
            [XmlUtils.replace_tags(s_txt_fixed)])[0]
        t_pos = self.pos_taggers[1].tag_segments([t_txt])[0]
        # Recover fixed tags:
        # I,NOUN chase,VERB ELASTICTMTAG,NOUN a,DET dog,NOUN ELASTICTMTAG,NOUN
        # ---> NOUN VERB <T1> DET NOUN </T1>
        s_pos_with_tags, s_pos = XmlUtils.recover_tags_pos(s_pos, s_tags_fixed)
        print("S_POS_WITH_TAGS: {}, S_POS: {}, T_POS: {}".format(
            s_pos_with_tags, s_pos, t_pos))
        # For each tag (T1, T2 etc.), remove other tags and run prediction algorithm, based on IOB tags. Return value
        # is a map of tags to their correspondent indexes in target (tokenized) text
        tag2t_index = self.tags2indexes(s_tags_fixed, s_pos_with_tags, s_pos,
                                        [t[1] for t in t_pos])

        # Place tags at predicted indexes in the target text
        t_txt_with_tags = self.place_tags(s_tags_fixed, tag2t_index, tags_map,
                                          t_pos)
        if not t_txt_with_tags: return None
        # TODO: join using language-specific "joiner" (opposite of tokenizer)
        return " ".join(t_txt_with_tags)
Example #15
 def reduce_tags(str_in):
     return XmlUtils.reduce_tags(str_in)
Example #16
 def strip_tags(str_in):
     return re.sub("\s\s+", " ", XmlUtils.strip_tags(str_in))
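
XmlUtils.strip_tags is project code; a hedged stdlib equivalent of the whole helper (the tag regex is an approximation):

    import re

    def strip_tags_sketch(str_in):
        no_tags = re.sub(r'</?[^<>]+/?>', '', str_in)  # drop XML-ish tags
        return re.sub(r'\s\s+', ' ', no_tags)          # collapse runs of whitespace

    print(strip_tags_sketch('keep <b>this</b>  text'))  # keep this text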
Example #17
 def process(self, sentences):
     text = self.tm_tokenize.segment(sentences).strip('\n')
     if re.search(TOK_PATTERN, text):  # Check if the text has tags
         text = XmlUtils.join_tags(text, JOIN_PATTERN)
     return text
Example #18
 def process(self, text):
     text = ' '.join(self.tokenizer.wordpunct_tokenize(text))
     if re.search(TOK_PATTERN, text):  # Check if the text has tags
         text = XmlUtils.join_tags(text, JOIN_PATTERN)
     return text