예제 #1
0
 def _regx_match(self, query_dic, src_text, tgt_text):  #, src_pos, tgt_pos
     logging.info("Applied Regex")
     fuzzy_alg = TMRegexMatch(
         self.src_lang, self.tgt_lang
     )  # Class to improve fuzzy match #, self.best_segments
     return fuzzy_alg.process(query_dic, src_text,
                              tgt_text)  #, src_pos, tgt_pos
예제 #2
0
  def token_count(self, text, lang):

    lang = lang.split('-')[0].upper()
    if not lang in self.regex:
      try:
        self.regex[lang] = TMRegExpPreprocessor(lang)
        logging.info("Loading Regex for {}".format(lang))
      except Exception as e:
        logging.info("Unsupported Regex for {} ".format(lang))
        self.regex[lang] = lang
    if not lang in self.tokenizers:
        try:
          self.tokenizers[lang] = TMTokenizer(lang)
          logging.info("Loading Tokenizer for {}".format(lang))
        except Exception as e:
          self.tokenizers[lang] = lang
          logging.info("Unsupported Tokenizer for {}".format(lang))

    if self.regex[lang] != lang: text = TMRegexMatch.simplified_name(self.regex[lang].process(text))
    if self.tokenizers[lang] != lang: token_cnt = len((self.tokenizers[lang].tokenizer.process(text)).split(' '))
    else:
      if ' ' in text: token_cnt = len(text.split(' '))
      else: token_cnt = 1

    return token_cnt#len((self.tokenizers[lang].tokenizer.process(TMRegexMatch.simplified_name(self.regex[lang].process(text)))).split(' '))
예제 #3
0
    def _preprocess(self, text, lang):

        dic_query = {}
        s_tags = XmlUtils.extract_tags(text)
        if not s_tags:
            dic_query['query'] = text
        else:
            dic_query['query'] = XmlUtils.strip_tags(
                text)  # split tag to do the match

        dic_query['tokenizer'] = TMUtilsMatching.pre_process(
            dic_query['query'], self.src_lang, 'tokenizer', {})
        dic_query['pos'] = TMUtilsMatching.pre_process(dic_query['tokenizer'],
                                                       lang, 'pos_tagger', {})
        dic_query['universal'] = TMUtilsMatching.segment_2_universal(
            dic_query['tokenizer'].lower(), dic_query['pos'],
            lang)  # universal_text[0]
        dic_query['universal'] = dic_query['pos']

        regex_class = TMRegexMatch(
            self.src_lang, self.tgt_lang)  # Class to improve fuzzy match
        dic_query['query_re'] = TMUtilsMatching.pre_process(
            dic_query['tokenizer'], self.src_lang, 'reg_exp',
            regex_class.re_pp)
        return dic_query
예제 #4
0
  def _preprocess(self):
    self.query_dic['query'] = self.query

    if re.search("<.*>", self.query):  # Uniform tags --> # Yo tengo un <b>gato</b>. --> Yo tengo un <T1>gato</T1>
      self.query_dic['query_tags'] = TMUtilsMatching.pre_process(self.query, (self.src_lang, self.tgt_lang), 'tags', {})

      self.query_dic['query'] = self.query_dic['query_tags'] # query now have the tags <T1>gato</T1>

    if 'regex' in self.pipe: self.query_dic['query_re'] = TMUtilsMatching.pre_process(self.query_dic['query'], self.src_lang, 'reg_exp', self.match['regex'].re_pp)
    else: self.query_dic['query_re'] = self.query_dic['query']
    self.query_dic['query_re_reduce'] = TMRegexMatch.simplified_name(self.query_dic['query_re'])

    return self.query_dic
예제 #5
0
  def _validate_pipe(self, pipe):
    match_process = {
      'regex': None,
      'posTag': None,
      'tags': TMTags()
    }

    try:
      match_process['regex'] = TMRegexMatch(self.src_lang, self.tgt_lang)
      logging.info("Loading regex for matching")
    except ValueError:
      if 'regex' in pipe:
        pipe.pop(pipe.index('regex'))
        logging.info("Unsupported regex for matching")

    query_out_tags = XmlUtils.replace_tags(self.query)

    try:
      if 'tokenizer' not in self.query_dic:
        self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
      logging.info("Loading Tokenizer for {}".format(self.src_lang))

      try:
        if 'pos' not in self.query_dic:
          self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
        match_process['posTag'] = TMPosMatch(self.src_lang, self.tgt_lang)
        logging.info("Loading regex for matching")
      except Exception as e:
        if 'posTag' in pipe:
          pipe.pop(pipe.index('posTag'))
          logging.info("Unsupported posTag for matching")
    except Exception as e:
      if 'posTag' in pipe:
        pipe.pop(pipe.index('posTag'))
        logging.info("Unsupported Tokenizer for {}".format(self.src_lang))

    return match_process, pipe
예제 #6
0
  def _match_rank(self, best_segments):
    self.timer.start("rank segments")
    editD_score = []
    if 'query_tags' in self.query_dic: # Simplified tags
      query = TMUtilsMatching.reduce_tags(self.query_dic['query_tags']) # Yo tengo un <T1>gato</T1>. Yo tengo un T gato T.
    else:
      query = self.query_dic['query']

    for i in range(0, len(best_segments)):
      segment = best_segments[i]
      # Simplified tags in tm source
      if re.search("</?T[0-9]*/?>", segment[0].source_text):
        src_text = TMUtilsMatching.reduce_tags(segment[0].source_text) # Simplified tags in tm source and target
      else: src_text = segment[0].source_text
      # Applied Regex and simplified
      if 'regex' in self.pipe: src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
      else: src_re = src_text


      src_re_reduce = TMRegexMatch.simplified_name(src_re)
      best_segments[i] = (segment[0], segment[1], src_re, src_re_reduce)
      editD_score.append(self._tm_edit_distance(query, src_text, self.query_dic['query_re_reduce'], src_re_reduce))  # EditD with tags simplied TMUtilsMatching._edit_distance(query, src_text)
    self.timer.stop("rank segments")
    return sorted(zip(best_segments, editD_score), key=operator.itemgetter(1), reverse=True)
예제 #7
0
  def execute_segment(self, segment, src_re, src_re_reduce, ini_editD, align_features, equal):
    logging.info("Applied match PIPE")
    tgt_text = segment.target_text
    src_text = segment.source_text
    status = ''

    editD = ini_editD
    status_tokenizer = False
    if equal:
      if self.query == src_text:
        return segment, editD, 'find', equal, status_tokenizer
      else:
        equal = False
    if not equal:
      for op in self.pipe: #Indicate by parameters
        if op == 'regex':
          if self.query_dic['query'] != self.query_dic['query_re']: # If query has regex   #and not TMMatching.check_upper_equal(self.query_dic['query'], self.query_dic['query_re'])
              logging.info("Applied Regex")
              self.timer.start("_regx_match")
              # ************************** Compare query_re with src_re --> simplified
              match = ini_editD
              if src_re != src_text:
                if src_re_reduce.lower() == self.query_dic['query_re_reduce'].lower():  # With simplified regular expression and in lowercase
                  match = 100  # Perfect match
                tgt_text, src_text = self._regex_transform(segment.source_text, segment.target_text)
                ini_editD = self._tm_edit_distance(self.query_dic['query'],src_text, self.query_dic['query_re_reduce'], src_re_reduce) #match
                logging.info("After applied Regex Segment: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
              if match == 100:
                status = 'find'
              self.timer.stop("_regx_match")
        if op == 'tags':
          logging.info("Delete Tags")
          self.timer.start("_tags_match")
          src_text, tgt_text, status, reduce, ini_editD = self._match_tags(src_text, src_re_reduce, tgt_text, status, ini_editD)
          logging.info("After applied Tags: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
          self.timer.stop("_tags_match")

        if op == 'posTag':
          self.timer.start("fuzzy_match")
          upper = False
          if segment.source_pos is not None and segment.target_pos is not None:  # This part need the pos tagger annotation
            squery, tok_query, pos_query = self.check_query_parameters()
            logging.info("Apply posTag matching")
            self.timer.start("fuzzy_preprocess")
            if status_tokenizer == False:  # Tokenize source and target
              tgt_text = TMUtilsMatching.pre_process(tgt_text, self.tgt_lang, 'tokenizer', {})  # Pre-process tgt
              src_text = TMUtilsMatching.pre_process(src_text, self.src_lang, 'tokenizer', {})  # Tokenize tm_src
              self.query_dic['query_re_reduce_tok'] = TMUtilsMatching.pre_process(self.query_dic['query_re_reduce'], self.src_lang, 'tokenizer', {})  # Tokenize the simplified query
              status_tokenizer = True

            if 'universal' not in self.query_dic:
              self.query_dic['universal'] = TMUtilsMatching.segment_2_universal(tok_query.lower(), pos_query, self.src_lang)
            #print(self.query_dic['universal'])
            src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), segment.source_pos, self.src_lang)  # [word, pos] tm_src segment
            tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), segment.target_pos, self.tgt_lang)  # [word, pos] tm_tgt segment

            self.timer.stop("fuzzy_preprocess")
            if isinstance(self.query_dic['universal'], list) and isinstance(src_word_pos, list) and isinstance(tgt_word_pos, list):

              logging.info("Check unmatch word --> PosTag")
              if TMUtilsMatching.len_compare(pos_query.split(' '), segment.source_pos.split(' ')) is True and (tok_query != src_text):
                logging.info("Query and source have same length or only one difference")

                self.timer.start("search unmatch")
                tgt_un_match, tgt_position, operation, src_un_match, src_position, pos_tag = self._combine_feature_match(tok_query, src_word_pos, tgt_word_pos, align_features)
                self.timer.stop("search unmatch")
                logging.info("Unmatch word and operation: {} {}".format(safe_str(src_un_match), safe_str(operation), safe_str(ini_editD)))
                self.timer.start("create target unmatch")

                if src_un_match is not None:
                  # Create new src
                  src_text, upper = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source', upper, pos_tag)
                  # Improve edit distance
                  src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
                  src_re_reduce = TMRegexMatch.simplified_name(src_re)
                  penalize_match = self._improve_match(src_un_match, operation)
                  ini_editD = self._tm_edit_distance(tok_query.lower(), src_text.lower(), self.query_dic['query_re_reduce_tok'].lower(), src_re_reduce.lower()) - penalize_match  # match
                  # Create new tgt
                if tgt_un_match is not None:
                  tgt_text, upper = self._create_target_expression(tgt_text, tgt_position, operation, tgt_un_match, 'target', upper, pos_tag)  # tgt_word,
                self.timer.stop("create target unmatch")
                logging.info("After applied posTag: {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
          self.timer.stop("fuzzy_match")

        # Check if find or break some transformation
        if ini_editD > editD:
          editD = ini_editD
        if status == 'find' or status == 'break':
          segment.source_text = src_text
          segment.target_text = tgt_text
          return segment, editD, status, equal, status_tokenizer
      if editD >= self.min_match:
        segment.source_text = src_text
        segment.target_text = tgt_text
        status = 'find'
      else:
        #Call split rules
        if 'split' in self.pipe and not self.trans_segments: # Applied split if exist posTagger for source language  and self.query_dic['pos']

          src_text = None
          tgt_text = None
          editSplit = 0

          # Split by sentences.
          list_sentences = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'split_sentences', {})
          logging.info("split by Sentences : {} ".format(list_sentences))

          # Check sentence first
          if len(list_sentences) > 1:

            split_match = TMSplitMatch([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in list_sentences], [], self.src_lang, self.tgt_lang, 'sentence', self.machine_translation, self.domain)
            src_text, tgt_text, editSplit = split_match._match()
            #print('*****Only sentences *****')
            #print(src_text)
            #print(tgt_text)
            #print(editSplit)

          if editSplit >= self.min_match:  # Check if split method return segments from ActivaTM
            segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit

          else: # Split in small phrase
            # Check if exist split for an especific pairs of languages
            lang_class = G_CONFIG.get_split_rules(self.src_lang, self.tgt_lang)

            if lang_class:
              logging.info("Split Query by Phrase")
              all_split, all_marks = self._splitByPhrase(lang_class, list_sentences)

              # Check if any split rule was applied
              if len(all_split) > 1:
                  # print(list_query_split)
                split_match = TMSplitMatch(all_split, all_marks, self.src_lang, self.tgt_lang, 'phrase', self.machine_translation, self.domain)
                src_text, tgt_text, editSplit = split_match._match()

                if editSplit >= self.min_match: #Check if split method return segments from ActivaTM
                  segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit

        if editD >= self.min_match:
          status = 'find'
          status_tokenizer = True
        else:
          if not self.trans_segments:  #If doesn't found any match, prepare segment to automatic translation. If there aren't automatic translation, then return []
            #logging.info("Prepare Automatic Translation : ")
            self.trans_segments.append((segment, editD))
          status = 'break' # If exist segment on the list, break the for and there aren't translation
    return segment, editD, status, equal, status_tokenizer