예제 #1
0
  def execute_segment(self, segment, src_re, src_re_reduce, ini_editD, align_features, equal):
    """Run the operations listed in ``self.pipe`` against one candidate segment.

    The operations 'regex', 'tags' and 'posTag' are applied in pipe order.
    Each may rewrite the working source/target text and recompute the match
    score; the highest score seen so far is kept in ``editD``. When the final
    score is below ``self.min_match`` and 'split' is in the pipe (and no
    segment is already queued in ``self.trans_segments``), the query is split
    by sentence and, failing that, by phrase, and matched piecewise.

    Args:
      segment: object exposing ``source_text``, ``target_text``, ``source_pos``
        and ``target_pos``; its texts are overwritten in place when a
        transformation is accepted.
      src_re: regex-processed form of the segment source (computed by the
        caller; compared against ``source_text`` to detect regex matches).
      src_re_reduce: simplified form of ``src_re``.
      ini_editD: match score of the segment before any transformation.
      align_features: forwarded unchanged to ``_combine_feature_match``.
      equal: truthy to try the exact-equality shortcut first.

    Returns:
      Tuple ``(segment, editD, status, equal, status_tokenizer)``. ``status``
      is 'find' or 'break'. ``equal`` is returned as passed on an exact match
      and reset to False when a truthy ``equal`` did not yield an exact match.
      ``status_tokenizer`` is True once the texts were tokenized by the posTag
      step or a split match was accepted.
    """
    logging.info("Applied match PIPE")
    tgt_text = segment.target_text
    src_text = segment.source_text
    status = ''

    editD = ini_editD
    status_tokenizer = False
    if equal:
      if self.query == src_text:
        # Exact match: nothing to transform.
        return segment, editD, 'find', equal, status_tokenizer
      equal = False

    for op in self.pipe:  # Operations are selected by parameters
      if op == 'regex':
        if self.query_dic['query'] != self.query_dic['query_re']:  # If query has regex
          logging.info("Applied Regex")
          self.timer.start("_regx_match")
          # Compare query_re with src_re --> simplified
          match = ini_editD
          if src_re != src_text:
            if src_re_reduce.lower() == self.query_dic['query_re_reduce'].lower():  # With simplified regular expression and in lowercase
              match = 100  # Perfect match
            tgt_text, src_text = self._regex_transform(segment.source_text, segment.target_text)
            ini_editD = self._tm_edit_distance(self.query_dic['query'], src_text, self.query_dic['query_re_reduce'], src_re_reduce)
            logging.info("After applied Regex Segment: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
          if match == 100:
            status = 'find'
          self.timer.stop("_regx_match")
      if op == 'tags':
        logging.info("Delete Tags")
        self.timer.start("_tags_match")
        # The fourth value returned by _match_tags is not used here.
        src_text, tgt_text, status, _reduce, ini_editD = self._match_tags(src_text, src_re_reduce, tgt_text, status, ini_editD)
        logging.info("After applied Tags: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
        self.timer.stop("_tags_match")

      if op == 'posTag':
        self.timer.start("fuzzy_match")
        upper = False
        if segment.source_pos is not None and segment.target_pos is not None:  # This part needs the POS tagger annotation
          squery, tok_query, pos_query = self.check_query_parameters()
          logging.info("Apply posTag matching")
          self.timer.start("fuzzy_preprocess")
          if not status_tokenizer:  # Tokenize source and target only once
            tgt_text = TMUtilsMatching.pre_process(tgt_text, self.tgt_lang, 'tokenizer', {})  # Pre-process tgt
            src_text = TMUtilsMatching.pre_process(src_text, self.src_lang, 'tokenizer', {})  # Tokenize tm_src
            self.query_dic['query_re_reduce_tok'] = TMUtilsMatching.pre_process(self.query_dic['query_re_reduce'], self.src_lang, 'tokenizer', {})  # Tokenize the simplified query
            status_tokenizer = True

          # The query's [word, pos] form is computed once and cached in query_dic.
          if 'universal' not in self.query_dic:
            self.query_dic['universal'] = TMUtilsMatching.segment_2_universal(tok_query.lower(), pos_query, self.src_lang)
          src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), segment.source_pos, self.src_lang)  # [word, pos] tm_src segment
          tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), segment.target_pos, self.tgt_lang)  # [word, pos] tm_tgt segment

          self.timer.stop("fuzzy_preprocess")
          if isinstance(self.query_dic['universal'], list) and isinstance(src_word_pos, list) and isinstance(tgt_word_pos, list):

            logging.info("Check unmatch word --> PosTag")
            if TMUtilsMatching.len_compare(pos_query.split(' '), segment.source_pos.split(' ')) is True and (tok_query != src_text):
              logging.info("Query and source have same length or only one difference")

              self.timer.start("search unmatch")
              tgt_un_match, tgt_position, operation, src_un_match, src_position, pos_tag = self._combine_feature_match(tok_query, src_word_pos, tgt_word_pos, align_features)
              self.timer.stop("search unmatch")
              # Three placeholders so the current score is actually written to the log.
              logging.info("Unmatch word and operation: {} {} {}".format(safe_str(src_un_match), safe_str(operation), safe_str(ini_editD)))
              self.timer.start("create target unmatch")

              if src_un_match is not None:
                # Create new src
                src_text, upper = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source', upper, pos_tag)
                # Improve edit distance
                src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
                src_re_reduce = TMRegexMatch.simplified_name(src_re)
                penalize_match = self._improve_match(src_un_match, operation)
                ini_editD = self._tm_edit_distance(tok_query.lower(), src_text.lower(), self.query_dic['query_re_reduce_tok'].lower(), src_re_reduce.lower()) - penalize_match
              if tgt_un_match is not None:
                # Create new tgt
                tgt_text, upper = self._create_target_expression(tgt_text, tgt_position, operation, tgt_un_match, 'target', upper, pos_tag)
              self.timer.stop("create target unmatch")
              # Three placeholders so the current score is actually written to the log.
              logging.info("After applied posTag: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
        self.timer.stop("fuzzy_match")

      # Keep the best score so far and stop early if an operation settled the status.
      if ini_editD > editD:
        editD = ini_editD
      if status == 'find' or status == 'break':
        segment.source_text = src_text
        segment.target_text = tgt_text
        return segment, editD, status, equal, status_tokenizer

    if editD >= self.min_match:
      segment.source_text = src_text
      segment.target_text = tgt_text
      status = 'find'
    else:
      # Call split rules
      if 'split' in self.pipe and not self.trans_segments:

        src_text = None
        tgt_text = None
        editSplit = 0

        # Split by sentences.
        list_sentences = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'split_sentences', {})
        logging.info("split by Sentences : {} ".format(list_sentences))

        # Check sentences first
        if len(list_sentences) > 1:
          split_match = TMSplitMatch([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in list_sentences], [], self.src_lang, self.tgt_lang, 'sentence', self.machine_translation, self.domain)
          src_text, tgt_text, editSplit = split_match._match()

        if editSplit >= self.min_match:  # The sentence split returned a usable match
          segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit

        else:  # Split into smaller phrases
          # Check whether split rules exist for this specific language pair
          lang_class = G_CONFIG.get_split_rules(self.src_lang, self.tgt_lang)

          if lang_class:
            logging.info("Split Query by Phrase")
            all_split, all_marks = self._splitByPhrase(lang_class, list_sentences)

            # Check if any split rule was applied
            if len(all_split) > 1:
              split_match = TMSplitMatch(all_split, all_marks, self.src_lang, self.tgt_lang, 'phrase', self.machine_translation, self.domain)
              src_text, tgt_text, editSplit = split_match._match()

              if editSplit >= self.min_match:  # The phrase split returned a usable match
                segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit

      if editD >= self.min_match:
        status = 'find'
        status_tokenizer = True
      else:
        if not self.trans_segments:
          # No match found: queue the first such segment for automatic translation.
          self.trans_segments.append((segment, editD))
        status = 'break'  # Stops the caller's processing for this segment
    return segment, editD, status, equal, status_tokenizer
예제 #2
0
                print(' '.join(map(str, split_source[j])))
                #+ '\t' +
                print(' '.join(map(str, split_target[value_target[j]])))
                print('\n')
    print('Generated segments ' + str(count))


# Script entry point: reads the input file named on the command line and
# prints the tokenizer output for each of its lines.
if __name__ == "__main__":

    args = parse_args()

    lang = args.source

    # NOTE(review): no encoding is passed and the handle is not closed in the
    # visible code; `file` also shadows the Python 2 builtin of the same name.
    file = codecs.open(args.file, 'r')

    # Split rules are looked up for the (source, target) language pair; the
    # rest of the script only runs when a value is returned for that pair.
    lang_class = G_CONFIG.get_split_rules(args.source, args.target)
    #print(lang_class)
    if lang_class:
        #print('########Call Split########')
        #src_text, tgt_text, editSplit = split_sentences(lang_class)

        #Split
        # NOTE(review): splitTask is not used in the visible part of this block.
        splitTask = TMSplit(lang_class, args.source)

        # Tokenizer
        tok = TMTokenizer(lang.upper()).tokenizer
        # NOTE(review): pos is not used in the visible part of this block.
        pos = TMPosTagger(lang.upper())

        # readlines() loads the whole file into memory before iterating.
        for eline in file.readlines():
            tok_sentences = tok.process(eline)
            print(tok_sentences)