Example #1
    def tokenize(self, **kwargs):
        source = self.filter.filter(kwargs["q"])
        target = self.filter.filter(kwargs["t"])

        # pre-processing of source and target
        source_tmp = self.external_processors.tokenize(source)
        source_tmp2 = self.external_processors.truecase(source_tmp)
        source_tok = self.external_processors.prepro(source_tmp2)

        target_tmp = self.tgt_external_processors.tokenize(target)
        target_tmp2 = self.tgt_external_processors.truecase(target_tmp)
        target_tok = self.tgt_external_processors.prepro(target_tmp2)

        # get tokenized spans
        tracker = tokentracker.TokenTracker()
        tmp = tracker.tokenize(source_tok)
        source_spans = tracker.track_detok(source_tok, source, tmp)
        tmp = tracker.tokenize(target_tok)
        target_spans = tracker.track_detok(target_tok, target, tmp)

        align_data = {
            'sourceText': source,
            'targetText': target,
            'tokenization': {
                'src': source_spans,
                'tgt': target_spans
            }
        }
        data = {"data": align_data}
        return self._dump_json(data)
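
For orientation, this is roughly the payload shape the handler above produces; the example strings and span values below are hypothetical, and the exact offsets depend on the configured tokenizer.

    # Hypothetical response for q="Hello, world!" and t="Hallo, Welt!";
    # each span is a (start, end) character range of one token inside the
    # original, untokenized string.
    {
        "data": {
            "sourceText": "Hello, world!",
            "targetText": "Hallo, Welt!",
            "tokenization": {
                "src": [[0, 5], [5, 6], [7, 12], [12, 13]],
                "tgt": [[0, 5], [5, 6], [7, 11], [11, 12]]
            }
        }
    }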
Example #2
    def align(self, **kwargs):
        response = cherrypy.response
        response.headers['Content-Type'] = 'application/json'

        if self.symal is None:
            message = "need bidirectional aligner for updates"
            return {"error": {"code": 400, "message": message}}

        source = self.filter.filter(kwargs["q"])
        target = self.filter.filter(kwargs["t"])

        # pre-processing of source and target
        source_tmp = self.external_processors.tokenize(source)
        source_tmp2 = self.external_processors.truecase(source_tmp)
        source_tok = self.external_processors.prepro(source_tmp2)

        target_tmp = self.tgt_external_processors.tokenize(target)
        target_tmp2 = self.tgt_external_processors.truecase(target_tmp)
        target_tok = self.tgt_external_processors.prepro(target_tmp2)

        # word alignment
        mode = 's2t'
        alignment = []
        if mode == 's2t':
            alignment = self.symal.align_s2t(source_tok, target_tok)
        elif mode == 't2s':
            alignment = self.symal.align_t2s(source_tok, target_tok)
        elif mode == 'sym':
            alignment = self.symal.symal(source_tok, target_tok)
        else:
            message = "unknow alignment mode %s" % mode
            return {"error": {"code": 400, "message": message}}
        alignment = [
            point for point in alignment if point[0] != -1 and point[1] != -1
        ]

        # get tokenized spans
        tracker = tokentracker.TokenTracker()
        tmp = tracker.tokenize(source_tok)
        source_spans = tracker.track_detok(source_tok, source, tmp)
        tmp = tracker.tokenize(target_tok)
        target_spans = tracker.track_detok(target_tok, target, tmp)

        align_data = {
            'sourceText': source,
            'targetText': target,
            'alignment': alignment,
            'tokenization': {
                'src': source_spans,
                'tgt': target_spans
            }
        }
        data = {"data": align_data}
        return self._dump_json(data)
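
The list comprehension near the end discards unaligned points, which aligners commonly report with -1 on one side. A self-contained illustration of just that filtering step (the alignment values are made up):

    # Drop alignment points that have -1 on either side, i.e. tokens the
    # aligner could not link to a counterpart.
    raw_alignment = [(0, 0), (1, -1), (2, 1), (-1, 2)]
    alignment = [p for p in raw_alignment if p[0] != -1 and p[1] != -1]
    print(alignment)  # [(0, 0), (2, 1)]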
Example #3
    def _getTranslation(self, hyp):
        """ does all the extraction and postprocessing, returns dict including
            translatedText, spans, and, if available, phraseAlignment,
            and WordAlignment
        """
        translationDict = {}
        translation = hyp.strip()

        self.log_info("Translation before extraction: %s" % translation)
        translation = self.external_processors.extract(translation)
        self.log_info("Translation after extraction: %s" % translation)

        translation, phraseAlignment = self._getPhraseAlignment(translation)
        self.log_info("Phrase alignment: %s" % str(phraseAlignment))
        self.log_info("Translation after removing phrase-alignment: %s" %
                      translation)

        translation, wordAlignment = self._getWordAlignment(translation)
        self.log_info("Word alignment: %s" % str(wordAlignment))
        self.log_info("Translation after removing word-alignment: %s" %
                      translation)

        translation = self._getOnlyTranslation(translation).strip()
        self.log_info("Translation after removing additional info: %s" %
                      translation)

        self.log_info("Translation before postprocessing: %s" % translation)
        tt = tokentracker.TokenTracker()
        raw_translation = translation
        spans = tt.tokenize(raw_translation)
        translation = self.external_processors.postpro(translation)
        spans = tt.track_detok(raw_translation, translation, spans)
        raw_translation = translation
        translation = self.external_processors.detruecase(translation)
        spans = tt.track_detok(raw_translation, translation, spans)
        raw_translation = translation
        translation = self.external_processors.detokenize(translation)
        spans = tt.track_detok(raw_translation, translation, spans)
        if not "tokenization" in translationDict:
            translationDict["tokenization"] = {}
        translationDict["tokenization"].update({'tgt': spans})

        self.log_info("Translation after postprocessing: %s" % translation)

        translationDict["translatedText"] = translation or ''
        if phraseAlignment:
            translationDict["phraseAlignment"] = phraseAlignment
        if wordAlignment:
            translationDict["wordAlignment"] = wordAlignment

        return translationDict
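
The postprocessing block threads one spans list through every rewriting step, so the spans stored under 'tgt' index into the final, detokenized string. A minimal sketch of that pattern, assuming the tokentracker module from these examples is importable and using the positional track_detok(old, new, spans) signature seen above:

    import tokentracker

    tt = tokentracker.TokenTracker()
    text = "hello world"
    spans = tt.tokenize(text)        # initial token spans on `text`
    rewritten = "Hello world"        # stand-in for one rewriting step
    spans = tt.track_detok(text, rewritten, spans)
    # after each step, `spans` refers to the newest version of the string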
Example #4
    def _track_postprocessing(self, sentence, verbose=False):
        processors = self.external_processors
        tracker = tokentracker.TokenTracker()
        sentence_detruecased = processors.detruecase(sentence)
        sentence_detokenized = processors.detokenize(sentence_detruecased)
        sentence_postprocessed = processors.postpro(sentence_detokenized)
        spans = tracker.track_detok(sentence,
                                    sentence_detruecased,
                                    verbose=verbose)
        spans = tracker.track_detok(sentence_detruecased,
                                    sentence_detokenized,
                                    spans=spans,
                                    verbose=verbose,
                                    check_escape=True)
        spans = tracker.track_detok(sentence_detokenized,
                                    sentence_postprocessed,
                                    spans=spans,
                                    verbose=verbose)
        return sentence_postprocessed, spans
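
Hypothetical usage of the helper above, assuming `server` is an instance with its external processors configured; the spans returned alongside the postprocessed sentence locate each token of the tokenized input inside the final string:

    postprocessed, spans = server._track_postprocessing("das ist ein Test .")
    # `spans` was threaded through detruecasing, detokenization and
    # postprocessing, so spans[i] is the character range of input token i
    # within `postprocessed`.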
Example #5
    def _track_preprocessing(self, sentence, is_source, verbose=False):
        processors = (self.external_processors
                      if is_source else self.tgt_external_processors)
        sentence_preprocessed = processors.prepro(sentence)
        sentence_tokenized = processors.tokenize(sentence_preprocessed)
        sentence_truecased = processors.truecase(sentence_tokenized)
        tracker = tokentracker.TokenTracker()
        # the tracker is applied in the opposite direction, as the final
        # spans refer to the original input
        spans = tracker.track_detok(sentence_truecased,
                                    sentence_tokenized,
                                    verbose=verbose)
        spans = tracker.track_detok(sentence_tokenized,
                                    sentence_preprocessed,
                                    spans=spans,
                                    verbose=verbose,
                                    check_escape=True)
        spans = tracker.track_detok(sentence_preprocessed,
                                    sentence,
                                    spans=spans,
                                    verbose=verbose)
        return sentence_truecased, spans
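
Note the asymmetry with _track_postprocessing: here the tracker runs against the rewriting direction (truecased back to raw), so the spans returned with the fully preprocessed sentence still index into the caller's original input. A hypothetical call:

    preprocessed, spans = server._track_preprocessing(raw_input, is_source=True)
    # spans[i] is the character range of token i of `preprocessed`
    # inside `raw_input`.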
Example #6
    def translate(self, **kwargs):
        response = cherrypy.response
        response.headers['Content-Type'] = 'application/json'

        errors = self._check_params(kwargs)
        if errors:
            cherrypy.response.status = 400
            return self._dump_json(errors)

        q = self.filter.filter(kwargs["q"])
        raw_src = q
        self.log("The server is working on: %s" % repr(q))
        self.log_info("Request before preprocessing: %s" % repr(q))
        translationDict = {"sourceText": q.strip()}
        q = self.external_processors.tokenize(q)
        q = self.external_processors.truecase(q)
        q = self.external_processors.prepro(q)

        self.log_info("Request after preprocessing: %s" % repr(q))
        preprocessed_src = q
        tt = tokentracker.TokenTracker()
        src_spans = tt.tokenize(preprocessed_src)
        src_spans = tt.track_detok(preprocessed_src, raw_src, src_spans)

        self.log_info("Request before annotation: %s" % repr(q))
        q = self.external_processors.annotate(q)
        not_annotated_src = self._getOnlyTranslation(q)
        assert len(preprocessed_src.split()) == len(not_annotated_src.split()), \
            "annotation should not change the number of tokens"

        self.log_info("Request after annotation (q): %s" % repr(q))

        translation = ''
        report_search_graph = 'sg' in kwargs
        report_translation_options = 'topt' in kwargs
        report_alignment = 'align' in kwargs

        # how many (if any) entries do we need in the n-best list?
        nbest = 0
        if 'nbest' in kwargs:
            nbest = max(nbest, int(kwargs['nbest']))
        if 'wpp' in kwargs:
            nbest = max(nbest, int(kwargs['wpp']))

        result = self._translate(q,
                                 sg=report_search_graph,
                                 topt=report_translation_options,
                                 align=report_alignment,
                                 nbest=nbest)
        if 'text' in result:
            translation = result['text']
        else:
            return self._timeout_error(q, 'translation')
        translationDict.update(self._getTranslation(translation))
        translationDict["tokenization"].update({'src': src_spans})

        if 'sg' in result:
            if self.prune_sg:
                translationDict['searchGraph'] = self.prune_search_graph(
                    result['sg'])
            else:
                translationDict['searchGraph'] = result['sg']
        if 'topt' in result:
            translationDict['topt'] = result['topt']
        if 'align' in result:
            translationDict['alignment'] = result['align']
        if 'nbest' in result:
            if 'nbest' in kwargs:
                n = int(kwargs['nbest'])
                n = min(n, len(result['nbest']))
                translationDict['raw_nbest'] = result['nbest'][:n]
                translationDict['nbest'] = []
                for nbest_result in result['nbest'][:n]:
                    hyp = nbest_result['hyp']
                    translationDict['nbest'].append(self._getTranslation(hyp))
            if 'wpp' in kwargs:
                buff = []
                n = int(kwargs['wpp'])
                n = min(n, len(result['nbest']))
                for nbest_result in result['nbest'][:n]:
                    hyp = nbest_result['hyp']
                    score = nbest_result['totalScore']
                    buff.append([0, hyp, score])
                word_posterior = wpp.WPP(align=True)
                probs = word_posterior.process_buff(buff, translation)
                translationDict['wpp'] = list(map(math.exp, probs))
                translationDict['wpp_score'] = geometric_mean(probs)

        data = {"data": {"translations": [translationDict]}}
        self.log("The server is returning:")
        return self._dump_json(data)
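
To tie the parameter handling together, here is a hypothetical request against this handler (the /translate mount point is an assumption; the parameter names are the ones read above):

    # GET /translate?q=hello+world&nbest=5&wpp=10&align=1
    #
    # The decoder is asked for max(5, 10) = 10 n-best entries. The response
    # then carries the 5 best postprocessed hypotheses under 'nbest', word
    # posterior probabilities computed over all 10 entries under 'wpp', and
    # word alignment under 'alignment' if the decoder reports it, because
    # 'align' is present in the request.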