def tokenize(self, **kwargs):
    source = self.filter.filter(kwargs["q"])
    target = self.filter.filter(kwargs["t"])

    # pre-processing of source and target
    source_tmp = self.external_processors.tokenize(source)
    source_tmp2 = self.external_processors.truecase(source_tmp)
    source_tok = self.external_processors.prepro(source_tmp2)

    target_tmp = self.tgt_external_processors.tokenize(target)
    target_tmp2 = self.tgt_external_processors.truecase(target_tmp)
    target_tok = self.tgt_external_processors.prepro(target_tmp2)

    # get tokenized spans
    tracker = tokentracker.TokenTracker()
    tmp = tracker.tokenize(source_tok)
    source_spans = tracker.track_detok(source_tok, source, tmp)
    tmp = tracker.tokenize(target_tok)
    target_spans = tracker.track_detok(target_tok, target, tmp)

    align_data = {'sourceText': source,
                  'targetText': target,
                  'tokenization': {'src': source_spans,
                                   'tgt': target_spans}}
    data = {"data": align_data}
    return self._dump_json(data)
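# Response shape sketch for tokenize() (illustrative values; the exact span
# format depends on tokentracker.TokenTracker, assumed here to yield
# [start, end] character offsets into the raw input strings):
#
#   {"data": {"sourceText": "Hello world.",
#             "targetText": "Hallo Welt.",
#             "tokenization": {"src": [[0, 5], [6, 11], [11, 12]],
#                              "tgt": [[0, 5], [6, 10], [10, 11]]}}}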
def align(self, **kwargs):
    response = cherrypy.response
    response.headers['Content-Type'] = 'application/json'

    if self.symal is None:
        message = "need bidirectional aligner for updates"
        return {"error": {"code": 400, "message": message}}

    source = self.filter.filter(kwargs["q"])
    target = self.filter.filter(kwargs["t"])

    # pre-processing of source and target
    source_tmp = self.external_processors.tokenize(source)
    source_tmp2 = self.external_processors.truecase(source_tmp)
    source_tok = self.external_processors.prepro(source_tmp2)

    target_tmp = self.tgt_external_processors.tokenize(target)
    target_tmp2 = self.tgt_external_processors.truecase(target_tmp)
    target_tok = self.tgt_external_processors.prepro(target_tmp2)

    # word alignment
    mode = 's2t'
    if mode == 's2t':
        alignment = self.symal.align_s2t(source_tok, target_tok)
    elif mode == 't2s':
        alignment = self.symal.align_t2s(source_tok, target_tok)
    elif mode == 'sym':
        alignment = self.symal.symal(source_tok, target_tok)
    else:
        message = "unknown alignment mode %s" % mode
        return {"error": {"code": 400, "message": message}}

    # drop unaligned points (marked with index -1)
    alignment = [point for point in alignment
                 if point[0] != -1 and point[1] != -1]

    # get tokenized spans
    tracker = tokentracker.TokenTracker()
    tmp = tracker.tokenize(source_tok)
    source_spans = tracker.track_detok(source_tok, source, tmp)
    tmp = tracker.tokenize(target_tok)
    target_spans = tracker.track_detok(target_tok, target, tmp)

    align_data = {'sourceText': source,
                  'targetText': target,
                  'alignment': alignment,
                  'tokenization': {'src': source_spans,
                                   'tgt': target_spans}}
    data = {"data": align_data}
    return self._dump_json(data)
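# Response sketch for align() (illustrative): 'alignment' is a list of
# [source_token_index, target_token_index] pairs after unaligned points
# (index -1) have been dropped, e.g.
#
#   {"data": {"sourceText": "Hello world.",
#             "targetText": "Hallo Welt.",
#             "alignment": [[0, 0], [1, 1], [2, 2]],
#             "tokenization": {"src": [...], "tgt": [...]}}}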
def _getTranslation(self, hyp):
    """Does all the extraction and postprocessing; returns a dict
    including translatedText, spans, and, if available,
    phraseAlignment and wordAlignment."""
    translationDict = {}
    translation = hyp.strip()
    self.log_info("Translation before extraction: %s" % translation)
    translation = self.external_processors.extract(translation)
    self.log_info("Translation after extraction: %s" % translation)

    translation, phraseAlignment = self._getPhraseAlignment(translation)
    self.log_info("Phrase alignment: %s" % str(phraseAlignment))
    self.log_info("Translation after removing phrase-alignment: %s" % translation)

    translation, wordAlignment = self._getWordAlignment(translation)
    self.log_info("Word alignment: %s" % str(wordAlignment))
    self.log_info("Translation after removing word-alignment: %s" % translation)

    translation = self._getOnlyTranslation(translation).strip()
    self.log_info("Translation after removing additional info: %s" % translation)
    self.log_info("Translation before postprocessing: %s" % translation)

    # postprocess step by step, tracking token spans through each stage
    tt = tokentracker.TokenTracker()
    raw_translation = translation
    spans = tt.tokenize(raw_translation)

    translation = self.external_processors.postpro(translation)
    spans = tt.track_detok(raw_translation, translation, spans)
    raw_translation = translation

    translation = self.external_processors.detruecase(translation)
    spans = tt.track_detok(raw_translation, translation, spans)
    raw_translation = translation

    translation = self.external_processors.detokenize(translation)
    spans = tt.track_detok(raw_translation, translation, spans)

    if "tokenization" not in translationDict:
        translationDict["tokenization"] = {}
    translationDict["tokenization"].update({'tgt': spans})
    self.log_info("Translation after postprocessing: %s" % translation)

    if translation:
        translationDict["translatedText"] = translation
    else:
        translationDict["translatedText"] = ''
    if phraseAlignment:
        translationDict["phraseAlignment"] = phraseAlignment
    if wordAlignment:
        translationDict["wordAlignment"] = wordAlignment
    return translationDict
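# Sketch of a dict returned by _getTranslation() (illustrative values;
# the alignment keys appear only when the decoder reported them):
#
#   {"translatedText": "Hallo Welt.",
#    "tokenization": {"tgt": [[0, 5], [6, 10], [10, 11]]},
#    "phraseAlignment": [...],
#    "wordAlignment": [...]}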
def _track_postprocessing(self, sentence, verbose=False):
    """Detruecase, detokenize and postprocess `sentence`, tracking
    token spans through every step."""
    processors = self.external_processors
    tracker = tokentracker.TokenTracker()

    sentence_detruecased = processors.detruecase(sentence)
    sentence_detokenized = processors.detokenize(sentence_detruecased)
    sentence_postprocessed = processors.postpro(sentence_detokenized)

    spans = tracker.track_detok(sentence, sentence_detruecased,
                                verbose=verbose)
    spans = tracker.track_detok(sentence_detruecased, sentence_detokenized,
                                spans=spans, verbose=verbose,
                                check_escape=True)
    spans = tracker.track_detok(sentence_detokenized, sentence_postprocessed,
                                spans=spans, verbose=verbose)
    return sentence_postprocessed, spans
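# Usage sketch (hypothetical values): postprocess a truecased, tokenized
# hypothesis; the tracker is chained in the same direction as the
# processing, so the returned spans should map tokens of the input
# hypothesis into the postprocessed string:
#
#   postprocessed, spans = self._track_postprocessing("hallo Welt .")
#   # e.g. postprocessed == "Hallo Welt."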
def _track_preprocessing(self, sentence, is_source, verbose=False):
    """Preprocess, tokenize and truecase `sentence`, tracking token
    spans through every step; the spans refer to the original input."""
    processors = (self.external_processors if is_source
                  else self.tgt_external_processors)
    sentence_preprocessed = processors.prepro(sentence)
    sentence_tokenized = processors.tokenize(sentence_preprocessed)
    sentence_truecased = processors.truecase(sentence_tokenized)

    tracker = tokentracker.TokenTracker()
    # the tracker is applied in the opposite direction to the
    # processing, as the final spans refer to the original input
    spans = tracker.track_detok(sentence_truecased, sentence_tokenized,
                                verbose=verbose)
    spans = tracker.track_detok(sentence_tokenized, sentence_preprocessed,
                                spans=spans, verbose=verbose,
                                check_escape=True)
    spans = tracker.track_detok(sentence_preprocessed, sentence,
                                spans=spans, verbose=verbose)
    return sentence_truecased, spans
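# Usage sketch (hypothetical values): preprocess a raw source sentence
# and obtain spans that map each truecased token back to character
# offsets in the raw input:
#
#   truecased, spans = self._track_preprocessing("Hello world.", True)
#   # e.g. truecased == "hello world ."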
def translate(self, **kwargs):
    response = cherrypy.response
    response.headers['Content-Type'] = 'application/json'

    errors = self._check_params(kwargs)
    if errors:
        cherrypy.response.status = 400
        return self._dump_json(errors)

    q = self.filter.filter(kwargs["q"])
    raw_src = q
    self.log("The server is working on: %s" % repr(q))
    self.log_info("Request before preprocessing: %s" % repr(q))
    translationDict = {"sourceText": q.strip()}

    # pre-processing of the source
    q = self.external_processors.tokenize(q)
    q = self.external_processors.truecase(q)
    q = self.external_processors.prepro(q)
    self.log_info("Request after preprocessing: %s" % repr(q))
    preprocessed_src = q

    # source spans refer to the raw (unprocessed) input
    tt = tokentracker.TokenTracker()
    src_spans = tt.tokenize(preprocessed_src)
    src_spans = tt.track_detok(preprocessed_src, raw_src, src_spans)

    self.log_info("Request before annotation: %s" % repr(q))
    q = self.external_processors.annotate(q)
    not_annotated_src = self._getOnlyTranslation(q)
    assert len(preprocessed_src.split()) == len(not_annotated_src.split()), \
        "annotation should not change the number of tokens"
    self.log_info("Request after annotation (q): %s" % repr(q))

    translation = ''
    report_search_graph = 'sg' in kwargs
    report_translation_options = 'topt' in kwargs
    report_alignment = 'align' in kwargs

    # how many (if any) entries do we need in the n-best list?
    nbest = 0
    if 'nbest' in kwargs:
        nbest = max(nbest, int(kwargs['nbest']))
    if 'wpp' in kwargs:
        nbest = max(nbest, int(kwargs['wpp']))

    result = self._translate(q, sg=report_search_graph,
                             topt=report_translation_options,
                             align=report_alignment, nbest=nbest)
    if 'text' in result:
        translation = result['text']
    else:
        return self._timeout_error(q, 'translation')

    translationDict.update(self._getTranslation(translation))
    translationDict["tokenization"].update({'src': src_spans})

    if 'sg' in result:
        if self.prune_sg:
            translationDict['searchGraph'] = self.prune_search_graph(result['sg'])
        else:
            translationDict['searchGraph'] = result['sg']
    if 'topt' in result:
        translationDict['topt'] = result['topt']
    if 'align' in result:
        translationDict['alignment'] = result['align']

    if 'nbest' in result:
        if 'nbest' in kwargs:
            n = min(int(kwargs['nbest']), len(result['nbest']))
            translationDict['raw_nbest'] = result['nbest'][:n]
            translationDict['nbest'] = []
            for nbest_result in result['nbest'][:n]:
                hyp = nbest_result['hyp']
                translationDict['nbest'].append(self._getTranslation(hyp))
        if 'wpp' in kwargs:
            buff = []
            n = min(int(kwargs['wpp']), len(result['nbest']))
            for nbest_result in result['nbest'][:n]:
                hyp = nbest_result['hyp']
                score = nbest_result['totalScore']
                buff.append([0, hyp, score])
            word_posterior = wpp.WPP(align=True)
            probs = word_posterior.process_buff(buff, translation)
            # wrap in list() so the value stays JSON-serializable under Python 3
            translationDict['wpp'] = list(map(math.exp, probs))
            translationDict['wpp_score'] = geometric_mean(probs)

    data = {"data": {"translations": [translationDict]}}
    self.log("The server is returning:")
    return self._dump_json(data)
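# Example request (illustrative; the URL path assumes cherrypy's default
# dispatcher exposing this method as /translate, and all parameter names
# are the ones handled above):
#
#   GET /translate?q=Hello+world.&nbest=5&wpp=100&sg=1&topt=1&align=1
#
# 'nbest' and 'wpp' take integers; 'sg', 'topt' and 'align' only need to
# be present to enable the corresponding output in the response.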