Beispiel #1
0
    def __init__(self, source_lang, target_lang, key=None):
        """Set up a Google Translate engine for one language pair.

        :param source_lang: source language, forwarded to ``TranslateEngine``
        :param target_lang: target language, forwarded to ``TranslateEngine``
        :param key: API key; when ``None`` the class-level
            ``DEFAULT_GOOGLE_KEY`` is used instead
        """
        TranslateEngine.__init__(self, source_lang, target_lang)

        # Only fall back to the class default when the caller gave no key.
        if key is None:
            key = self.DEFAULT_GOOGLE_KEY
        self._key = key

        # Current throttling delay; starts disabled.
        self._delay = 0
        self._xml_encoder = XMLEncoder()
        self._url = 'https://translation.googleapis.com/language/translate/v2'
Beispiel #2
0
    def translate_text(self, text):
        """Translate ``text`` by POSTing it to the endpoint stored in ``self._url``.

        Text without any XML tag is unescaped before being sent and the
        translation is escaped again before it is returned; text containing
        XML tags is sent and returned as-is.

        Rate-limit errors are retried without bound while the shared delay
        grows. Server errors are retried one second apart and the tenth one
        is raised. Any other error is raised immediately.

        :param text: the text to translate
        :return: the translated text
        """
        contains_xml = XMLEncoder.has_xml_tag(text)
        if not contains_xml:
            text = XMLEncoder.unescape(text)

        payload = {
            'model': 'nmt',
            'source': self._normalize_language(self.source_lang),
            'target': self._normalize_language(self.target_lang),
            'q': text,
            'key': self._key,
        }
        # A random dotted quad is sent as the 'userip' field.
        payload['userip'] = '.'.join(str(random.randint(0, 200)) for _ in range(4))

        headers = {'X-HTTP-Method-Override': 'GET'}

        hit_rate_limit = False
        server_failures = 0

        while True:
            # Throttle with a jittered share of the current delay.
            if self._delay > 0:
                time.sleep(self._delay * random.uniform(0.5, 1))

            response = requests.post(self._url, data=payload, headers=headers)
            if response.status_code == requests.codes.ok:
                break

            error = self._pack_error(response)

            if isinstance(error, GoogleRateLimitError):
                # Slow down and try again.
                hit_rate_limit = True
                self._increment_delay()
                continue

            if not isinstance(error, GoogleServerError):
                raise error

            server_failures += 1
            if server_failures >= 10:
                raise error
            time.sleep(1.)

        # A request that never hit the rate limit lets the delay relax again.
        if self._delay > 0 and not hit_rate_limit:
            self._decrement_delay()

        translated = response.json()['data']['translations'][0]['translatedText']
        if contains_xml:
            return translated
        return XMLEncoder.escape(translated)
Beispiel #3
0
    def __init__(self, node, google_key=None, google_nmt=False):
        """Prepare the helpers and translators used by an evaluation run.

        :param node: cluster node; its ``engine`` attribute provides the
            language pair
        :param google_key: API key forwarded to ``GoogleTranslate`` as ``key``
        :param google_nmt: flag forwarded to ``GoogleTranslate`` as ``nmt``
        """
        self._node = node
        self._engine = node.engine

        self._heval_outputter = HumanEvaluationFileOutputter()
        self._xmlencoder = XMLEncoder()

        engine = self._engine
        google = GoogleTranslate(engine.source_lang, engine.target_lang, key=google_key, nmt=google_nmt)
        mmt = MMTTranslator(self._node)

        # BingTranslator(source_lang, target_lang) is currently left out.
        self._translators = [google, mmt]
Beispiel #4
0
        def _export_he_file(src, dest_folder):
            """Copy ``src`` into ``dest_folder`` in tab-separated form.

            Every input line is passed through ``XMLEncoder.encode`` and
            written as ``<line index>\\t<lang>\\t<text>``, where ``lang`` is
            the extension of ``src`` without the dot and tabs inside the
            text are replaced by spaces.
            """
            base_name = os.path.basename(src)
            _, extension = os.path.splitext(base_name)
            lang = extension[1:]
            dest = os.path.join(dest_folder, base_name)

            with open(src, 'r', encoding='utf-8') as reader:
                with open(dest, 'w', encoding='utf-8') as writer:
                    for index, raw_line in enumerate(reader):
                        encoded = XMLEncoder.encode(raw_line)
                        # Tabs would break the three-column layout.
                        encoded = encoded.replace('\t', ' ')
                        writer.write('%d\t%s\t%s' % (index, lang, encoded))
Beispiel #5
0
class GoogleTranslate(TranslateEngine):
    """Translation engine backed by the Google Translate v2 REST endpoint.

    Requests are throttled adaptively: ``_delay`` grows each time the
    service answers with a rate-limit error and shrinks again after a call
    that completed without hitting the limit.
    """

    # NOTE(review): this API key is hard-coded and shipped with the source;
    # consider loading it from configuration instead.
    DEFAULT_GOOGLE_KEY = 'AIzaSyBl9WAoivTkEfRdBBSCs4CruwnGL_aV74c'

    def __init__(self, source_lang, target_lang, key=None):
        """Create the engine for one language pair.

        :param source_lang: source language, forwarded to ``TranslateEngine``
        :param target_lang: target language, forwarded to ``TranslateEngine``
        :param key: API key; ``DEFAULT_GOOGLE_KEY`` is used when ``None``
        """
        TranslateEngine.__init__(self, source_lang, target_lang)
        self._key = key if key is not None else self.DEFAULT_GOOGLE_KEY
        # Throttling delay in seconds slept before each request; 0 = disabled.
        self._delay = 0
        self._xml_encoder = XMLEncoder()

        self._url = 'https://translation.googleapis.com/language/translate/v2'

    @property
    def name(self):
        """Human-readable name of this engine."""
        return 'Google Translate'

    def _get_default_threads(self):
        """Return the default thread count for this engine (5)."""
        return 5

    @staticmethod
    def _pack_error(request):
        """Build the exception object that describes a failed HTTP response.

        The exception is returned, not raised, so that the caller can decide
        whether the request should be retried.

        A response whose body is not JSON, or whose JSON lacks the expected
        ``error`` structure (for example an HTML error page), is still
        classified by its status code instead of raising ``ValueError`` or
        ``KeyError``; the HTTP reason phrase is then used as the message.

        :param request: the failed ``requests`` response object
        :return: ``GoogleRateLimitError`` for a per-user rate limit,
            ``GoogleServerError`` for any 5xx status, ``TranslateError``
            otherwise
        """
        try:
            payload = request.json()
        except ValueError:
            # Body is not valid JSON; fall back to status-code-only handling.
            payload = None

        error = payload.get('error') if isinstance(payload, dict) else None
        if not isinstance(error, dict):
            error = {}

        status = request.status_code
        message = error.get('message')
        if message is None:
            message = request.reason

        if status == 403:
            for item in error.get('errors') or ():
                reason = item.get('reason') if isinstance(item, dict) else None
                if reason == 'dailyLimitExceeded':
                    return TranslateError('Google Translate free quota is over. Please use option --gt-key'
                                          ' to specify your GT API key.')
                elif reason == 'userRateLimitExceeded':
                    return GoogleRateLimitError('Google Translate rate limit exceeded')
        elif 500 <= status < 600:
            return GoogleServerError('Google Translate server error (%d): %s' % (status, message))

        return TranslateError('Google Translate error (%d): %s' % (status, message))

    def _increment_delay(self):
        """Raise the throttling delay: start at 0.05s, then +5% up to 1s."""
        if self._delay < 0.002:
            self._delay = 0.05
        else:
            self._delay = min(1, self._delay * 1.05)

    def _decrement_delay(self):
        """Lower the throttling delay by 5%, snapping to 0 below 0.002s."""
        self._delay *= 0.95

        if self._delay < 0.002:
            self._delay = 0

    def translate_text(self, text):
        """Translate ``text`` and return it passed through the XML encoder.

        Rate-limit errors are retried without bound while the delay grows.
        Server errors are retried one second apart and the tenth one is
        raised. Any other error is raised immediately.

        :param text: the text to translate
        :return: the translation, processed by ``XMLEncoder.encode_string``
        """
        data = {
            'model': 'nmt',
            'source': map_language(self.source_lang),
            'target': map_language(self.target_lang),
            'q': text,
            'key': self._key,
            # A random dotted quad is sent as the 'userip' field.
            'userip': '.'.join(map(str, (random.randint(0, 200) for _ in range(4))))
        }

        headers = {
            'X-HTTP-Method-Override': 'GET'
        }

        rate_limit_reached = False
        server_error_count = 0

        while True:
            # Throttle with a jittered share of the current delay.
            if self._delay > 0:
                delay = self._delay * random.uniform(0.5, 1)
                time.sleep(delay)

            r = requests.post(self._url, data=data, headers=headers)

            if r.status_code != requests.codes.ok:
                e = self._pack_error(r)
                if isinstance(e, GoogleRateLimitError):
                    rate_limit_reached = True
                    self._increment_delay()
                elif isinstance(e, GoogleServerError):
                    server_error_count += 1

                    if server_error_count < 10:
                        time.sleep(1.)
                    else:
                        raise e
                else:
                    raise e
            else:
                break

        # A call that never hit the rate limit lets the delay relax again.
        if not rate_limit_reached and self._delay > 0:
            self._decrement_delay()

        translation = r.json()['data']['translations'][0]['translatedText']
        translation = self._xml_encoder.encode_string(translation)

        return translation
Beispiel #6
0
    def evaluate(self, corpora, heval_output=None, debug=False):
        """Translate the corpora with every translator, score them and print a report.

        Only corpora containing both the source and the target language are
        used. The report (scores per scorer, then translation speed) is
        printed to standard output; nothing is returned.

        :param corpora: corpus objects exposing ``langs``, ``count_lines()``
            and ``get_file(lang)``
        :param heval_output: optional directory (created if needed) that
            receives the human-evaluation files for source, reference and
            each successful translation
        :param debug: when true, the temporary ``evaluation`` directory is
            kept instead of being cleared at the end
        :raises IllegalArgumentException: if no corpus contains both languages
        """
        # Keep only the corpora that cover the whole language pair.
        corpora = [
            corpus for corpus in corpora if self._source_lang in corpus.langs
            and self._target_lang in corpus.langs
        ]
        if len(corpora) == 0:
            raise IllegalArgumentException(
                'No %s > %s corpora found into specified path' %
                (self._source_lang, self._target_lang))

        print '\n============== EVALUATION ==============\n'
        print 'Testing on %d lines:\n' % sum(
            [corpus.count_lines() for corpus in corpora])

        if heval_output is not None:
            osutils.makedirs(heval_output, exist_ok=True)

        step_logger = _StepLogger()
        # The outputter only exists when human-evaluation output was requested.
        human_eval_outputter = HumanEvaluationFileOutputter(
        ) if heval_output is not None else None

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            # Process references
            with step_logger.step('Preparing corpora') as _:
                # Concatenate all source files into a single file.
                source = os.path.join(working_dir,
                                      'source.' + self._source_lang)
                osutils.concat(
                    [corpus.get_file(self._source_lang) for corpus in corpora],
                    source)

                # Concatenate the references into a temporary file, then
                # XML-encode it into the final reference file.
                reference = os.path.join(working_dir,
                                         'reference.' + self._target_lang)
                osutils.concat(
                    [corpus.get_file(self._target_lang) for corpus in corpora],
                    reference + '.tmp')
                XMLEncoder().encode_file(reference + '.tmp', reference)
                os.remove(reference + '.tmp')

                if human_eval_outputter is not None:
                    human_eval_outputter.write(lang=self._target_lang,
                                               input_file=reference,
                                               output_file=os.path.join(
                                                   heval_output, 'reference.' +
                                                   self._target_lang))
                    human_eval_outputter.write(lang=self._source_lang,
                                               input_file=source,
                                               output_file=os.path.join(
                                                   heval_output, 'source.' +
                                                   self._source_lang))

                total_line_count = osutils.lc(reference)

            # Translate
            entries = []
            for translator in self._translators:
                with step_logger.step('Translating with %s' %
                                      translator.name) as _:
                    entry = self._translate_with(translator, corpora,
                                                 working_dir, total_line_count)
                    entries.append(entry)

                    # Failed translations produce no human-evaluation file.
                    if entry.error is None and human_eval_outputter is not None:
                        human_eval_file = os.path.join(
                            heval_output,
                            os.path.basename(entry.translation_file))
                        human_eval_outputter.write(
                            lang=self._target_lang,
                            input_file=entry.translation_file,
                            output_file=human_eval_file)

            # Scoring
            for scorer in self._scorers:
                with step_logger.step('Calculating %s' % scorer.name()) as _:
                    for entry in entries:
                        if entry.error is not None:
                            continue
                        try:
                            entry.scores[scorer] = scorer.calculate(
                                entry.translation_file, reference)
                        except Exception as e:
                            # A scorer failure is stored as text in place of
                            # the score and printed as-is in the report.
                            entry.scores[scorer] = str(e)

            # Print results
            print '\n=============== RESULTS ================\n'

            for scorer in self._scorers:
                print scorer.name() + ':'

                # Highest score first; entries with an error sort with key 0.
                for i, entry in enumerate(
                        sorted(entries,
                               key=lambda x: x.scores[scorer]
                               if x.error is None else 0,
                               reverse=True)):
                    if entry.error is None:
                        value = entry.scores[scorer]
                        if isinstance(value, basestring):
                            text = value
                        else:
                            text = '%.2f' % (value * 100)
                            # Only a numeric score in first position is
                            # labelled as the winner.
                            if i == 0:
                                text += ' (Winner)'
                    else:
                        text = str(entry.error)

                    print '  %s: %s' % (entry.translator.name.ljust(20), text)
                print

            print 'Translation Speed:'
            # Fastest first; failed entries are pushed to the end.
            for entry in sorted(entries,
                                key=lambda x: x.translation_time
                                if x.error is None else float('inf')):
                if entry.error is None:
                    text = '%.2fs per sentence' % entry.translation_time
                else:
                    text = str(entry.error)

                print '  %s: %s' % (entry.translator.name.ljust(20), text)
            print
        finally:
            # Keep the working directory around only when debugging.
            if not debug:
                self._engine.clear_tempdir('evaluation')
Beispiel #7
0
class Evaluator:
    """Runs every configured translator on a set of corpora and scores the output."""

    def __init__(self, node, google_key=None, google_nmt=False):
        """Prepare the helpers and translators used by an evaluation run.

        :param node: cluster node; its ``engine`` attribute provides the
            language pair
        :param google_key: API key forwarded to ``GoogleTranslate`` as ``key``
        :param google_nmt: flag forwarded to ``GoogleTranslate`` as ``nmt``
        """
        self._engine = node.engine
        self._node = node

        self._heval_outputter = HumanEvaluationFileOutputter()
        self._xmlencoder = XMLEncoder()
        self._translators = [
            GoogleTranslate(self._engine.source_lang, self._engine.target_lang, key=google_key, nmt=google_nmt),
            # BingTranslator(source_lang, target_lang),
            MMTTranslator(self._node)
        ]

    def evaluate(self, corpora, heval_output=None, debug=False):
        """Translate ``corpora`` with each translator and compute its scores.

        Only corpora containing both engine languages are used. A translator
        that fails gets its ``error`` recorded on the result and is skipped
        during the line-count check and scoring.

        :param corpora: corpus objects exposing ``langs`` and ``get_file(lang)``
        :param heval_output: optional directory (created if needed) that
            receives the human-evaluation files
        :param debug: when true, the temporary ``evaluation`` directory is
            kept instead of being cleared at the end
        :return: list of ``_EvaluationResult``, one per translator; successful
            ones carry ``mtt``, ``parallelism``, ``translated_corpora``,
            ``merge`` and the ``pes`` / ``bleu`` scores
        :raises IllegalArgumentException: if no corpus contains both languages
        :raises TranslateError: if a translation has a different number of
            lines than the reference
        """
        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        corpora = [corpus for corpus in corpora if source_lang in corpus.langs and target_lang in corpus.langs]
        if len(corpora) == 0:
            raise IllegalArgumentException('No %s > %s corpora found into specified path' % (source_lang, target_lang))

        if heval_output is not None:
            fileutils.makedirs(heval_output, exist_ok=True)

        logger = _evaluate_logger()
        logger.start(corpora)

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            results = []

            # Process references
            with logger.step('Preparing corpora') as _:
                corpora_path = os.path.join(working_dir, 'corpora')
                corpora = self._xmlencoder.encode(corpora, corpora_path)

                reference = os.path.join(working_dir, 'reference.' + target_lang)
                source = os.path.join(working_dir, 'source.' + source_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in corpora], reference)
                fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

                if heval_output is not None:
                    self._heval_outputter.write(lang=target_lang, input_file=reference,
                                                output_file=os.path.join(heval_output, 'reference.' + target_lang))
                    self._heval_outputter.write(lang=source_lang, input_file=source,
                                                output_file=os.path.join(heval_output, 'source.' + source_lang))

            # Translate
            for translator in self._translators:
                name = translator.name()

                with logger.step('Translating with %s' % name) as _:
                    result = _EvaluationResult(translator)
                    results.append(result)

                    # Raw translator output and its XML-encoded counterpart
                    # live in two sibling directories.
                    translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
                    xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
                    fileutils.makedirs(translations_path, exist_ok=True)

                    try:
                        translated, mtt, parallelism = translator.translate(corpora, translations_path)
                        filename = result.id + '.' + target_lang

                        result.mtt = mtt
                        result.parallelism = parallelism
                        result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                        result.merge = os.path.join(working_dir, filename)

                        fileutils.merge([corpus.get_file(target_lang)
                                         for corpus in result.translated_corpora], result.merge)

                        if heval_output is not None:
                            self._heval_outputter.write(lang=target_lang, input_file=result.merge,
                                                        output_file=os.path.join(heval_output, filename))
                    except TranslateError as e:
                        result.error = e
                    except Exception as e:
                        # str(e) instead of e.message: the 'message' attribute
                        # is deprecated and missing on Python 3 exceptions,
                        # where accessing it would mask the real failure.
                        result.error = TranslateError('Unexpected ERROR: ' + str(e))

            # Check corpora length
            reference_lines = fileutils.linecount(reference)
            for result in results:
                if result.error is not None:
                    continue

                lines = fileutils.linecount(result.merge)

                if lines != reference_lines:
                    raise TranslateError('Invalid line count for translator %s: expected %d, found %d.'
                                         % (result.translator.name(), reference_lines, lines))

            # Scoring
            # Each scorer is paired with the result attribute that stores it.
            scorers = [(MatecatScore(), 'pes'), (BLEUScore(), 'bleu')]

            for scorer, field in scorers:
                with logger.step('Calculating %s' % scorer.name()) as _:
                    for result in results:
                        if result.error is not None:
                            continue
                        setattr(result, field, scorer.calculate(result.merge, reference))

            logger.completed(results, scorers)

            return results
        finally:
            # Keep the working directory around only when debugging.
            if not debug:
                self._engine.clear_tempdir('evaluation')
Beispiel #8
0
 def _serialize_tokens(tokens):
     """Join the text of all non-tag tokens with single spaces.

     ``tokens`` yields pairs whose first item is the token text. Tokens that
     ``XMLEncoder.is_xml_tag`` recognises are dropped; the rest are
     unescaped before joining.
     """
     words = []
     for text, _ in tokens:
         if XMLEncoder.is_xml_tag(text):
             continue
         words.append(XMLEncoder.unescape(text))
     return u' '.join(words)
Beispiel #9
0
    def __init__(self, node, use_sessions=True):
        """Bind the translator to ``node`` and create its helpers.

        :param node: cluster node; its ``engine`` attribute is kept as well
        :param use_sessions: forwarded unchanged to ``MMTTranslator``
        """
        self._node = node
        self._engine = node.engine

        self._xmlencoder = XMLEncoder()
        self._translator = MMTTranslator(node, use_sessions)
Beispiel #10
0
class BatchTranslator:
    """Translates a set of corpora with the MMT engine and optionally scores them."""

    def __init__(self, node, use_sessions=True):
        """Bind the translator to ``node`` and create its helpers.

        :param node: cluster node; its ``engine`` attribute is kept as well
        :param use_sessions: forwarded unchanged to ``MMTTranslator``
        """
        self._engine = node.engine
        self._node = node

        self._xmlencoder = XMLEncoder()
        self._translator = MMTTranslator(self._node, use_sessions)

    def translate(self, corpora, dest_path=None, debug=False):
        """Translate ``corpora`` and, when references exist, compute BLEU.

        :param corpora: corpus objects exposing ``get_file(lang)`` and
            ``copy(dest, suffixes=...)``
        :param dest_path: optional directory (created if needed) that receives
            copies of the sources (``.src``), references (``.ref``) and
            translations (``.hyp``)
        :param debug: when true, the temporary ``evaluation`` directory is
            kept instead of being cleared at the end
        :return: the BLEU score if at least one corpus has a reference file,
            ``True`` if translation succeeded without references, ``None``
            if translation failed (the error is printed)
        :raises IllegalArgumentException: if ``corpora`` is empty
        """
        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')

        if dest_path:
            fileutils.makedirs(dest_path, exist_ok=True)

        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        working_dir = self._engine.get_tempdir('evaluation')
        have_references = False

        try:
            # Process references
            corpora_path = os.path.join(working_dir, 'corpora')
            corpora = self._xmlencoder.encode(corpora, corpora_path)

            reference = os.path.join(working_dir, 'reference.' + target_lang)
            source = os.path.join(working_dir, 'source.' + source_lang)
            # Corpora without a target-language file contribute no reference.
            refs = [
                corpus.get_file(target_lang) for corpus in corpora
                if corpus.get_file(target_lang)
            ]
            have_references = len(refs) > 0
            fileutils.merge(refs, reference)  # tolerates missing reference
            fileutils.merge(
                [corpus.get_file(source_lang) for corpus in corpora], source)

            if dest_path:
                for corpus in corpora:
                    corpus.copy(dest_path,
                                suffixes={
                                    source_lang: '.src',
                                    target_lang: '.ref',
                                    'tmx': '.src'
                                })

            # Translate
            translator = self._translator
            result = _EvaluationResult(translator)

            # Raw translator output and its XML-encoded counterpart live in
            # two sibling directories.
            translations_path = os.path.join(working_dir, 'translations',
                                             result.id + '.raw')
            xmltranslations_path = os.path.join(working_dir, 'translations',
                                                result.id)
            fileutils.makedirs(translations_path, exist_ok=True)

            try:
                translated, mtt, parallelism = translator.translate(
                    corpora, translations_path)
                filename = result.id + '.' + target_lang

                result.mtt = mtt
                result.parallelism = parallelism
                result.translated_corpora = self._xmlencoder.encode(
                    translated, xmltranslations_path)
                result.merge = os.path.join(working_dir, filename)

                fileutils.merge([
                    corpus.get_file(target_lang)
                    for corpus in result.translated_corpora
                ], result.merge)

                if dest_path:
                    for corpus in result.translated_corpora:
                        corpus.copy(dest_path,
                                    suffixes={
                                        target_lang: '.hyp',
                                        'tmx': '.hyp'
                                    })

            except TranslateError as e:
                result.error = e
            except Exception as e:
                # str(e) instead of e.message: the 'message' attribute is
                # deprecated and missing on Python 3 exceptions, where
                # accessing it would mask the real failure.
                result.error = TranslateError('Unexpected ERROR: ' + str(e))

            if result.error is not None:
                print(result.error)
                return None

            if not have_references:
                return True

            # bleu in range [0;1)
            return BLEUScore().calculate(result.merge, reference)
        finally:
            # Keep the working directory around only when debugging.
            if not debug:
                self._engine.clear_tempdir('evaluation')