コード例 #1
ファイル: tokenizer.py プロジェクト: kuserich/mtrain
    def __init__(self, lang_code, protect=False, protected_patterns_path=None, escape=True):
        """
        Builds the Moses tokenizer command line and hands it to an
        ExternalProcessor.

        @param lang_code language identifier
        @param protect whether the tokenizer should respect patterns that should not be tokenized
        @param protected_patterns_path path to file with protected patterns
        @param escape whether characters that break the Moses decoder should be escaped
        """
        arguments = [
            '-l %s' % lang_code,
            '-b',  # disable Perl buffering
            '-q',  # don't report version
            '-a',  # aggressive mode
        ]

        if protect:
            # protect e.g. inline XML, URLs and email
            arguments.append('-protected %s' % protected_patterns_path)

        if not escape:
            # do not escape reserved characters in Moses
            arguments.append('-no-escape')

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_TOKENIZER] + arguments)
        )
コード例 #2
ファイル: engine.py プロジェクト: kuserich/mtrain
    def __init__(self, path_moses_ini, report_alignment=False, report_segmentation=False):
        """
        Builds the Moses decoder command line and hands it to an
        ExternalProcessor.

        @param path_moses_ini path to Moses configuration file
        @param report_alignment whether Moses should report word alignments
        @param report_segmentation whether Moses should report how the translation
            is made up of phrases
        """
        self._path_moses_ini = path_moses_ini
        self._report_alignment = report_alignment
        self._report_segmentation = report_segmentation

        arguments = [
            '-f %s' % self._path_moses_ini,
            '-minphr-memory',  # compact phrase table
            '-minlexr-memory',  # compact reordering table
            '-v 0',  # as quiet as possible
            '-xml-input constraint',  # allow forced translations and zones
        ]
        trailing_output = False

        # NOTE(review): the statements that added these two decoder flags were
        # lost from this listing; the flag names follow the Moses decoder
        # options -- confirm against the upstream file.
        if self._report_alignment:
            arguments.append('-print-alignment-info')
            trailing_output = True
        if self._report_segmentation:
            arguments.append('-report-segmentation')

        # NOTE(review): the keyword arguments that followed `command` were also
        # lost; `trailing_output` is assumed to be forwarded under the same
        # name -- confirm against ExternalProcessor's signature.
        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES] + arguments),
            trailing_output=trailing_output,
        )
コード例 #3
 def __init__(self, path_moses_ini):
     """
     Builds the Moses command line used for recasing and hands it to an
     ExternalProcessor.

     @param path_moses_ini path passed to Moses through its `-f` option
     """
     arguments = [
         '-f %s' % path_moses_ini,
         '-dl 0',
         '-v 0',
     ]
     self._processor = ExternalProcessor(
         command=" ".join([MOSES] + arguments)
     )
コード例 #4
    def __init__(self):
        """
        Detruecaser that is a script, no model training.
        """
        arguments = [
            '-b',  # disable Perl buffering
        ]

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_DETRUECASER] + arguments)
        )
コード例 #5
    def __init__(self, path_model):
        """
        Builds the Moses truecaser command line and hands it to an
        ExternalProcessor.

        @param path_model path to truecasing model trained in `mtrain`
        """
        arguments = [
            '-model %s' % path_model,
            '-b',  # disable Perl buffering
        ]

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_TRUECASER] + arguments)
        )
コード例 #6
    def __init__(self, lang_code):
        """
        Builds the Moses normalizer command line and hands it to an
        ExternalProcessor.

        @param lang_code language identifier
        """
        arguments = [
            '-l %s' % lang_code,
            '-b',  # disable Perl buffering
            '-q',  # don't report version
        ]   # no aggressive mode '-a' for normalizer

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_NORMALIZER] + arguments)
        )
コード例 #7
ファイル: tokenizer.py プロジェクト: kuserich/mtrain
class Detokenizer(object):
    """
    Creates a detokenizer which detokenizes lists of tokens on-the-fly, i.e.,
    allowing interaction with a Moses detokenizer process kept in memory.
    """

    def __init__(self, lang_code, uppercase_first_letter=False):
        """
        @param lang_code language identifier
        @param uppercase_first_letter whether or not to uppercase the first
            letter in the detokenized output.
        """
        arguments = [
            '-l %s' % lang_code,
            '-b',  # disable Perl buffering
            '-q',  # don't report version
        ]

        # NOTE(review): the body of this branch was lost from the listing;
        # `-u` is the Moses detokenizer's "uppercase first character" switch
        # -- confirm against the upstream file.
        if uppercase_first_letter:
            arguments.append('-u')

        # NOTE(review): the original call carried further keyword arguments
        # after `command` that are not recoverable from this listing.
        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_DETOKENIZER] + arguments),
        )

    def close(self):
        """
        Drops this object's reference to the external processor.
        """
        del self._processor

    def detokenize(self, tokens):
        """
        Detokenizes a list of @param tokens into a segment

        @return whatever the external processor returns for the
            space-joined tokens
        """
        return self._processor.process(" ".join(tokens))
コード例 #8
class Recaser(object):
    """
    Creates a recaser which recases sentences on-the-fly, i.e., allowing
    interaction with a Moses recaser engine kept in memory.
    """

    def __init__(self, path_moses_ini):
        """
        @param path_moses_ini path passed to Moses through its `-f` option
        """
        arguments = [
            '-f %s' % path_moses_ini,
            '-dl 0',
            '-v 0',
        ]
        self._processor = ExternalProcessor(
            command=" ".join([MOSES] + arguments)
        )

    def close(self):
        """
        Drops this object's reference to the external processor.
        """
        del self._processor

    def recase(self, segment):
        """
        Recases a single segment.

        @param segment the string handed to the external processor
        @return the processor's output for that string
        """
        return self._processor.process(segment)

    def recase_tokens(self, tokens):
        """
        Recases a list of tokens.

        @param tokens list of strings; joined with single spaces before recasing
        @return the recased segment split on single spaces
        """
        return self.recase(" ".join(tokens)).split(" ")
コード例 #9
class Normalizer(object):
    """
    Creates a normalizer for processing segment by segment, allowing
    interaction with a normalizer process kept in memory.
    """

    def __init__(self, lang_code):
        """
        @param lang_code language identifier
        """
        arguments = [
            '-l %s' % lang_code,
            '-b',  # disable Perl buffering
            '-q',  # don't report version
        ]   # no aggressive mode '-a' for normalizer

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_NORMALIZER] + arguments)
        )

    def close(self):
        """
        Drops this object's reference to the external processor.
        """
        del self._processor

    def normalize_punctuation(self, segment):
        """
        Normalizes punctuation characters of a single @param segment.

        @return the external processor's output for the segment
        """
        return self._processor.process(segment)
コード例 #10
class Detruecaser(object):
    """
    Creates a detruecaser which detruecases sentences on-the-fly, i.e., allowing
    interaction with a Moses truecaser process kept in memory.
    """

    def __init__(self):
        """
        Detruecaser that is a script, no model training.
        """
        arguments = [
            '-b',  # disable Perl buffering
        ]

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_DETRUECASER] + arguments)
        )

    def close(self):
        """
        Drops this object's reference to the external processor.
        """
        del self._processor

    def detruecase_segment(self, segment):
        """
        Detruecases a single segment.

        @param segment the string handed to the external processor
        @return the processor's output for that string
        """
        return self._processor.process(segment)

    def detruecase_tokens(self, tokens):
        """
        Detruecases a list of tokens.

        @param tokens list of strings; joined with single spaces before processing
        @return the detruecased segment split on single spaces
        """
        detruecased_segment = self.detruecase_segment(" ".join(tokens))
        return detruecased_segment.split(" ")
コード例 #11
ファイル: tokenizer.py プロジェクト: kuserich/mtrain
 def __init__(self, lang_code, uppercase_first_letter=False):
     """
     Builds the Moses detokenizer command line and hands it to an
     ExternalProcessor.

     @param lang_code language identifier
     @param uppercase_first_letter whether or not to uppercase the first
         letter in the detokenized output.
     """
     arguments = [
         '-l %s' % lang_code,
         '-b',  # disable Perl buffering
         '-q',  # don't report version
     ]

     # NOTE(review): the body of this branch was lost from the listing;
     # `-u` is the Moses detokenizer's "uppercase first character" switch
     # -- confirm against the upstream file.
     if uppercase_first_letter:
         arguments.append('-u')

     # NOTE(review): the original call carried further keyword arguments
     # after `command` that are not recoverable from this listing.
     self._processor = ExternalProcessor(
         command=" ".join([C.MOSES_DETOKENIZER] + arguments),
     )
コード例 #12
    def __init__(self, bpe_model_path, vocab_path=None):
        """
        Builds the apply_bpe.py command line and hands it to an
        ExternalProcessor.

        @param bpe_model_path full path to BPE model
        @param vocab_path optional path to vocabulary file
        """
        arguments = [
            '-c %s' % bpe_model_path,
        ]

        # the vocabulary options are only passed when a vocabulary is given
        if vocab_path is not None:
            arguments.extend([
                '--vocabulary %s' % vocab_path,
                '--vocabulary-threshold %d' % C.BPE_VOCAB_THRESHOLD,
            ])

        # the subword script apply_bpe.py needs to be run in a Python 3 environment,
        # a constant is used to avoid version problems
        # NOTE(review): the original call carried further keyword arguments
        # after `command` that are not recoverable from this listing.
        self._processor = ExternalProcessor(
            command=" ".join([C.PYTHON3] + [C.SUBWORD_NMT_APPLY] + arguments),
        )
コード例 #13
ファイル: tokenizer.py プロジェクト: kuserich/mtrain
class Tokenizer(object):
    """
    Creates a tokenizer which tokenizes sentences on-the-fly, i.e., allowing
    interaction with a Moses tokenizer process kept in memory.
    """

    def __init__(self, lang_code, protect=False, protected_patterns_path=None, escape=True):
        """
        @param lang_code language identifier
        @param protect whether the tokenizer should respect patterns that should not be tokenized
        @param protected_patterns_path path to file with protected patterns
        @param escape whether characters that break the Moses decoder should be escaped
        """
        arguments = [
            '-l %s' % lang_code,
            '-b',  # disable Perl buffering
            '-q',  # don't report version
            '-a',  # aggressive mode
        ]

        if protect:
            # protect e.g. inline XML, URLs and email
            arguments.append('-protected %s' % protected_patterns_path)

        if not escape:
            # do not escape reserved characters in Moses
            arguments.append('-no-escape')

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_TOKENIZER] + arguments)
        )

    def close(self):
        """
        Drops this object's reference to the external processor.
        """
        del self._processor

    def tokenize(self, segment, split=True):
        """
        Tokenizes a single @param segment.

        @param split determines if a tokenized segment should be split by a space
        @return a list of tokens if @param split is true, otherwise the
            tokenized segment as a single string
        """
        tokenized_segment = self._processor.process(segment)
        if split:
            return tokenized_segment.split(" ")
        return tokenized_segment
コード例 #14
class BytePairEncoderSegment(object):
    """
    Applies a trained BPE model to individual segments.
    """

    def __init__(self, bpe_model_path, vocab_path=None):
        """
        @param bpe_model_path full path to BPE model
        @param vocab_path optional path to vocabulary file
        """
        arguments = [
            '-c %s' % bpe_model_path,
        ]

        # the vocabulary options are only passed when a vocabulary is given
        if vocab_path is not None:
            arguments.extend([
                '--vocabulary %s' % vocab_path,
                '--vocabulary-threshold %d' % C.BPE_VOCAB_THRESHOLD,
            ])

        # the subword script apply_bpe.py needs to be run in a Python 3 environment,
        # a constant is used to avoid version problems
        # NOTE(review): the original call carried further keyword arguments
        # after `command` that are not recoverable from this listing.
        self._processor = ExternalProcessor(
            command=" ".join([C.PYTHON3] + [C.SUBWORD_NMT_APPLY] + arguments),
        )

    def close(self):
        """
        Deletes reference to obsolete objects.
        """
        del self._processor

    def encode_segment(self, segment):
        """
        Encodes a single @param segment by applying a trained BPE model.

        @return the external processor's output for the segment
        """
        return self._processor.process(segment)
コード例 #15
class Truecaser(object):
    """
    Creates a truecaser which truecases sentences on-the-fly, i.e., allowing
    interaction with a Moses truecaser process kept in memory.
    """

    def __init__(self, path_model):
        """
        @param path_model path to truecasing model trained in `mtrain`
        """
        arguments = [
            '-model %s' % path_model,
            '-b',  # disable Perl buffering
        ]

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_TRUECASER] + arguments)
        )

    def close(self):
        """
        Deletes object to free up memory.
        """
        del self._processor

    def truecase_segment(self, segment):
        """
        Truecases a single segment.

        @param segment the string handed to the external processor
        @return the processor's output for that string
        """
        return self._processor.process(segment)

    def truecase_tokens(self, tokens, split=True):
        """
        Truecases a list of tokens.

        @param tokens list of strings; joined with single spaces before truecasing
        @param split whether the truecased segment should be split on single
            spaces before it is returned
        @return a list of tokens if @param split is true, otherwise a string
        """
        truecased_string = self.truecase_segment(" ".join(tokens))
        if split:
            return truecased_string.split(" ")
        return truecased_string
コード例 #16
ファイル: engine.py プロジェクト: kuserich/mtrain
class EngineMoses(object):
    """
    Starts a translation engine process for moses backend and keep it running.
    """

    def __init__(self, path_moses_ini, report_alignment=False, report_segmentation=False):
        """
        @param path_moses_ini path to Moses configuration file
        @param report_alignment whether Moses should report word alignments
        @param report_segmentation whether Moses should report how the translation
            is made up of phrases
        """
        self._path_moses_ini = path_moses_ini
        self._report_alignment = report_alignment
        self._report_segmentation = report_segmentation

        arguments = [
            '-f %s' % self._path_moses_ini,
            '-minphr-memory',  # compact phrase table
            '-minlexr-memory',  # compact reordering table
            '-v 0',  # as quiet as possible
            '-xml-input constraint',  # allow forced translations and zones
        ]
        trailing_output = False

        # NOTE(review): the statements that added these two decoder flags were
        # lost from this listing; the flag names follow the Moses decoder
        # options -- confirm against the upstream file.
        if self._report_alignment:
            arguments.append('-print-alignment-info')
            trailing_output = True
        if self._report_segmentation:
            arguments.append('-report-segmentation')

        # NOTE(review): the keyword arguments that followed `command` were also
        # lost; `trailing_output` is assumed to be forwarded under the same
        # name -- confirm against ExternalProcessor's signature.
        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES] + arguments),
            trailing_output=trailing_output,
        )

    def close(self):
        """
        Drops this object's reference to the external processor.
        """
        del self._processor

    def _extract_alignment(self, alignment_string):
        """
        Transforms a word alignment string into an easily
            accessible dictionary {source: [target, ...], ...}

        @param alignment_string the exact string returned by Moses
            that contains alignment information
        @return a defaultdict(list) mapping each source index to the list of
            target indexes aligned to it
        """
        alignments = defaultdict(list)

        # split() without an argument ignores surrounding and repeated
        # whitespace, so an empty alignment string yields an empty mapping
        # instead of failing on int('')
        for alignment in alignment_string.split():
            source, target = [int(string) for string in alignment.split("-")]
            alignments[source].append(target)

        return alignments

    def _separate_tokens_from_segmentation(self, translation):
        """
        Transform phrase segmentation strings into easily
            accessible dictionary:
            {(source start, source end): (target start, target end), ...}

        @param translation a translation string returned by Moses that does not
            contain word alignments anymore, but phrase segmentation is still
            present
        @return a tuple (tokens, segmentation): the target tokens without the
            segmentation markers, and the dictionary described above
        """
        tokens = []
        segmentation = {}
        # target-side indexes of the tokens read since the last marker
        current_phrase_indexes = []

        for string in translation.split(" "):
            if '|' in string:
                # a marker such as |0-1| closes the phrase read so far and
                # names the source span it translates
                current_segmentation = string.replace('|', '').split("-")
                current_key = tuple(int(index) for index in current_segmentation)
                if current_phrase_indexes:
                    # a one-token phrase starts and ends at the same index
                    segmentation[current_key] = (
                        current_phrase_indexes[0],
                        current_phrase_indexes[-1],
                    )
                else:
                    # marker without preceding target tokens
                    segmentation[current_key] = ()
                current_phrase_indexes = []
            else:
                current_phrase_indexes.append(len(tokens))
                tokens.append(string)

        return tokens, segmentation

    def _untangle_translation(self, translation):
        """
        Separates the actual translation from reported segmentation
            and word alignments. Changes slightly the segmentation info
            by adding information about the source tokens.

        @param translation the exact string returned by a Moses engine
        @return a tuple (translation, alignment, segmentation); alignment and
            segmentation are None unless the corresponding report option is on
        """
        alignment = None
        segmentation = None

        if self._report_alignment:
            parts = translation.split('|||')
            translation = parts[0].strip() # update translation to remove alignment info
            # tolerate output without the '|||' separator instead of raising
            # IndexError; it is then treated as having no alignment points
            alignment = self._extract_alignment(parts[1] if len(parts) > 1 else "")

        if self._report_segmentation:
            tokens, segmentation = self._separate_tokens_from_segmentation(translation)
            translation = " ".join(tokens) # update translation to only contain actual tokens

        return (
            translation,
            alignment,
            segmentation
        )

    def translate_segment(self, segment):
        """
        Translates a single input segment, @param segment.

        @return a TranslatedSegment object with a translation and,
        optionally, alignments and/or segmentation info
        """
        translation = self._processor.process(segment)
        translation, alignment, segmentation = self._untangle_translation(translation)

        # NOTE(review): the constructor arguments were lost from this listing;
        # the keyword names below are assumed -- confirm against the
        # TranslatedSegment definition.
        return TranslatedSegment(
            source=segment,
            target=translation,
            alignment=alignment,
            segmentation=segmentation
        )

    def translate_file(self, input_path, output_path):
        """
        Translates an entire file.

        @param input_path path to temp file with preprocessed input segments
        @param output_path path to temp file where raw translations should be written
        @raise NotImplementedError always; there is no implementation here
        """
        raise NotImplementedError