Code example #1
File: tokenizer.py Project: kuserich/mtrain
class Detokenizer(object):
    """
    Creates a detokenizer which detokenizes lists of tokens on-the-fly, i.e.,
    allowing interaction with a Moses detokenizer process kept in memory.
    """

    def __init__(self, lang_code, uppercase_first_letter=False):
        """
        @param lang_code language identifier
        @param uppercase_first_letter whether or not to uppercase the first
            letter in the detokenized output.
        """
        arguments = [
            '-l %s' % lang_code,
            '-b',  # disable Perl buffering
            '-q',  # don't report version
        ]
        if uppercase_first_letter:
            arguments.append('-u')
        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_DETOKENIZER] + arguments),
            stream_stderr=True
        )

    def close(self):
        del self._processor

    def detokenize(self, tokens):
        """
        Detokenizes a list of @param tokens into a segment
        """
        return self._processor.process(" ".join(tokens))
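A minimal usage sketch for this class, assuming the Moses script behind C.MOSES_DETOKENIZER is installed; the import path below is an assumption:

from mtrain.preprocessing.tokenizer import Detokenizer  # module path is an assumption

detokenizer = Detokenizer('en', uppercase_first_letter=True)
# joins the tokens back into a natural segment, e.g. "Hello, world!"
print(detokenizer.detokenize(['hello', ',', 'world', '!']))
detokenizer.close()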
Code example #2
class Recaser(object):
    '''
    Creates a recaser which recases sentences on-the-fly, i.e., allowing
    interaction with a Moses recaser engine kept in memory.
    '''
    def __init__(self, path_moses_ini):
        arguments = [
            '-f %s' % path_moses_ini,
            '-dl 0',
            '-minphr-memory',
            '-v 0',
        ]
        self._processor = ExternalProcessor(command=" ".join([MOSES] + arguments))

    def close(self):
        del self._processor

    def recase(self, segment):
        '''
        Recases a single segment.
        '''
        return self._processor.process(segment)

    def recase_tokens(self, tokens):
        '''
        Recases a list of tokens.
        '''
        return self.recase(" ".join(tokens)).split(" ")
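A minimal usage sketch, assuming a trained recasing model's moses.ini exists at the placeholder path; the import path is an assumption and the outputs are illustrative only:

from mtrain.preprocessing.recaser import Recaser  # module path is an assumption

recaser = Recaser('/path/to/recaser/moses.ini')  # placeholder path
print(recaser.recase('the eu and the us'))       # e.g. "the EU and the US"
print(recaser.recase_tokens(['the', 'eu']))      # e.g. ['the', 'EU']
recaser.close()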
Code example #3
class Normalizer(object):
    """
    Creates a normalizer for processing segment by segment, allowing
    interaction with a normalizer process kept in memory.
    """

    def __init__(self, lang_code):
        """
        @param lang_code language identifier
        """
        arguments = [
            '-l %s' % lang_code,
            '-b',  # disable Perl buffering
            '-q',  # don't report version
        ]   # no aggressive mode '-a' for normalizer

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_NORMALIZER] + arguments)
        )

    def close(self):
        del self._processor

    def normalize_punctuation(self, segment):
        """
        Normalizes punctuation characters of a single @param segment.
        """
        normalized_segment = self._processor.process(segment)
        return normalized_segment
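A minimal usage sketch; the import path is an assumption, and the exact output depends on the Moses punctuation-normalization rules for the given language:

from mtrain.preprocessing.normalizer import Normalizer  # module path is an assumption

normalizer = Normalizer('en')
# typographic punctuation is mapped to plain equivalents,
# e.g. curly quotes become straight quotes (illustrative only)
print(normalizer.normalize_punctuation('“Hello” – he said'))
normalizer.close()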
Code example #4
class Detruecaser(object):
    """
    Creates a detruecaser which detruecases sentences on-the-fly, i.e., allowing
    interaction with a Moses detruecaser script kept in memory.
    """
    def __init__(self):
        """
        The detruecaser is a plain script; no trained model is required.
        """
        arguments = [
            '-b'  # disable Perl buffering
        ]

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_DETRUECASER] + arguments))

    def close(self):
        del self._processor

    def detruecase_segment(self, segment):
        """
        Detruecases a single segment.
        """
        return self._processor.process(segment)

    def detruecase_tokens(self, tokens):
        """
        Detruecases a list of tokens.
        """
        detruecased_segment = self.detruecase_segment(" ".join(tokens))
        return detruecased_segment.split(" ")
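A minimal usage sketch (the import path is an assumption); detruecasing restores sentence-initial capitalization:

from mtrain.preprocessing.truecaser import Detruecaser  # module path is an assumption

detruecaser = Detruecaser()
print(detruecaser.detruecase_segment('the meeting is in Geneva .'))
# e.g. "The meeting is in Geneva ." (illustrative only)
print(detruecaser.detruecase_tokens(['the', 'meeting']))  # e.g. ['The', 'meeting']
detruecaser.close()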
Code example #5
File: tokenizer.py Project: kuserich/mtrain
class Tokenizer(object):
    """
    Creates a tokenizer which tokenizes sentences on-the-fly, i.e., allowing
    interaction with a Moses tokenizer process kept in memory.
    """

    def __init__(self, lang_code, protect=False, protected_patterns_path=None, escape=True):
        """
        @param lang_code language identifier
        @param protect whether the tokenizer should respect patterns that should not be tokenized
        @param protected_patterns_path path to file with protected patterns
        @param escape whether characters that break the Moses decoder should be escaped
        """
        arguments = [
            '-l %s' % lang_code,
            '-b',  # disable Perl buffering
            '-q',  # don't report version
            '-a',  # aggressive mode
        ]

        if protect:
            arguments.append(
                '-protected %s' % protected_patterns_path,  # protect e.g. inline XML, URLs and email
            )

        if not escape:
            arguments.append(
                '-no-escape'  # do not escape reserved characters in Moses
            )

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_TOKENIZER] + arguments)
        )

    def close(self):
        del self._processor

    def tokenize(self, segment, split=True):
        """
        Tokenizes a single @param segment.

        @param split whether the tokenized segment should be split on spaces
        """
        tokenized_segment = self._processor.process(segment)
        if split:
            return tokenized_segment.split(" ")
        return tokenized_segment
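A minimal usage sketch, assuming the Moses tokenizer script is installed; the import path is an assumption and the outputs are illustrative only:

from mtrain.preprocessing.tokenizer import Tokenizer  # module path is an assumption

tokenizer = Tokenizer('en', escape=False)
print(tokenizer.tokenize('Hello, world!'))                # e.g. ['Hello', ',', 'world', '!']
print(tokenizer.tokenize('Hello, world!', split=False))   # e.g. "Hello , world !"
tokenizer.close()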
Code example #6
class BytePairEncoderSegment(object):
    """
    Applies a trained BPE model to individual segments.
    """
    def __init__(self, bpe_model_path, vocab_path=None):
        """
        @param bpe_model_path full path to BPE model
        @param vocab_path optional path to vocabulary file
        """
        arguments = [
            '-c %s' % bpe_model_path
        ]
        if vocab_path is not None:
            arguments.extend([
                '--vocabulary %s' % vocab_path,
                '--vocabulary-threshold %d' % C.BPE_VOCAB_THRESHOLD
            ])

        # the subword script apply_bpe.py needs to be run in a Python 3 environment,
        # a constant is used to avoid version problems
        self._processor = ExternalProcessor(
            command=" ".join([C.PYTHON3] + [C.SUBWORD_NMT_APPLY] + arguments),
            stream_stderr=False,
            trailing_output=False,
            shell=False
        )

    def close(self):
        """
        Deletes the reference to the underlying processor object.
        """
        del self._processor

    def encode_segment(self, segment):
        """
        Encodes a single @param segment by applying a trained BPE model.
        """
        encoded_segment = self._processor.process(segment)
        return encoded_segment
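A minimal usage sketch, assuming a BPE model trained with subword-nmt exists at the placeholder path; the import path is an assumption:

from mtrain.preprocessing.bpe import BytePairEncoderSegment  # module path is an assumption

bpe_encoder = BytePairEncoderSegment('/path/to/bpe.model')  # placeholder path
# rare words are split into subword units marked with "@@",
# e.g. "unreachable" -> "un@@ reach@@ able" (illustrative only)
print(bpe_encoder.encode_segment('this word is unreachable'))
bpe_encoder.close()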
Code example #7
class Truecaser(object):
    """
    Creates a truecaser which truecases sentences on-the-fly, i.e., allowing
    interaction with a Moses truecaser process kept in memory.
    """
    def __init__(self, path_model):
        """
        @param path_model path to truecasing model trained in `mtrain`
        """
        arguments = [
            '-model %s' % path_model,
            '-b'  # disable Perl buffering
        ]

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_TRUECASER] + arguments))

    def close(self):
        """
        Deletes object to free up memory.
        """
        del self._processor

    def truecase_segment(self, segment):
        """
        Truecases a single segment.
        """
        return self._processor.process(segment)

    def truecase_tokens(self, tokens, split=True):
        """
        Truecases a list of tokens.
        """
        truecased_string = self.truecase_segment(" ".join(tokens))
        if split:
            return truecased_string.split(" ")
        return truecased_string
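A minimal usage sketch, assuming a truecasing model trained in `mtrain` exists at the placeholder path; the import path is an assumption and the outputs depend on the model:

from mtrain.preprocessing.truecaser import Truecaser  # module path is an assumption

truecaser = Truecaser('/path/to/truecasing.model')  # placeholder path
print(truecaser.truecase_segment('The house is big .'))  # e.g. "the house is big ."
print(truecaser.truecase_tokens(['The', 'house']))       # e.g. ['the', 'house']
truecaser.close()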
Code example #8
File: engine.py Project: kuserich/mtrain
class EngineMoses(object):
    """
    Starts a translation engine process for the Moses backend and keeps it running.
    """
    def __init__(self, path_moses_ini, report_alignment=False, report_segmentation=False):
        """
        @param path_moses_ini path to Moses configuration file
        @param report_alignment whether Moses should report word alignments
        @param report_segmentation whether Moses should report how the translation
            is made up of phrases
        """
        self._path_moses_ini = path_moses_ini
        self._report_alignment = report_alignment
        self._report_segmentation = report_segmentation

        arguments = [
            '-f %s' % self._path_moses_ini,
            '-minphr-memory', # compact phrase table
            '-minlexr-memory', # compact reordering table
            '-v 0', # as quiet as possible
            '-xml-input constraint' # allow forced translations and zones
        ]
        trailing_output = False

        if self._report_alignment:
            arguments.append('-print-alignment-info')
            trailing_output = True
        if self._report_segmentation:
            arguments.append('-report-segmentation')

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES] + arguments),
            stream_stderr=True,
            trailing_output=trailing_output
        )

    def close(self):
        del self._processor

    def _extract_alignment(self, alignment_string):
        """
        Transforms a word alignment string into an easily
            accessible dictionary {source: [target, ...], ...}
        @param alignment_string the exact string returned by Moses
            that contains alignment information
        """
        alignments = defaultdict(list)

        for alignment in alignment_string.strip().split(" "):
            source, target = [int(string) for string in alignment.split("-")]
            alignments[source].append(target)

        return alignments
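        # For illustration (not part of the original source): Moses reports
        # alignments as space-separated "source-target" index pairs, so the
        # string "0-0 1-2 2-1" is parsed into {0: [0], 1: [2], 2: [1]},
        # meaning e.g. source token 1 is aligned to target token 2.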

    def _separate_tokens_from_segmentation(self, translation):
        """
        Transforms phrase segmentation strings into an easily
            accessible dictionary:
            {(source start, source end): (target start, target end), ...}
        @param translation a translation string returned by Moses that does not
            contain word alignments anymore, but phrase segmentation is still
            interspersed
        """
        tokens = []
        segmentation = {}
        current_phrase_indexes = []
        current_index = 0

        for string in translation.split(" "):
            if '|' in string:
                current_segmentation = string.replace('|', '').split("-")
                if len(current_phrase_indexes) == 1:
                    current_phrase_indexes.append(current_phrase_indexes[0]) # duplicate single index

                current_key = tuple(int(index) for index in current_segmentation)
                segmentation[current_key] = tuple(int(index) for index in current_phrase_indexes)

                current_phrase_indexes = []
            else:
                if len(current_phrase_indexes) >= 2:
                    current_phrase_indexes.pop()
                current_phrase_indexes.append(str(current_index))
                tokens.append(string)
                current_index += 1

        return tokens, segmentation
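        # For illustration (hypothetical Moses output, not part of the
        # original source): with -report-segmentation enabled, an output like
        #   "this is |0-1| a house |2-3|"
        # is separated into
        #   tokens       = ['this', 'is', 'a', 'house']
        #   segmentation = {(0, 1): (0, 1), (2, 3): (2, 3)}
        # where keys are source token spans and values are target token spans.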

    def _untangle_translation(self, translation):
        """
        Separates the actual translation from reported segmentation
            and word alignments. Slightly changes the segmentation info
            by adding information about the source tokens.
        @param translation the exact string returned by a Moses engine
        """
        if self._report_alignment:
            alignment = []
            parts = translation.split('|||')
            translation = parts[0].strip() # update translation to remove alignment info

            alignment = self._extract_alignment(parts[1])

        if self._report_segmentation:
            tokens, segmentation = self._separate_tokens_from_segmentation(translation)
            translation = " ".join(tokens) # update translation to only contain actual tokens

        return (
            translation,
            alignment if self._report_alignment else None,
            segmentation if self._report_segmentation else None
        )

    def translate_segment(self, segment):
        """
        Translates a single input segment, @param segment.

        @return a TranslatedSegment object with a translation and,
        optionally, alignments and/or segmentation info
        """
        translation = self._processor.process(segment)
        translation, alignment, segmentation = self._untangle_translation(translation)

        return TranslatedSegment(
            translated_segment=translation,
            alignment=alignment,
            segmentation=segmentation
        )

    def translate_file(self, input_path, output_path):
        """
        Translates an entire file.

        @param input_path path to temp file with preprocessed input segments
        @param output_path path to temp file where raw translations should be written
        """

        raise NotImplementedError
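A minimal usage sketch for the engine, assuming a trained Moses engine's moses.ini exists at the placeholder path, that TranslatedSegment exposes its constructor arguments as attributes, and that the import path is correct (all three are assumptions):

from mtrain.engine import EngineMoses  # module path is an assumption

engine = EngineMoses(
    '/path/to/moses.ini',  # placeholder path
    report_alignment=True,
    report_segmentation=True,
)
result = engine.translate_segment('this is a house')
# attribute names follow the keyword arguments passed to TranslatedSegment above
print(result.translated_segment)
print(result.alignment)     # {source index: [target indexes], ...}
print(result.segmentation)  # {(source span): (target span), ...}
engine.close()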