class Detokenizer(object):
    """
    Creates a detokenizer which detokenizes lists of tokens on-the-fly, i.e.,
    allowing interaction with a Moses detokenizer process kept in memory.
    """

    def __init__(self, lang_code, uppercase_first_letter=False):
        """
        @param lang_code language identifier
        @param uppercase_first_letter whether or not to uppercase the first
            letter in the detokenized output
        """
        arguments = [
            '-l %s' % lang_code,
            '-b',  # disable Perl buffering
            '-q',  # don't report version
        ]
        if uppercase_first_letter:
            arguments.append('-u')
        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_DETOKENIZER] + arguments),
            stream_stderr=True
        )

    def close(self):
        del self._processor

    def detokenize(self, tokens):
        """
        Detokenizes a list of @param tokens into a segment.
        """
        return self._processor.process(" ".join(tokens))
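# A minimal usage sketch (input and output values are illustrative; assumes
# C.MOSES_DETOKENIZER points to Moses' detokenizer.perl):
#
#     detokenizer = Detokenizer('en', uppercase_first_letter=True)
#     segment = detokenizer.detokenize(['hello', ',', 'world', '!'])
#     # roughly: 'Hello, world!'
#     detokenizer.close()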
class Recaser(object):
    """
    Creates a recaser which recases sentences on-the-fly, i.e., allowing
    interaction with a Moses recaser engine kept in memory.
    """

    def __init__(self, path_moses_ini):
        """
        @param path_moses_ini path to the recasing engine's Moses configuration file
        """
        arguments = [
            '-f %s' % path_moses_ini,
            '-dl 0',           # monotone decoding (no distortion)
            '-minphr-memory',  # compact phrase table
            '-v 0',            # as quiet as possible
        ]
        self._processor = ExternalProcessor(command=" ".join([MOSES] + arguments))

    def close(self):
        del self._processor

    def recase(self, segment):
        """
        Recases a single @param segment.
        """
        return self._processor.process(segment)

    def recase_tokens(self, tokens):
        """
        Recases a list of @param tokens.
        """
        return self.recase(" ".join(tokens)).split(" ")
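# Usage sketch (the path below is hypothetical; a recasing engine is a small
# Moses model that translates lowercased text into cased text):
#
#     recaser = Recaser('/path/to/recaser/moses.ini')
#     recaser.recase('this sentence mentions paris .')
#     recaser.close()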
class Normalizer(object):
    """
    Creates a normalizer for processing segment by segment, allowing
    interaction with a normalizer process kept in memory.
    """

    def __init__(self, lang_code):
        """
        @param lang_code language identifier
        """
        arguments = [
            '-l %s' % lang_code,
            '-b',  # disable Perl buffering
            '-q',  # don't report version
        ]
        # no aggressive mode '-a' for normalizer
        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_NORMALIZER] + arguments)
        )

    def close(self):
        del self._processor

    def normalize_punctuation(self, segment):
        """
        Normalizes punctuation characters of a single @param segment.
        """
        normalized_segment = self._processor.process(segment)
        return normalized_segment
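# Usage sketch (assumes C.MOSES_NORMALIZER points to Moses'
# normalize-punctuation.perl, which e.g. maps typographic quotation marks to
# plain ASCII ones):
#
#     normalizer = Normalizer('en')
#     normalizer.normalize_punctuation('«Hello»')  # roughly: '"Hello"'
#     normalizer.close()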
class Detruecaser(object):
    """
    Creates a detruecaser which detruecases sentences on-the-fly, i.e.,
    allowing interaction with a Moses detruecaser process kept in memory.
    """

    def __init__(self):
        """
        The detruecaser is a plain script; no model training is required.
        """
        arguments = [
            '-b'  # disable Perl buffering
        ]
        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_DETRUECASER] + arguments)
        )

    def close(self):
        del self._processor

    def detruecase_segment(self, segment):
        """
        Detruecases a single @param segment.
        """
        return self._processor.process(segment)

    def detruecase_tokens(self, tokens):
        """
        Detruecases a list of @param tokens.
        """
        detruecased_segment = self.detruecase_segment(" ".join(tokens))
        return detruecased_segment.split(" ")
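# Usage sketch (assumes C.MOSES_DETRUECASER points to Moses' detruecase.perl,
# which uppercases sentence-initial characters):
#
#     detruecaser = Detruecaser()
#     detruecaser.detruecase_segment('the meeting was short .')
#     # roughly: 'The meeting was short .'
#     detruecaser.close()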
class Tokenizer(object):
    """
    Creates a tokenizer which tokenizes sentences on-the-fly, i.e., allowing
    interaction with a Moses tokenizer process kept in memory.
    """

    def __init__(self, lang_code, protect=False, protected_patterns_path=None, escape=True):
        """
        @param lang_code language identifier
        @param protect whether the tokenizer should respect patterns that
            should not be tokenized
        @param protected_patterns_path path to file with protected patterns
        @param escape whether characters that break the Moses decoder should
            be escaped
        """
        arguments = [
            '-l %s' % lang_code,
            '-b',  # disable Perl buffering
            '-q',  # don't report version
            '-a',  # aggressive mode
        ]
        if protect:
            arguments.append(
                '-protected %s' % protected_patterns_path  # protect e.g. inline XML, URLs and email addresses
            )
        if not escape:
            arguments.append(
                '-no-escape'  # do not escape reserved characters in Moses
            )
        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_TOKENIZER] + arguments)
        )

    def close(self):
        del self._processor

    def tokenize(self, segment, split=True):
        """
        Tokenizes a single @param segment.
        @param split whether the tokenized segment should be split on spaces
        """
        tokenized_segment = self._processor.process(segment)
        if split:
            return tokenized_segment.split(" ")
        return tokenized_segment
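# Usage sketch (assumes C.MOSES_TOKENIZER points to Moses' tokenizer.perl;
# the pattern file path is hypothetical and would contain a regex matching
# URLs so they survive tokenization intact):
#
#     tokenizer = Tokenizer('en', protect=True,
#                           protected_patterns_path='/path/to/patterns')
#     tokenizer.tokenize('Visit http://example.com, please!')
#     # roughly: ['Visit', 'http://example.com', ',', 'please', '!']
#     tokenizer.close()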
class BytePairEncoderSegment(object):
    """
    Applies a trained BPE model to individual segments.
    """

    def __init__(self, bpe_model_path, vocab_path=None):
        """
        @param bpe_model_path full path to BPE model
        @param vocab_path optional path to vocabulary file
        """
        arguments = [
            '-c %s' % bpe_model_path
        ]
        if vocab_path is not None:
            arguments.extend([
                '--vocabulary %s' % vocab_path,
                '--vocabulary-threshold %d' % C.BPE_VOCAB_THRESHOLD
            ])
        # the subword script apply_bpe.py needs to be run in a Python 3
        # environment, a constant is used to avoid version problems
        self._processor = ExternalProcessor(
            command=" ".join([C.PYTHON3] + [C.SUBWORD_NMT_APPLY] + arguments),
            stream_stderr=False,
            trailing_output=False,
            shell=False
        )

    def close(self):
        """
        Deletes reference to obsolete objects.
        """
        del self._processor

    def encode_segment(self, segment):
        """
        Encodes a single @param segment by applying a trained BPE model.
        """
        encoded_segment = self._processor.process(segment)
        return encoded_segment
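# Usage sketch (the model path is hypothetical; assumes C.SUBWORD_NMT_APPLY
# points to subword-nmt's apply_bpe.py and C.PYTHON3 to a Python 3
# interpreter):
#
#     encoder = BytePairEncoderSegment('/path/to/model.bpe')
#     encoder.encode_segment('unrelated words')
#     # e.g. 'un@@ related words', depending on the learned merge operations
#     encoder.close()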
class Truecaser(object):
    """
    Creates a truecaser which truecases sentences on-the-fly, i.e., allowing
    interaction with a Moses truecaser process kept in memory.
    """

    def __init__(self, path_model):
        """
        @param path_model path to truecasing model trained in `mtrain`
        """
        arguments = [
            '-model %s' % path_model,
            '-b'  # disable Perl buffering
        ]
        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES_TRUECASER] + arguments)
        )

    def close(self):
        """
        Deletes object to free up memory.
        """
        del self._processor

    def truecase_segment(self, segment):
        """
        Truecases a single @param segment.
        """
        return self._processor.process(segment)

    def truecase_tokens(self, tokens, split=True):
        """
        Truecases a list of @param tokens.
        @param split whether the truecased segment should be split on spaces
        """
        truecased_string = self.truecase_segment(" ".join(tokens))
        if split:
            return truecased_string.split(" ")
        return truecased_string
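# Usage sketch (the model path is hypothetical; assumes C.MOSES_TRUECASER
# points to Moses' truecase.perl):
#
#     truecaser = Truecaser('/path/to/truecasing/model')
#     truecaser.truecase_segment('paris is the capital of france .')
#     # roughly: 'Paris is the capital of France .', given a suitable model
#     truecaser.close()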
class EngineMoses(object):
    """
    Starts a translation engine process for the Moses backend and keeps it
    running.
    """

    def __init__(self, path_moses_ini, report_alignment=False, report_segmentation=False):
        """
        @param path_moses_ini path to Moses configuration file
        @param report_alignment whether Moses should report word alignments
        @param report_segmentation whether Moses should report how the
            translation is made up of phrases
        """
        self._path_moses_ini = path_moses_ini
        self._report_alignment = report_alignment
        self._report_segmentation = report_segmentation

        arguments = [
            '-f %s' % self._path_moses_ini,
            '-minphr-memory',        # compact phrase table
            '-minlexr-memory',       # compact reordering table
            '-v 0',                  # as quiet as possible
            '-xml-input constraint'  # allow forced translations and zones
        ]
        trailing_output = False
        if self._report_alignment:
            arguments.append('-print-alignment-info')
            trailing_output = True
        if self._report_segmentation:
            arguments.append('-report-segmentation')

        self._processor = ExternalProcessor(
            command=" ".join([C.MOSES] + arguments),
            stream_stderr=True,
            trailing_output=trailing_output
        )

    def close(self):
        del self._processor

    def _extract_alignment(self, alignment_string):
        """
        Transforms a word alignment string into an easily accessible
        dictionary {source: [target, ...], ...}.
        @param alignment_string the exact string returned by Moses that
            contains alignment information
        """
        alignments = defaultdict(list)
        for alignment in alignment_string.strip().split(" "):
            source, target = [int(string) for string in alignment.split("-")]
            alignments[source].append(target)
        return alignments

    def _separate_tokens_from_segmentation(self, translation):
        """
        Transforms phrase segmentation strings into an easily accessible
        dictionary: {(source start, source end): (target start, target end), ...}
        @param translation a translation string returned by Moses that no
            longer contains word alignments, but in which phrase segmentation
            is still interspersed
        """
        tokens = []
        segmentation = {}
        current_phrase_indexes = []
        current_index = 0
        for string in translation.split(" "):
            if '|' in string:
                current_segmentation = string.replace('|', '').split("-")
                if len(current_phrase_indexes) == 1:
                    current_phrase_indexes.append(current_phrase_indexes[0])  # duplicate single index
                current_key = tuple(int(index) for index in current_segmentation)
                segmentation[current_key] = tuple(int(index) for index in current_phrase_indexes)
                current_phrase_indexes = []
            else:
                if len(current_phrase_indexes) >= 2:
                    current_phrase_indexes.pop()
                current_phrase_indexes.append(str(current_index))
                tokens.append(string)
                current_index += 1
        return tokens, segmentation

    def _untangle_translation(self, translation):
        """
        Separates the actual translation from reported segmentation and word
        alignments. Slightly changes the segmentation info by adding
        information about the source tokens.
        @param translation the exact string returned by a Moses engine
        """
        if self._report_alignment:
            parts = translation.split('|||')
            translation = parts[0].strip()  # update translation to remove alignment info
            alignment = self._extract_alignment(parts[1])
        if self._report_segmentation:
            tokens, segmentation = self._separate_tokens_from_segmentation(translation)
            translation = " ".join(tokens)  # update translation to only contain actual tokens
        return (
            translation,
            alignment if self._report_alignment else None,
            segmentation if self._report_segmentation else None
        )

    def translate_segment(self, segment):
        """
        Translates a single input segment, @param segment.
        @return a TranslatedSegment object with a translation and, optionally,
            alignments and/or segmentation info
        """
        translation = self._processor.process(segment)
        translation, alignment, segmentation = self._untangle_translation(translation)
        return TranslatedSegment(
            translated_segment=translation,
            alignment=alignment,
            segmentation=segmentation
        )

    def translate_file(self, input_path, output_path):
        """
        Translates an entire file.
        @param input_path path to temp file with preprocessed input segments
        @param output_path path to temp file where raw translations should be
            written
        """
        raise NotImplementedError
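# Usage sketch (the moses.ini path is hypothetical, and it is assumed that
# TranslatedSegment exposes its constructor arguments as attributes). With
# both report flags enabled, the alignment is a {source: [target, ...]} dict
# and the segmentation maps source spans to target spans, as built by
# _extract_alignment and _separate_tokens_from_segmentation above:
#
#     engine = EngineMoses('/path/to/moses.ini',
#                          report_alignment=True,
#                          report_segmentation=True)
#     result = engine.translate_segment('ein Haus')
#     result.translated_segment, result.alignment, result.segmentation
#     engine.close()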