class MosesText(markovify.NewlineText):
    mt = sacremoses.MosesTokenizer()
    md = sacremoses.MosesDetokenizer()

    def word_join(self, words):
        return self.md.detokenize(words)

    def word_split(self, text):
        return self.mt.tokenize(text)
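# A minimal usage sketch for MosesText, assuming a hypothetical corpus.txt
# of newline-delimited sentences. Because word_join() uses the Moses
# detokenizer, generated text comes out with natural punctuation spacing
# ("Hello, world!" rather than "Hello , world !").
with open("corpus.txt") as f:
    model = MosesText(f.read(), state_size=2)

# make_sentence() may return None if no valid sentence can be built.
print(model.make_sentence())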
def __init__(self, config_file):
    with open(config_file) as f:
        self.__dict__.update(yaml.safe_load(f))
    assert self.type in {"cn2en", "en2cn"}

    codes = codecs.open(self.codes_file, encoding='utf-8')
    cur_path = os.path.dirname(os.path.realpath(__file__))
    self.tokenizer = BPE(codes)

    if self.type == "en2cn":
        # pre_process: normalize, tokenize, subEntity, to_lower, bpe
        # post_process: delbpe, remove_space
        self.en_tokenizer = os.path.join(cur_path, self.en_tokenizer)
        self.en_normalize_punctuation = sacremoses.MosesPunctNormalizer(lang="en")
        self.en_tokenizer = sacremoses.MosesTokenizer(
            lang='en', custom_nonbreaking_prefixes_file=self.en_tokenizer)
    elif self.type == "cn2en":
        # pre_process: tokenize, bpe
        # post_process: delbpe, detruecase, detokenize
        self.detruecase = sacremoses.MosesDetruecaser()
        self.detokenize = sacremoses.MosesDetokenizer(lang='en')

    self.client = aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=3600),
        connector=aiohttp.TCPConnector(limit=sys.maxsize,
                                       limit_per_host=sys.maxsize))
    self.cn2en_trans_dict = slang_dict(self.trans_dict_file)
    self.chinese_char_pattern = re.compile(u"[\u4E00-\u9FA5]+")
    self.stops = re.compile(u"[.!?!?。。]+")
def __init__(self, vocab_address=None, bpe_code_address=None,
             src_en='en', tgt_de='de', vocab_pad=8, isolator='@@'):
    """
    Constructor for the Tokenizer class.

    Args:
        vocab_address: path to the vocabulary file.
        bpe_code_address: path to the file with BPE codes.
        src_en: source language code, used by the Moses tokenizer.
        tgt_de: target language code, used by the Moses detokenizer.
        vocab_pad: pads vocabulary to a multiple of 'vocab_pad' tokens.
        isolator: BPE subword isolator.
    """
    self.padding_index = 0
    self.unk_index = 1
    self.bos_index = 2
    self.eos_index = 3
    self.pad_word = '<pad>'
    self.unk_word = '<unk>'
    self.bos_word = '<s>'
    self.eos_word = r'<\s>'
    self.isolator = isolator

    self.init_bpe(bpe_code_address)
    self.vocab_establist(vocab_address, vocab_pad)

    self.sacremoses_tokenizer = sacremoses.MosesTokenizer(src_en)
    self.sacremoses_detokenizer = sacremoses.MosesDetokenizer(tgt_de)
def init_word_detokenizers(main, lang):
    if lang not in ['zho_cn', 'zho_tw', 'jpn', 'tha', 'bod']:
        # Sacremoses
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(
            main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_detokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_detokenizer_{lang}'] = sacremoses.MosesDetokenizer(
                lang=lang_sacremoses)
def postprocess(outputs, truecaser=None, tokenizer=None):
    if truecaser:
        tr = sacremoses.MosesDetruecaser()
        outputs = [tr.detruecase(hyp, return_str=True) for hyp in outputs]
    if tokenizer:
        tk = sacremoses.MosesDetokenizer(tokenizer)
        outputs = [
            tk.detokenize(hyp.split(), return_str=True) for hyp in outputs
        ]
    return outputs
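# A minimal sketch of calling postprocess, assuming hypothetical model
# outputs; `tokenizer` takes a Moses language code such as 'en'.
hyps = ["the cat sat on the mat .", "hello , world !"]
print(postprocess(hyps, truecaser=True, tokenizer='en'))
# e.g. ['The cat sat on the mat.', 'Hello, world!']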
def __init__(self, lang: str = 'en', vocab: Optional[Vocab] = None):
    self._lang = lang
    self._vocab = vocab
    if lang == 'zh':
        warnings.warn(
            'You may not want to use MosesTokenizer for Chinese sentences because it is '
            'not accurate. Try to use JiebaTokenizer instead. You may also tokenize the '
            'Chinese sentence into characters and learn a BPE.')
    self._tokenizer = sacremoses.MosesTokenizer(lang=lang)
    self._detokenizer = sacremoses.MosesDetokenizer(lang=lang)
    # Warm up the tokenizer so its regexes are compiled up front.
    # This boosts performance on macOS; for benchmarking results, see
    # https://gist.github.com/sxjscience/f59d2b88262fefd4fb08565c9dec6099
    self._warmup()
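# A minimal round-trip sketch with sacremoses directly (the wrapper class
# above is not named in this snippet, so the underlying objects are used).
mt = sacremoses.MosesTokenizer(lang='en')
md = sacremoses.MosesDetokenizer(lang='en')

# escape=False keeps the apostrophe literal instead of HTML-escaping it.
tokens = mt.tokenize("Hello, world! It's a test.", escape=False)
# e.g. ['Hello', ',', 'world', '!', 'It', "'s", 'a', 'test', '.']
print(md.detokenize(tokens))
# e.g. "Hello, world! It's a test."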
def main(): parser = argparse.ArgumentParser(description="") parser.add_argument("files", nargs="*", help="input files") args = parser.parse_args() detok = sacremoses.MosesDetokenizer() for line in fileinput.input(args.files, openhook=fileinput.hook_compressed): print( detok.detokenize(line.strip().split(" ")).replace( " @", "").replace("@ ", "").replace(" =", "=").replace("= ", "=").replace(" – ", "–"))
def calc_detokenized_bleu(hyp, refs, tgt_lang="fr", unescape=False):
    """Calculate detokenized BLEU from raw (tokenized) texts.

    The input tokenized texts are detokenized using `sacremoses`.

    References:
    https://github.com/pytorch/fairseq/blob/409032596bd80240f7fbc833b5d37485dee85b0e/fairseq/tasks/translation.py#L414
    https://github.com/pytorch/fairseq/blob/409032596bd80240f7fbc833b5d37485dee85b0e/fairseq_cli/score.py#L79

    Parameters
    ----------
    hyp : list of str
        A list containing one hypothesis for each source sentence.
    refs : list of list of str
        A list of lists of candidate reference translations.
    tgt_lang : str
        Target language, used to select the detokenizer.
    unescape : bool
        Set this to True if the training data was tokenized using `moses`
        with escaping, e.g., "'" gets turned into "&apos;". Defaults to
        False since the data in this project is preprocessed with
        `--no-escape`.

    Returns
    -------
    sacrebleu.metrics.bleu.BLEUScore
        Detokenized BLEU score.
    """
    # Check that every reference stream matches the hypothesis count
    for ref in refs:
        assert len(ref) == len(hyp), ("Number of sentences in hypothesis and "
                                      f"reference does not match: {len(hyp)} "
                                      f"and {len(ref)}")

    # Make sure unknown words are escaped
    hyp = [hyp_sent.replace("<unk>", "<unk_hyp>") for hyp_sent in hyp]

    # Detokenize
    detokenizer = sacremoses.MosesDetokenizer(lang=tgt_lang)
    hyp = [detokenizer.detokenize(hyp_sent.split()) for hyp_sent in hyp]
    refs_detok = []
    for ref in refs:
        ref = [
            detokenizer.detokenize(ref_sent.split(), unescape=unescape)
            for ref_sent in ref
        ]
        refs_detok.append(ref)

    return sacrebleu.corpus_bleu(hyp, refs_detok, tokenize="none")
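# A minimal sketch of calc_detokenized_bleu on toy data: hypothetical
# sentences, with a single reference stream holding one reference per
# hypothesis.
hyp = ["the cat sat on the mat .", "hello , world !"]
refs = [["the cat sat on the mat .", "hello , world !"]]
score = calc_detokenized_bleu(hyp, refs, tgt_lang="en")
print(score.score)  # 100.0 for an exact match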
def __init__(self):
    self.question_generator = xlingqg.QuestionGenerator()
    self.translator = xlingqg.Translator()
    self.tokenizer = sacremoses.MosesTokenizer()
    self.detokenizer = sacremoses.MosesDetokenizer()
    self.answer_encoder = AnswerEncoder()
def __init__(self,
             special=None,
             min_freq=0,
             max_size=None,
             lower_case=False,
             delimiter=None,
             vocab_file=None,
             pretrained_vocab_file: str = None,
             never_split=None,
             unk_token="<unk>",
             eos_token="<eos>",
             additional_special_tokens=["<formula>"],
             language="en",
             **kwargs):
    super().__init__(unk_token=unk_token,
                     eos_token=eos_token,
                     additional_special_tokens=additional_special_tokens,
                     **kwargs)

    if never_split is None:
        never_split = self.all_special_tokens
    if special is None:
        special = []

    self.counter = Counter()
    self.special = special
    self.min_freq = min_freq
    self.max_size = max_size
    self.lower_case = lower_case
    self.delimiter = delimiter
    self.vocab_file = vocab_file
    self.never_split = never_split
    self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
    self.punction_without_space_before_pattern = re.compile(
        r"[^\s][{}]".format(self.punctuation_symbols))
    self.punctuation_with_space_around_pattern = \
        self._compile_space_around_punctuation_pattern()
    self.language = language
    self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
    self.moses_tokenizer = sm.MosesTokenizer(language)
    self.moses_detokenizer = sm.MosesDetokenizer(language)

    # This try... except... is not pretty, but honestly this tokenizer was
    # not made to be used in a library like ours, at all.
    try:
        vocab_dict = None
        if pretrained_vocab_file is not None:
            # Prefer pickle files (supports PyTorch and TF)
            with open(pretrained_vocab_file, "rb") as f:
                vocab_dict = pickle.load(f)

            # Loading a torch-saved transfo-xl vocab dict with pickle results
            # in an integer. Reaching this branch means pickle failed to read
            # the torch-saved file, so load it with torch if it's available.
            if type(vocab_dict) == int:
                if not is_torch_available():
                    raise ImportError(
                        "Not trying to load dict with PyTorch as you need to install pytorch to load "
                        "from a PyTorch pretrained vocabulary, "
                        "or activate it with environment variables USE_TORCH=1 and USE_TF=0.")
                vocab_dict = torch.load(pretrained_vocab_file)

        if vocab_dict is not None:
            for key, value in vocab_dict.items():
                if key not in self.__dict__:
                    self.__dict__[key] = value
        elif vocab_file is not None:
            self.build_vocab()
    except Exception as e:
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizerFast, "
            "please note they are not compatible.".format(pretrained_vocab_file)) from e

    if vocab_file is not None:
        self.build_vocab()
def moses_detokenize(self, tokens, lang):
    # Lazily create and cache one MosesDetokenizer per language.
    if lang not in self.cache_moses_detokenizer:
        self.cache_moses_detokenizer[lang] = sm.MosesDetokenizer(lang=lang)
    return self.cache_moses_detokenizer[lang].detokenize(tokens)
def wordless_word_detokenize(main, tokens, lang, word_detokenizer='default'):
    sentence_start = 0
    sentences = []
    text = ''

    if lang not in main.settings_global['word_detokenizers']:
        lang = 'other'

    if word_detokenizer == 'default':
        word_detokenizer = main.settings_custom['word_detokenization']['word_detokenizers'][lang]

    for i, token in enumerate(tokens):
        if type(token) == wordless_text.Wordless_Token and token.sentence_ending:
            sentences.append(tokens[sentence_start:i + 1])
            sentence_start = i + 1
        elif i == len(tokens) - 1:
            sentences.append(tokens[sentence_start:])

    # English & Other Languages
    if word_detokenizer == main.tr('NLTK - Penn Treebank Detokenizer'):
        treebank_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()

        for sentence in sentences:
            text += treebank_detokenizer.detokenize(sentence)
    elif word_detokenizer == main.tr('Sacremoses - Moses Detokenizer'):
        moses_detokenizer = sacremoses.MosesDetokenizer(
            lang=wordless_conversion.to_iso_639_1(main, lang))

        for sentence in sentences:
            text += moses_detokenizer.detokenize(sentence)
    # Chinese
    elif word_detokenizer == main.tr('Wordless - Chinese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i >= non_cjk_start:
                if (wordless_checking_unicode.has_han(token) or
                        all(map(str.isnumeric, token))):
                    text += token
                    non_cjk_start += 1
                else:
                    # English
                    if wordless_checking_unicode.is_eng_token(token):
                        for j, token in enumerate(tokens[i:]):
                            if (i + j + 1 == len(tokens) or
                                    not wordless_checking_unicode.is_eng_token(tokens[i + j + 1])):
                                text += wordless_word_detokenize(
                                    main, tokens[non_cjk_start:i + j + 1], lang='eng')
                                non_cjk_start = i + j + 1
                                break
                    # Other Languages
                    else:
                        for j, token in enumerate(tokens[i:]):
                            if (i + j + 1 == len(tokens) or
                                    wordless_checking_unicode.has_han(tokens[i + j + 1])):
                                text += wordless_word_detokenize(
                                    main, tokens[non_cjk_start:i + j + 1], lang='other')
                                non_cjk_start = i + j + 1
                                break
    # Japanese
    elif word_detokenizer == main.tr('Wordless - Japanese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i < non_cjk_start:
                continue

            if (wordless_checking_unicode.has_han(token) or
                    wordless_checking_unicode.has_kana(token) or
                    all(map(str.isnumeric, token))):
                text += token
                non_cjk_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                not wordless_checking_unicode.is_eng_token(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_cjk_start:i + j + 1], lang='eng')
                            non_cjk_start = i + j + 1
                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                wordless_checking_unicode.has_han(tokens[i + j + 1]) or
                                wordless_checking_unicode.has_kana(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_cjk_start:i + j + 1], lang='other')
                            non_cjk_start = i + j + 1
                            break
    # Thai
    elif word_detokenizer == main.tr('Wordless - Thai Word Detokenizer'):
        non_thai_start = 0

        for i, token in enumerate(tokens):
            if i < non_thai_start:
                continue

            if wordless_checking_unicode.has_thai(token):
                if type(token) == wordless_text.Wordless_Token:
                    text += token + token.boundary
                else:
                    text += token
                non_thai_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                not wordless_checking_unicode.is_eng_token(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_thai_start:i + j + 1], lang='eng')
                            non_thai_start = i + j + 1
                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                wordless_checking_unicode.has_thai(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_thai_start:i + j + 1], lang='other')
                            non_thai_start = i + j + 1
                            break
    # Tibetan
    elif word_detokenizer == main.tr('Wordless - Tibetan Word Detokenizer'):
        non_tibetan_start = 0

        for i, token in enumerate(tokens):
            if i < non_tibetan_start:
                continue

            if wordless_checking_unicode.has_tibetan(token):
                # Check for Tibetan Mark Shad
                # See: https://w3c.github.io/tlreq/#section_breaks
                if i > 0 and token[0] == '།':
                    text += token
                else:
                    text += token
                non_tibetan_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                not wordless_checking_unicode.is_eng_token(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_tibetan_start:i + j + 1], lang='eng')
                            non_tibetan_start = i + j + 1
                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                wordless_checking_unicode.has_tibetan(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_tibetan_start:i + j + 1], lang='other')
                            non_tibetan_start = i + j + 1
                            break

    return re.sub(r'\s{2,}', ' ', text)
def __init__(self, lang):
    self.detokenizer = sacremoses.MosesDetokenizer(lang)
def __init__(self,
             special=None,
             min_freq=0,
             max_size=None,
             lower_case=False,
             delimiter=None,
             vocab_file=None,
             pretrained_vocab_file=None,
             never_split=None,
             unk_token="<unk>",
             eos_token="<eos>",
             additional_special_tokens=["<formula>"],
             language="en",
             **kwargs):
    super().__init__(unk_token=unk_token,
                     eos_token=eos_token,
                     additional_special_tokens=additional_special_tokens,
                     **kwargs)

    if never_split is None:
        never_split = self.all_special_tokens
    if special is None:
        special = []

    self.counter = Counter()
    self.special = special
    self.min_freq = min_freq
    self.max_size = max_size
    self.lower_case = lower_case
    self.delimiter = delimiter
    self.vocab_file = vocab_file
    self.never_split = never_split
    self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
    self.punction_without_space_before_pattern = re.compile(
        r"[^\s][{}]".format(self.punctuation_symbols))
    self.punctuation_with_space_around_pattern = \
        self._compile_space_around_punctuation_pattern()
    self.language = language
    self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
    self.moses_tokenizer = sm.MosesTokenizer(language)
    self.moses_detokenizer = sm.MosesDetokenizer(language)

    try:
        if pretrained_vocab_file is not None:
            # Hack because, honestly, this tokenizer was not made to be used
            # in a library like ours, at all.
            vocab_dict = torch.load(pretrained_vocab_file)
            for key, value in vocab_dict.items():
                if key not in self.__dict__:
                    self.__dict__[key] = value

        if vocab_file is not None:
            self.build_vocab()
    except Exception:
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizerFast, "
            "please note they are not compatible.".format(pretrained_vocab_file))

    if vocab_file is not None:
        self.build_vocab()
def __init__(
    self,
    special=None,
    min_freq=0,
    max_size=None,
    lower_case=False,
    delimiter=None,
    vocab_file=None,
    pretrained_vocab_file=None,
    never_split=None,
    unk="<unk>",
    eos="<eos>",
    additional_special_tokens=["<formula>"],
    language="en",
    **kw,
):
    super().__init__(
        special=special,
        min_freq=min_freq,
        max_size=max_size,
        lower_case=lower_case,
        delimiter=delimiter,
        vocab_file=vocab_file,
        pretrained_vocab_file=pretrained_vocab_file,
        never_split=never_split,
        unk=unk,
        eos=eos,
        additional_special_tokens=additional_special_tokens,
        language=language,
        **kw,
    )
    if never_split is None:
        never_split = self.all_special_tokens
    if special is None:
        special = []
    self.counter = Counter()
    self.special = special
    self.min_freq = min_freq
    self.max_size = max_size
    self.lower_case = lower_case
    self.delimiter = delimiter
    self.vocab_file = vocab_file
    self.never_split = never_split
    self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
    self.punction_without_space_before_pattern = re.compile(
        rf"[^\s][{self.punctuation_symbols}]"
    )
    self.punctuation_with_space_around_pattern = (
        self._compile_space_around_punctuation_pattern()
    )
    self.language = language
    self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
    self.moses_tokenizer = sm.MosesTokenizer(language)
    self.moses_detokenizer = sm.MosesDetokenizer(language)
    try:
        vocab_dict = None
        if pretrained_vocab_file is not None:
            with open(pretrained_vocab_file, "rb") as f:
                vocab_dict = pickle.load(f)
            if type(vocab_dict) == int:
                if not is_torch_available():
                    raise ImportError(
                        "Not trying to load dict with PyTorch as you need to install pytorch to load "
                        "from a PyTorch pretrained vocabulary, "
                        "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
                    )
                vocab_dict = torch.load(pretrained_vocab_file)
        if vocab_dict is not None:
            for key, value in vocab_dict.items():
                if key not in self.__dict__:
                    self.__dict__[key] = value
        elif vocab_file is not None:
            self.build_vocab()
    except Exception as e:
        raise ValueError(
            f"Unable to parse file {pretrained_vocab_file}. Unknown format. "
            "If you tried to load a model saved through TokenizerFast, "
            "please note they are not compatible."
        ) from e
    if vocab_file is not None:
        self.build_vocab()
def init_moses(self, lang):
    self.moses_tokenizer = sacremoses.MosesTokenizer(lang['src'])
    self.moses_detokenizer = sacremoses.MosesDetokenizer(lang['tgt'])