def _setup_normalizer(self):
    try:
        from mosestokenizer import MosesPunctuationNormalizer

        self.punc_normalizer = MosesPunctuationNormalizer(self.source_lang)
    except ImportError:
        # fall back to a no-op normalizer when mosestokenizer is unavailable
        warnings.warn("Recommended: pip install mosestokenizer")
        self.punc_normalizer = lambda x: x
def normalize_punct(text, lang):
    """
    Normalize punctuation of a given string

    Punctuation normalizer:
    https://bitbucket.org/luismsgomes/mosestokenizer
    """
    from mosestokenizer import MosesPunctuationNormalizer

    with MosesPunctuationNormalizer(lang) as normalize:
        return normalize(text)
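A minimal usage sketch of normalize_punct (assuming mosestokenizer is installed; the sample string and the shown output are illustrative, since the exact result depends on Moses' normalize-punctuation rules):

sample = 'Hello , «world» ...'
print(normalize_punct(sample, 'en'))
# illustrative output: Hello, "world" ...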
def __init__(self, lm_type: LMType, language: str):
    """
    lm_type: LMType
    language: language code
    """
    self.language = language
    self.tokenizer = MosesTokenizer(self.language)
    self.normalizer = MosesPunctuationNormalizer(self.language)
    self.splitter = MosesSentenceSplitter(self.language, more=False)
    self.type = lm_type
def __init__(
    self,
    vocab=None,
    source_spm=None,
    target_spm=None,
    source_lang=None,
    target_lang=None,
    unk_token="<unk>",
    eos_token="</s>",
    pad_token="<pad>",
    max_len=512,
):
    super().__init__(
        # bos_token=bos_token,
        max_len=max_len,
        eos_token=eos_token,
        unk_token=unk_token,
        pad_token=pad_token,
    )
    self.encoder = load_json(vocab)
    if self.unk_token not in self.encoder:
        raise KeyError("<unk> token must be in vocab")
    assert self.pad_token in self.encoder
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.source_lang = source_lang
    self.target_lang = target_lang

    # load SentencePiece model for pre-processing
    self.paths = {}
    self.spm_source = sentencepiece.SentencePieceProcessor()
    self.spm_source.Load(source_spm)
    self.spm_target = sentencepiece.SentencePieceProcessor()
    self.spm_target.Load(target_spm)

    # Multilingual target side: default to using first supported language code.
    self.supported_language_codes: list = [
        k for k in self.encoder if k.startswith(">>") and k.endswith("<<")
    ]
    self.tgt_lang_id = None  # will not be used unless it is set through prepare_translation_batch

    # Note(SS): sentence_splitter would require lots of book-keeping.
    try:
        from mosestokenizer import MosesPunctuationNormalizer

        self.punc_normalizer = MosesPunctuationNormalizer(source_lang)
    except ImportError:
        warnings.warn("Recommended: pip install mosestokenizer")
        self.punc_normalizer = lambda x: x
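For context, a short sketch of how a loaded SentencePieceProcessor like spm_source above is typically used; the model path 'source.spm' is a placeholder, not part of the original snippet:

import sentencepiece

spm = sentencepiece.SentencePieceProcessor()
spm.Load('source.spm')  # placeholder path
# EncodeAsPieces returns subword pieces, e.g. ['▁Hello', '▁world']
print(spm.EncodeAsPieces('Hello world'))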
def __init__(self, srclang, targetlang,
             sourcebpe=None, targetbpe=None,
             sourcespm=None, targetspm=None):
    self.bpe_source = None
    self.bpe_target = None
    self.sp_processor_source = None
    self.sp_processor_target = None
    self.sentences = []

    # load BPE models for pre-processing
    if sourcebpe:
        # print("load BPE codes from " + sourcebpe, flush=True)
        BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
        self.bpe_source = BPE(BPEcodes)
    if targetbpe:
        # print("load BPE codes from " + targetbpe, flush=True)
        BPEcodes = open(targetbpe, 'r', encoding="utf-8")
        self.bpe_target = BPE(BPEcodes)

    # load SentencePiece models for pre-processing
    if sourcespm:
        # print("load sentence piece model from " + sourcespm, flush=True)
        self.sp_processor_source = sentencepiece.SentencePieceProcessor()
        self.sp_processor_source.Load(sourcespm)
    if targetspm:
        # print("load sentence piece model from " + targetspm, flush=True)
        self.sp_processor_target = sentencepiece.SentencePieceProcessor()
        self.sp_processor_target.Load(targetspm)

    # pre- and post-processing tools
    self.tokenizer = None
    self.detokenizer = None
    # TODO: should we have support for other sentence splitters?
    # print("start pre- and post-processing tools")
    self.sentence_splitter = MosesSentenceSplitter(srclang)
    self.normalizer = MosesPunctuationNormalizer(srclang)
    # Moses (de)tokenization is only needed on the BPE code path;
    # SentencePiece operates on raw text.
    if self.bpe_source:
        self.tokenizer = MosesTokenizer(srclang)
        self.detokenizer = MosesDetokenizer(targetlang)
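A self-contained sketch of the split, normalize, tokenize pipeline this constructor wires up, using the mosestokenizer context-manager API directly (each wrapper spawns a Perl subprocess, which the with-block shuts down cleanly):

from mosestokenizer import (
    MosesSentenceSplitter, MosesPunctuationNormalizer, MosesTokenizer)

with MosesSentenceSplitter('en') as split, \
        MosesPunctuationNormalizer('en') as normalize, \
        MosesTokenizer('en') as tokenize:
    # the splitter takes a list of lines and returns a list of sentences
    for sentence in split(['Hello world! This is a test.']):
        print(tokenize(normalize(sentence)))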
def __init__(
    self,
    vocab=None,
    source_spm=None,
    target_spm=None,
    source_lang=None,
    target_lang=None,
    unk_token="<unk>",
    eos_token="</s>",
    pad_token="<pad>",
    max_len=512,
):
    super().__init__(
        # bos_token=bos_token,
        max_len=max_len,
        eos_token=eos_token,
        unk_token=unk_token,
        pad_token=pad_token,
    )
    self.encoder = load_json(vocab)
    assert self.pad_token in self.encoder
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.source_lang = source_lang
    self.target_lang = target_lang

    # load SentencePiece model for pre-processing
    self.paths = {}
    self.spm_source = sentencepiece.SentencePieceProcessor()
    self.spm_source.Load(source_spm)
    self.spm_target = sentencepiece.SentencePieceProcessor()
    self.spm_target.Load(target_spm)

    # Note(SS): splitter would require lots of book-keeping.
    # self.sentence_splitter = MosesSentenceSplitter(source_lang)
    try:
        from mosestokenizer import MosesPunctuationNormalizer

        self.punc_normalizer = MosesPunctuationNormalizer(source_lang)
    except ImportError:
        warnings.warn("Recommended: pip install mosestokenizer")
        self.punc_normalizer = lambda x: x
def tokenizer_moses(text, column='comment_text'):  # column for extracting from csv
    '''
    A proper wrapper for Moses text preprocessing utilities,
    because they can't handle newlines

    text: string
    out: list
    '''
    result = []
    with MosesPunctuationNormalizer() as punct, MosesTokenizer('en') as tok:
        if column:
            texts = list(filter(None, text[column].lower().split('\n')))
        else:
            texts = text
        for t in texts:
            if len(t.strip()):
                norm = punct(t)
                tokens = tok(norm)
                result.extend(tokens)
    return result
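A hypothetical call of tokenizer_moses; a plain dict stands in for the csv row the caller would pass, and 'comment_text' is the default column:

row = {'comment_text': 'First line!\nSecond line?'}
print(tokenizer_moses(row))  # lowercased tokens from both lines
# with column=None, text is treated as an iterable of strings:
print(tokenizer_moses(['Plain text.'], column=None))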
from mosestokenizer import MosesPunctuationNormalizer, MosesTokenizer

from operators.operator import Operator

MOSES_PREPROCESS = {
    'en': [MosesPunctuationNormalizer('en'), MosesTokenizer('en')],
    'fr': [MosesPunctuationNormalizer('fr'), MosesTokenizer('fr')],
}


class MosesTokenize(Operator):

    def preprocess(self, text, lang='en'):
        punct, tokenizer = MOSES_PREPROCESS[lang]
        s = punct(text)
        if isinstance(s, str):
            return ' '.join(tokenizer(s)), {'raw': text}
        else:
            return '', {'raw': text}

    def postprocess(self, text_tgt_matched, mapping_tgt_matched, lang, mapping_src_ref):
        return text_tgt_matched
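Hypothetical usage of the operator above, assuming Operator can be instantiated without arguments (that base class is not shown here):

op = MosesTokenize()
text, meta = op.preprocess("L'exemple français, s'il vous plaît.", lang='fr')
print(text)  # space-joined Moses tokens
print(meta)  # {'raw': "L'exemple français, s'il vous plaît."}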
import os
import sys
import unicodedata

import numpy as np
from flask import Flask
from skimage.transform import resize
from mosestokenizer import (
    MosesTokenizer, MosesPunctuationNormalizer,
    MosesSentenceSplitter, MosesDetokenizer)

JSON_HEADER = {'Content-type': 'application/json'}

APP = Flask(__name__)
APP.sentiment_en_address = None
APP.sentiment_cs_address = None

EN_MOSES_TOKENIZER = MosesTokenizer("en")
CS_MOSES_TOKENIZER = MosesTokenizer("cs")
EN_MOSES_PUNCT_NORM = MosesPunctuationNormalizer("en")
CS_MOSES_PUNCT_NORM = MosesPunctuationNormalizer("cs")
EN_MOSES_SENT_SPLITTER = MosesSentenceSplitter("en")
EN_MOSES_DETOKENIZER = MosesDetokenizer("en")
CS_MOSES_DETOKENIZER = MosesDetokenizer("cs")

# all Unicode letters and digits
ALPHANUMERIC_CHARSET = set(
    chr(i) for i in range(sys.maxunicode)
    if (unicodedata.category(chr(i)).startswith("L")
        or unicodedata.category(chr(i)).startswith("N")))


def root_dir():  # pragma: no cover
    return os.path.abspath(os.path.dirname(__file__))
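The module-level wrappers above are callable directly; a quick illustrative sketch (outputs shown are what the Moses scripts would typically produce):

print(EN_MOSES_TOKENIZER('Hello, world!'))
# e.g. ['Hello', ',', 'world', '!']
print(EN_MOSES_DETOKENIZER(['Hello', ',', 'world', '!']))
# e.g. 'Hello, world!'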