Example #1
    def _setup_normalizer(self):
        try:
            from mosestokenizer import MosesPunctuationNormalizer

            self.punc_normalizer = MosesPunctuationNormalizer(self.source_lang)
        except ImportError:
            warnings.warn("Recommended: pip install mosestokenizer")
            self.punc_normalizer = lambda x: x
Example #2
def normalize_punct(text, lang):
    """ Normalize punctuation of a given string
    Punctuation normalizer:
    https://bitbucket.org/luismsgomes/mosestokenizer
    """
    from mosestokenizer import MosesPunctuationNormalizer
    with MosesPunctuationNormalizer(lang) as normalize:
        return normalize(text)
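A quick usage sketch for the helper above; the sample string and the "en" language code are made up for illustration, and mosestokenizer must be installed for the call to work:

normalized = normalize_punct('Hello , world ( test )', 'en')
print(normalized)  # the normalizer should tighten spacing, e.g. remove the stray space before the comma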
Example #3
    def __init__(self, lm_type: LMType, language: str):
        """
        lm_type: LMType
        language: language code
        """
        self.language = language
        self.tokenizer = MosesTokenizer(self.language)
        self.normalizer = MosesPunctuationNormalizer(self.language)
        self.splitter = MosesSentenceSplitter(self.language, more=False)
        self.type = lm_type
Example #4
    def __init__(
        self,
        vocab=None,
        source_spm=None,
        target_spm=None,
        source_lang=None,
        target_lang=None,
        unk_token="<unk>",
        eos_token="</s>",
        pad_token="<pad>",
        max_len=512,
    ):

        super().__init__(
            # bos_token=bos_token,
            max_len=max_len,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
        )
        self.encoder = load_json(vocab)
        if self.unk_token not in self.encoder:
            raise KeyError("<unk> token must be in vocab")
        assert self.pad_token in self.encoder
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.source_lang = source_lang
        self.target_lang = target_lang

        # load SentencePiece model for pre-processing
        self.paths = {}

        self.spm_source = sentencepiece.SentencePieceProcessor()
        self.spm_source.Load(source_spm)

        self.spm_target = sentencepiece.SentencePieceProcessor()
        self.spm_target.Load(target_spm)

        # Multilingual target side: default to using first supported language code.
        self.supported_language_codes: list = [
            k for k in self.encoder if k.startswith(">>") and k.endswith("<<")
        ]
        self.tgt_lang_id = None  # will not be used unless it is set through prepare_translation_batch

        # Note(SS): sentence_splitter would require lots of book-keeping.
        try:
            from mosestokenizer import MosesPunctuationNormalizer

            self.punc_normalizer = MosesPunctuationNormalizer(source_lang)
        except ImportError:
            warnings.warn("Recommended: pip install mosestokenizer")
            self.punc_normalizer = lambda x: x
Example #5
    def __init__(self,
                 srclang,
                 targetlang,
                 sourcebpe=None,
                 targetbpe=None,
                 sourcespm=None,
                 targetspm=None):
        self.bpe_source = None
        self.bpe_target = None
        self.sp_processor_source = None
        self.sp_processor_target = None
        self.sentences = []
        # load BPE model for pre-processing
        if sourcebpe:
            # print("load BPE codes from " + sourcebpe, flush=True)
            with open(sourcebpe, 'r', encoding="utf-8") as BPEcodes:
                self.bpe_source = BPE(BPEcodes)
        if targetbpe:
            # print("load BPE codes from " + targetbpe, flush=True)
            with open(targetbpe, 'r', encoding="utf-8") as BPEcodes:
                self.bpe_target = BPE(BPEcodes)

        # load SentencePiece model for pre-processing
        if sourcespm:
            # print("load sentence piece model from " + sourcespm, flush=True)
            self.sp_processor_source = sentencepiece.SentencePieceProcessor()
            self.sp_processor_source.Load(sourcespm)
        if targetspm:
            # print("load sentence piece model from " + targetspm, flush=True)
            self.sp_processor_target = sentencepiece.SentencePieceProcessor()
            self.sp_processor_target.Load(targetspm)

        # pre- and post-processing tools
        self.tokenizer = None
        self.detokenizer = None

        # TODO: should we have support for other sentence splitters?
        # print("start pre- and post-processing tools")
        self.sentence_splitter = MosesSentenceSplitter(srclang)
        self.normalizer = MosesPunctuationNormalizer(srclang)
        if self.bpe_source:
            self.tokenizer = MosesTokenizer(srclang)

        if self.bpe_source:
            self.detokenizer = MosesDetokenizer(targetlang)
Example #6
    def __init__(
        self,
        vocab=None,
        source_spm=None,
        target_spm=None,
        source_lang=None,
        target_lang=None,
        unk_token="<unk>",
        eos_token="</s>",
        pad_token="<pad>",
        max_len=512,
    ):

        super().__init__(
            # bos_token=bos_token,
            max_len=max_len,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
        )
        self.encoder = load_json(vocab)
        assert self.pad_token in self.encoder
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.source_lang = source_lang
        self.target_lang = target_lang

        # load SentencePiece model for pre-processing
        self.paths = {}

        self.spm_source = sentencepiece.SentencePieceProcessor()
        self.spm_source.Load(source_spm)

        self.spm_target = sentencepiece.SentencePieceProcessor()
        self.spm_target.Load(target_spm)

        # Note(SS): splitter would require lots of book-keeping.
        # self.sentence_splitter = MosesSentenceSplitter(source_lang)
        try:
            from mosestokenizer import MosesPunctuationNormalizer

            self.punc_normalizer = MosesPunctuationNormalizer(source_lang)
        except ImportError:
            warnings.warn("Recommended: pip install mosestokenizer")
            self.punc_normalizer = lambda x: x
Example #7
def tokenizer_moses(text, column='comment_text'):  # column to extract when `text` is a CSV row
    '''
    A wrapper around the Moses text preprocessing utilities,
    which cannot handle newlines on their own.
        text: string, or a row/dict holding the text under `column`
        out: list of tokens
    '''
    result = []
    with MosesPunctuationNormalizer() as punct, MosesTokenizer('en') as tok:
        if column:
            texts = list(filter(None, text[column].lower().split('\n')))
        else:
            texts = text
        for t in texts:
            if len(t.strip()):
                norm = punct(t)
                tokens = tok(norm)
                result.extend(tokens)
    return result
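A hypothetical call to the wrapper above, with a plain dict standing in for a CSV row (the 'comment_text' key matches the default column argument; mosestokenizer must be installed):

row = {'comment_text': 'First sentence .\nSECOND sentence!'}
tokens = tokenizer_moses(row)  # lowercases, splits on newlines, then normalizes and tokenizes each non-empty line
print(tokens)  # flat list of tokens from both lines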
Example #8
from mosestokenizer import MosesPunctuationNormalizer, MosesTokenizer
from operators.operator import Operator

MOSES_PREPROCESS = {
    'en': [MosesPunctuationNormalizer('en'),
           MosesTokenizer('en')],
    'fr': [MosesPunctuationNormalizer('fr'),
           MosesTokenizer('fr')],
}


class MosesTokenize(Operator):
    def preprocess(self, text, lang='en'):
        punct, tokenizer = MOSES_PREPROCESS[lang]
        s = punct(text)
        if isinstance(s, str):
            return ' '.join(tokenizer(s)), {'raw': text}
        else:
            return '', {'raw': text}

    def postprocess(self, text_tgt_matched, mapping_tgt_matched, lang,
                    mapping_src_ref):
        return text_tgt_matched
Example #9
import os
import sys
import unicodedata

import numpy as np
from flask import Flask
from skimage.transform import resize

from mosestokenizer import (
    MosesTokenizer, MosesPunctuationNormalizer, MosesSentenceSplitter,
    MosesDetokenizer)

JSON_HEADER = {'Content-type': 'application/json'}

APP = Flask(__name__)
APP.sentiment_en_address = None
APP.sentiment_cs_address = None

EN_MOSES_TOKENIZER = MosesTokenizer("en")
CS_MOSES_TOKENIZER = MosesTokenizer("cs")
EN_MOSES_PUNCT_NORM = MosesPunctuationNormalizer("en")
CS_MOSES_PUNCT_NORM = MosesPunctuationNormalizer("cs")
EN_MOSES_SENT_SPLITTER = MosesSentenceSplitter("en")
EN_MOSES_DETOKENIZER = MosesDetokenizer("en")
CS_MOSES_DETOKENIZER = MosesDetokenizer("cs")

ALPHANUMERIC_CHARSET = set(
    chr(i) for i in range(sys.maxunicode)
    if (unicodedata.category(chr(i)).startswith("L")
        or unicodedata.category(chr(i)).startswith("N")))


def root_dir():  # pragma: no cover
    return os.path.abspath(os.path.dirname(__file__))