Example no. 1
def input_pipeline(sentence, lang, bpe=None):
    """
    1. 分词(zh)
    2. 转小写(en)
    3. tokenzie
    4. bpe
    """
    if lang == 'zh':
        seg = [term.word for term in HanLP.segment(sentence)]
        seg_str = ' '.join(seg)
        # print('after segmentation:', seg)
        mt = MosesTokenizer(lang='zh')
        tokenized_str = mt.tokenize(seg_str, return_str=True)
        # print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            # print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    elif lang == 'en':
        lower = sentence.lower()
        # print('after lowercasing:', lower)
        mt = MosesTokenizer(lang='en')
        tokenized_str = mt.tokenize(lower, return_str=True)
        # print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            # print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    else:
        raise ValueError(f"Unsupported language: {lang}")
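A hedged usage sketch for input_pipeline. The fastBPE binding and the codes file path are assumptions (its .apply(list) signature matches the bpe.apply([...]) call above); HanLP and MosesTokenizer are assumed to be imported as in the snippet.

import fastBPE  # assumption: fastBPE exposes the .apply(list_of_str) interface used above

# Hypothetical BPE codes file trained on the Chinese side of the corpus.
zh_bpe = fastBPE.fastBPE("codes.zh")

print(input_pipeline("今天天气很好", lang="zh", bpe=zh_bpe))   # BPE sub-word tokens
print(input_pipeline("Hello, World!", lang="en"))              # lowercased Moses tokens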
Example no. 2
def clean(
    l1="C:/Users/azaninello/Desktop/experiments/nuovo/exp1/zing_phrases_examples",
    l2="C:/Users/azaninello/Desktop/experiments/nuovo/exp1/zing_phrases_examples"
):
    en_tok = MT(lang='en')
    it_tok = MT(lang='it')
    with open(l1, "r", encoding="utf-8") as en, open(l2, "r",
                                                     encoding="utf-8") as it:
        en_text = en.readlines()
        it_text = it.readlines()
    with open("STOCAZZO.en", "w+",
              encoding="utf-8") as cl_en, open("DAJE.it",
                                               "w+",
                                               encoding="utf-8") as cl_it:
        c = 0
        for line_en, line_it in zip(en_text, it_text):
            line_en = " ".join(en_tok.tokenize(line_en)).lower().replace(
                "'", "'").replace(""", '"')
            line_it = " ".join(it_tok.tokenize(line_it)).lower().replace(
                "'", "'").replace(""", '"')
            cl_en.write(line_en + "\n")
            cl_it.write(line_it + "\n")
            c += 1
            if c % 500 == 0:
                print("Processed {} sentences".format(c))
Example no. 3
def score(path_to_segmentation: str, path_to_reference: str) -> None:

    path_to_segmentation = Path(path_to_segmentation)
    path_to_reference = Path(path_to_reference)

    # init tokenizer and detokenizer
    mt, md = MosesTokenizer(lang="de"), MosesDetokenizer(lang="de")

    # extract the reference sentences from the xml file
    reference = []
    with open(path_to_reference, "r", encoding="utf-8") as f:
        for line in f.read().splitlines():
            if line[:4] == "<seg":
                reference.append(
                    line.split(">", maxsplit=1)[1].split("</seg>")[0])

    scores = {}
    for path_to_segmentation_file_i in path_to_segmentation.glob("own_*.xml"):

        max_segm_len = int(path_to_segmentation_file_i.stem.split("_")[-1])

        # extract generated translations from the xml file
        segm_translation = load_segm_file(path_to_segmentation_file_i)

        # detokenize (have to tokenize first with the python implementation of Moses)
        segm_translation = [
            md.detokenize(mt.tokenize(s)) for s in segm_translation
        ]

        assert len(reference) == len(segm_translation)

        # get bleu score
        bleu = sacrebleu.corpus_bleu(segm_translation, [reference])
        scores[max_segm_len] = bleu.score

    scores = dict(sorted(scores.items()))

    # do the same process for the original segmentation
    path_to_original_segmentation_file = path_to_segmentation / "original_segm.xml"
    original_segm_translation = load_segm_file(
        path_to_original_segmentation_file)
    original_segm_translation = [
        md.detokenize(mt.tokenize(s)) for s in original_segm_translation
    ]
    assert len(reference) == len(original_segm_translation)

    bleu = sacrebleu.corpus_bleu(original_segm_translation, [reference])
    scores["original"] = bleu.score

    for n, s in scores.items():
        print(f"{n}: {s} BLEU")
Example no. 4
class EnThTranslator:
    def __init__(self):
        self._tokenizer = MosesTokenizer("en")

        self._model_name = _EN_TH_MODEL_NAME

        _download_install(self._model_name)
        self._model = TransformerModel.from_pretrained(
            model_name_or_path=_get_translate_path(
                self._model_name,
                _EN_TH_FILE_NAME,
                "models",
            ),
            checkpoint_file="checkpoint.pt",
            data_name_or_path=_get_translate_path(
                self._model_name,
                _EN_TH_FILE_NAME,
                "vocab",
            ),
        )

    def translate(self, text: str) -> str:
        """
        Translate text from English to Thai

        :param str text: input text in source language
        :return: translated text in target language
        :rtype: str
        """
        tokens = " ".join(self._tokenizer.tokenize(text))
        translated = self._model.translate(tokens)
        return translated.replace(" ", "").replace("▁", " ").strip()
Example no. 5
def _moses_tokenize(text, lang):
    """ Tokenize a given string using moses tokenizer
    Tokenization: https://github.com/alvations/sacremoses
    """
    from sacremoses import MosesTokenizer
    mt = MosesTokenizer(lang)
    return [string_unescape(t) for t in mt.tokenize(text)]
Example no. 6
def generate(corpus: Optional[str] = None, test: Optional[str] = None):
    moses = MosesTokenizer(lang='fr')

    pos_tagged = loadCorpusFromStr(corpus)
    grammar_str = FormatGrammarAsCFG(InductGrammar(pos_tagged))

    grammar = nltk.CFG.fromstring(grammar_str.split('\n'))

    parsed = []
    valid = []
    not_valide = []

    for s in test.split('$'):
        try:
            tagged_sent = [
                token[1]
                for token in tagger.tag(moses.tokenize(s, escape=False))
            ]
            parsed = parse(tagged_sent, grammar)
            if parsed is not None:
                valid.append((s, str(parsed)))
            else:
                not_valide.append(s)
        except Exception:
            not_valide.append(s)

    return {
        "grammar": grammar_str,
        "test_results": {
            "valide": valid,
            "not_valide": not_valide
        }
    }
Example no. 7
class SacreMosesTokenizer(object):
    def __init__(self):
        try:
            from sacremoses import MosesTokenizer
            self._tokenizer = MosesTokenizer()
        except (ImportError, TypeError) as err:
            print('sacremoses is not installed. '
                  'To install sacremoses, use pip install -U sacremoses'
                  ' Now try NLTKMosesTokenizer using NLTK ...')
            raise

    def __call__(self, sample, return_str=False):
        """
        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str, escape=False)
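A minimal usage sketch of the callable wrapper above; the sample sentence is arbitrary and the token list shown is indicative only.

tokenizer = SacreMosesTokenizer()
print(tokenizer("Hello, world! It's a test."))
# e.g. ['Hello', ',', 'world', '!', 'It', "'s", 'a', 'test', '.']
print(tokenizer("Hello, world!", return_str=True))   # single space-joined string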
Example no. 8
class Tokenizer:
    def __init__(self, command=None, l="en"):
        if command:
            self.tokenizer = ToolWrapper(command.split(' '))
            self.external = True
            self.spm = command.find('spm_encode') > -1
        else:
            self.tokenizer = MosesTokenizer(lang=l)
            self.external = False
            self.spm = False

    def tokenize(self, text):
        if self.external:
            self.tokenizer.writeline(text.rstrip('\n'))
            return ([
                no_escaping(t)
                for t in self.tokenizer.readline().rstrip('\n').split()
            ])
        else:
            return self.tokenizer.tokenize(text, escape=False)

    def detokenize(self, text):
        if self.spm:
            return ''.join(text).replace('\u2581', ' ')
        else:
            return ' '.join(text)

    def close(self):
        if self.external:
            try:
                self.tokenizer.close()
            except Exception:
                return
Example no. 9
def get_tokenized_review_list():
    mt = MosesTokenizer()
    reviews = get_reviews()[0]
    tokenized_list = [
        mt.tokenize(review_text, escape=False) for review_text in reviews
    ]
    return (tokenized_list, reviews[1])
Example no. 10
class MosesPreprocessingFunc():

    def __init__(self, lang: str):
        self.mt = MosesTokenizer(lang)

    def __call__(self, t: str) -> str:
        return self.mt.tokenize(t, return_str=True, escape=True)
Example no. 11
def build_vocab(num):
    vocab_file = f"vocab_{num}.txt"
    vocab_list = list()
    tokenizer = MosesTokenizer(lang='en')

    i = 0
    for file in os.listdir("dataset/train/body"):
        filename = os.fsdecode(file)

        with open(f"dataset/train/body/{filename}", 'r',
                  encoding='utf-8') as in_file:
            corpus_lines = in_file.readlines()
            # MosesTokenizer.tokenize expects a string, so tokenize line by line
            corpus_lines = [
                tokenizer.tokenize(line, return_str=True)
                for line in corpus_lines
            ]

            for line in corpus_lines:
                for word in line.split():
                    if word.lower() not in vocab_list:
                        vocab_list.append(word.lower())

        i += 1
        if i >= 1000:
            break

    with open(vocab_file, 'w', encoding='utf-8') as out_file:
        for word in vocab_list:
            out_file.write(f"{word}\n")
Example no. 12
class PyMosesTokenizer(GenericTokenizer):
    """
    The call to standard moses tokenizer
    """
    def __init__(self, lang, lowercase):
        self.mpn = MosesPunctNormalizer()
        self.tokenizer = MosesTokenizer(lang=lang)
        self.detokenizer = MosesDetokenizer(lang=lang)
        self.lowercase = lowercase
        self.lang = lang

    def tokenize(self, text):
        return self.tokenizer.tokenize(
            self.mpn.normalize(text.lower() if self.lowercase else text))

    def detokenize(self, tokenized_list):
        temp_result = ""
        t_list_len = len(tokenized_list)
        for t_ind, token in enumerate(tokenized_list):
            # glue "&apos;" + "s" back together and close up slashes
            apos_cnd = (token == "&apos;" and t_ind < t_list_len - 1
                        and tokenized_list[t_ind + 1] == "s")
            if apos_cnd or token == "/":
                temp_result = temp_result.strip() + token
            else:
                temp_result += token + " "
        f_result = self.detokenizer.detokenize(temp_result.strip().split())
        # move a trailing closing quote next to the punctuation before it
        if (len(f_result) > 3 and f_result[-3] in string.punctuation
                and f_result[-2] == " " and f_result[-1] == "\""):
            f_result = f_result[:-2] + f_result[-1]
        return f_result

    @property
    def model_name(self):
        return "Moses"
Example no. 13
class MosesTokenizer(object):

    @staticmethod
    def add_args(parser):
        # fmt: off
        parser.add_argument('--moses-source-lang', default='en', metavar='SRC',
                            help='source language')
        parser.add_argument('--moses-target-lang', default='en', metavar='TARGET',
                            help='target language')
        parser.add_argument('--moses-no-dash-splits', action='store_true', default=False,
                            help='don\'t apply dash split rules')
        parser.add_argument('--moses-no-escape', action='store_true', default=False,
                            help='don\'t perform HTML escaping on apostrophe, quotes, etc.')
        # fmt: on

    def __init__(self, args):
        self.args = args
        try:
            from sacremoses import MosesTokenizer, MosesDetokenizer
            self.tok = MosesTokenizer(args.moses_source_lang)
            self.detok = MosesDetokenizer(args.moses_target_lang)
        except ImportError:
            raise ImportError('Please install Moses tokenizer with: pip install sacremoses')

    def encode(self, x: str) -> str:
        return self.tok.tokenize(
            x,
            aggressive_dash_splits=(not self.args.moses_no_dash_splits),
            return_str=True,
            escape=(not self.args.moses_no_escape),
        )

    def decode(self, x: str) -> str:
        return self.detok.detokenize(x.split())
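A sketch of driving the fairseq-style wrapper above from argparse; the argument values are illustrative.

import argparse

parser = argparse.ArgumentParser()
MosesTokenizer.add_args(parser)
args = parser.parse_args(["--moses-source-lang", "de", "--moses-target-lang", "en"])

moses = MosesTokenizer(args)
encoded = moses.encode("Ein Beispiel-Satz, bitte.")   # escaped, dash-split token string
print(encoded)
print(moses.decode(encoded))                          # detokenized text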
Example no. 14
class MosesTokenizer(Tokenizer):
    def __init__(self,
                 language,
                 glossaries=None,
                 aggressive_dash_splits=True,
                 escape=False):
        super(MosesTokenizer, self).__init__(language=language,
                                             glossaries=glossaries)
        self._aggressive_dash_splits = aggressive_dash_splits
        self._escape = escape
        try:
            from sacremoses import MosesDetokenizer as MDetok
            from sacremoses import MosesTokenizer as MTok
            self._tok = MTok(lang=self.language)
            self._detok = MDetok(lang=self.language)
        except ImportError:
            raise ImportError(
                'Please install Moses tokenizer with: pip3 install sacremoses')

    def tokenize(self, text, return_str=False):
        return self._tok.tokenize(
            self._convert_to_str(text),
            aggressive_dash_splits=self._aggressive_dash_splits,
            return_str=return_str,
            escape=self._escape,
            protected_patterns=self._glossaries)

    def detokenize(self, text, return_str=True):
        return self._detok.detokenize(self._convert_to_list(text),
                                      return_str=return_str,
                                      unescape=True)
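For reference, the glossaries forwarded above end up as sacremoses protected_patterns; a minimal direct sketch of that underlying feature (the URL pattern is illustrative):

from sacremoses import MosesTokenizer as MTok

mtok = MTok(lang="en")
print(mtok.tokenize("See https://example.com/a-b for details.",
                    protected_patterns=[r"https?://\S+"],   # keep URLs as single tokens
                    escape=False))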
Example no. 15
class MosesProcessor:
    """
    Tokenizer, Detokenizer and Normalizer utilities in Moses
    """

    def __init__(self, lang_id: str):
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(lang=lang_id)

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenizes a list of tokens
        Args:
            tokens: list of strings as tokens
        Returns:
            detokenized string
        """
        return self.moses_detokenizer.detokenize(tokens)

    def tokenize(self, text: str):
        """
        Tokenizes text using Moses -> Sentencepiece.
        """
        return self.moses_tokenizer.tokenize(text, escape=False, return_str=True)

    def normalize(self, text: str):
        return self.normalizer.normalize(text)
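A normalize -> tokenize -> detokenize sketch for MosesProcessor; the language id is arbitrary.

proc = MosesProcessor(lang_id="en")
normalized = proc.normalize('He said: "it works".')
tokenized = proc.tokenize(normalized)        # space-joined Moses tokens, unescaped
print(tokenized)
print(proc.detokenize(tokenized.split()))    # detokenized back into running text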
Example no. 16
class MosesTokenizer(Tokenizer):
    def __init__(self):
        super().__init__()
        self._tokenizer = SacreMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces escape sequences like &#91; with the original characters
        (such as '['), so they better align to the original text.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]

    def detokenize_ptb(self, tokens):
        # Not a perfect detokenizer, but a "good-enough" stand in.
        rep_dict = {
            "-LSB-": "[",
            "-RSB-": "]",
            "-LRB-": "(",
            "-RRB-": ")",
            "-LCB-": "{",
            "-RCB-": "}",
            "``": '"',
            "''": '"',
        }
        str1 = self._detokenizer.detokenize(replace_list(tokens, rep_dict))
        return str1
Example no. 17
class MosesPreTokenizer:
    def __init__(self, lng, do_lowercase):
        self.mpn = MosesPunctNormalizer()
        self.moses_tokenizer = MosesTokenizer(lang=lng)
        self.do_lowercase = do_lowercase

    def pre_tokenize(self, text):
        return self.moses_tokenizer.tokenize(self.mpn.normalize(text.lower() if self.do_lowercase else text))
Example no. 18
def tokenize_captions(captions, lang='en'):
    """Tokenizes captions list with Moses tokenizer.
    """

    tokenizer = MosesTokenizer(lang=lang)
    return [
        tokenizer.tokenize(caption, return_str=True) for caption in captions
    ]
Example no. 19
def tokenize(txt, to_lower=False):
    assert isinstance(txt, str)
    tokenizer = MosesTokenizer()
    lines = txt.split('\n')
    t = [tokenizer.tokenize(line) for line in lines]
    if to_lower:
        return [[word.lower() for word in line] for line in t]
    else:
        return t
Example no. 20
def run_sentence_bleu(candidates: list, references: list,
                      language: str) -> list:
    """ Runs sentence BLEU from Sacrebleu. """
    tokenizer = MosesTokenizer(lang=language)
    candidates = [tokenizer.tokenize(mt, return_str=True) for mt in candidates]
    references = [
        tokenizer.tokenize(ref, return_str=True) for ref in references
    ]
    assert len(candidates) == len(references)
    bleu_scores = []
    for i in tqdm(range(len(candidates)), desc="Running BLEU..."):
        bleu_scores.append(
            corpus_bleu([candidates[i]], [[references[i]]]).score)
    return bleu_scores
Example no. 21
def preprocess(source_lang,tcmodel,escape):
	mtok = MosesTokenizer(lang=source_lang)
	mtr = MosesTruecaser(tcmodel)
	sys.stderr.write("model loaded\n")
	for line in sys.stdin:
		tokenized = mtok.tokenize(line,escape=escape)
		truecased = mtr.truecase(" ".join(tokenized))
		sys.stderr.write("sentence processed\n")
		sys.stdout.buffer.write((" ".join(truecased) + "\n").encode("utf-8"))
		sys.stdout.flush()
Example no. 22
class Tokenizer:
    def __init__(self, language):
        self.language = language
        self.tokenizer = MosesTokenizer(lang=language)

    def __repr__(self):
        return f"Tokenizer({self.language})"

    def __call__(self, line):
        return " ".join(self.tokenizer.tokenize(line, escape=False))
Example no. 23
class MosesTokenizerWrapper(AbstractTokenizer):
    def __init__(self, do_lower_case: bool = False, escape: bool = False):
        self._tokenizer = MosesTokenizer()
        self._do_lower_case = do_lower_case
        self._escape = escape

    def tokenize_single(self, sentence: str):
        if self._do_lower_case:
            sentence = sentence.lower()
        return self._tokenizer.tokenize(sentence, escape=self._escape)
Example no. 24
def get_moses_tokenizer(lang):
    try:
        moses_tokenizer = MosesTokenizer(lang=lang)
    except Exception:
        print("WARNING: Moses doesn't have tokenizer for", lang)
        moses_tokenizer = MosesTokenizer(lang='en')

    tokenizer = lambda x: moses_tokenizer.tokenize(x, return_str=True)  # string IN -> string OUT
    return tokenizer
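A usage sketch; the language code is arbitrary.

tokenize = get_moses_tokenizer("fr")
print(tokenize("Bonjour, comment ça va ?"))   # string in -> tokenized string out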
Example no. 25
class Tokenizer(BatchProcessor):

	# default args: ["-a", "-no-escape"]
	def __init__(self, lang, args=["-a"]):

		self.handler = MosesTokenizer(lang=lang)
		self.escape = not ("-no-escape" in args or "--no-escape" in args)
		self.aggressive = "-a" in args

	def process(self, input):

		return self.handler.tokenize(input, aggressive_dash_splits=self.aggressive, return_str=True, escape=self.escape)
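A usage sketch with the documented flag style; the sample sentence is arbitrary.

tok = Tokenizer(lang="en", args=["-a", "-no-escape"])
print(tok.process("A state-of-the-art tokenizer, isn't it?"))
# aggressive dash splits, apostrophe left unescaped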
Example no. 26
def main():

    tic = time.time()

    args = parse_args()

    logging.basicConfig(level=logging.DEBUG)
    logging.debug(args)

    if args.tokenize:
        tokenizer = MosesTokenizer(lang=args.lang)

    lines = sys.stdin.readlines()

    all_tokens = []

    for line in lines:
        if args.tokenize:
            t = tokenizer.tokenize(line)
        else:
            t = line.split()
        all_tokens.append(t)

    flat_tokens = chain.from_iterable(all_tokens)

    counter = Counter(flat_tokens)

    # try to free up memory early

    del flat_tokens

    logging.debug("Vocabulary size before/after/max_allowed = %d/%d/%d" %
                  (len(counter.keys()),
                   min(args.vocab_size, len(counter.keys())), args.vocab_size))

    # build as a set for fast membership checks below
    vocabulary = {
        token for token, frequency in counter.most_common(args.vocab_size)
    }

    for tokens in all_tokens:
        output_tokens = []
        for token in tokens:
            if token in vocabulary:
                output_tokens.append(token)
            else:
                output_tokens.append(args.unk_string)

        output_string = " ".join(output_tokens)
        sys.stdout.write(output_string + "\n")

    toc = time.time() - tic

    logging.debug("Time taken: %f seconds" % toc)
Example no. 27
class MosesTokenizerFunc(BaseTokenizer):
    "Wrapper around a MosesTokenizer to make it a `BaseTokenizer`."

    def __init__(self, lang: str):
        self.tok = MosesTokenizer(lang)

    def tokenizer(self, t: str) -> List[str]:
        return self.tok.tokenize(t, return_str=False, escape=False)

    def add_special_cases(self, toks: Collection[str]):
        for w in toks:
            assert len(self.tokenizer(
                w)) == 1, f"Tokenizer is unable to keep {w} as one token!"
 def _test_tokenize(self, test_file, language='en'):
     """
     Compares MosesTokenizer's output to the output of the
     original Perl script.
     """
     tokenizer = MosesTokenizer(lang=language)
     # Tokenize test file with the original Perl script and given flags
     path_gold = self._create_gold(test_file, language)
     # Compare to output of original Perl script
     with open(test_file, encoding='utf-8') as u, open(path_gold, encoding='utf-8') as g:
         for text, gold in zip(u, g):
             tokenized = tokenizer.tokenize(text, return_str=True)
             self.assertEqual(tokenized.rstrip(), gold.rstrip())
Example no. 29
class EnThTranslator:
    """
    English-Thai Machine Translation

    from VISTEC-depa Thailand Artificial Intelligence Research Institute

    Website: https://airesearch.in.th/releases/machine-translation-models/
    """
    def __init__(self):
        self._tokenizer = MosesTokenizer("en")

        self._model_name = _EN_TH_MODEL_NAME

        _download_install(self._model_name)
        self._model = TransformerModel.from_pretrained(
            model_name_or_path=_get_translate_path(
                self._model_name,
                _EN_TH_FILE_NAME,
                "models",
            ),
            checkpoint_file="checkpoint.pt",
            data_name_or_path=_get_translate_path(
                self._model_name,
                _EN_TH_FILE_NAME,
                "vocab",
            ),
        )

    def translate(self, text: str) -> str:
        """
        Translate text from English to Thai

        :param str text: input text in source language
        :return: translated text in target language
        :rtype: str

        :Example:

        Translate text from English to Thai::

            from pythainlp.translate import EnThTranslator

            enth = EnThTranslator()

            enth.translate("I love cat.")
            # output: ฉันรักแมว

        """
        tokens = " ".join(self._tokenizer.tokenize(text))
        translated = self._model.translate(tokens)
        return translated.replace(" ", "").replace("▁", " ").strip()
Example no. 30
 def translate(self,
               text: List[str],
               source_lang: str = None,
               target_lang: str = None) -> List[str]:
     """
     Translates list of sentences from source language to target language.
     The input should be regular text; this method performs its own tokenization/de-tokenization.
     Args:
         text: list of strings to translate
         source_lang: if not None, corresponding MosesTokenizer and MosesPunctNormalizer will be run
         target_lang: if not None, corresponding MosesDetokenizer will be run
     Returns:
         list of translated strings
     """
     mode = self.training
     if source_lang != "None":
         tokenizer = MosesTokenizer(lang=source_lang)
         normalizer = MosesPunctNormalizer(lang=source_lang)
     if target_lang != "None":
         detokenizer = MosesDetokenizer(lang=target_lang)
     try:
         self.eval()
         res = []
         for txt in text:
             if source_lang != "None":
                 txt = normalizer.normalize(txt)
                 txt = tokenizer.tokenize(txt,
                                          escape=False,
                                          return_str=True)
             ids = self.encoder_tokenizer.text_to_ids(txt)
             ids = [self.encoder_tokenizer.bos_id
                    ] + ids + [self.encoder_tokenizer.eos_id]
             src = torch.Tensor(ids).long().to(self._device).unsqueeze(0)
             src_mask = torch.ones_like(src)
             src_hiddens = self.encoder(input_ids=src,
                                        encoder_mask=src_mask)
             beam_results = self.beam_search(
                 encoder_hidden_states=src_hiddens,
                 encoder_input_mask=src_mask)
             beam_results = self.filter_predicted_ids(beam_results)
             translation_ids = beam_results.cpu()[0].numpy()
             translation = self.decoder_tokenizer.ids_to_text(
                 translation_ids)
             if target_lang != "None":
                 translation = detokenizer.detokenize(translation.split())
             res.append(translation)
     finally:
         self.train(mode=mode)
     return res