Example #1
 def test_normalization_pipeline(self):
     moses_norm_unicode = MosesPunctNormalizer(
         pre_replace_unicode_punct=True, post_remove_control_chars=True
     )
     text = u"0《123》      456%  '' 【789】"
     expected = u'0"123" 456% " [789]'
     assert moses_norm_unicode.normalize(text) == expected
Example #2
 def __init__(self, lang: str, remove_non_printable_char: bool = True,
              unicode_norm_form: Optional[str] = None):
     self._remove_non_printable_char = remove_non_printable_char
     self._moses_normalizer = MosesPunctNormalizer(lang)
     self._unicode_norm_form = unicode_norm_form
     if unicode_norm_form is not None:
         assert unicode_norm_form in ['NFC', 'NFKC', 'NFD', 'NFKD'],\
             'Unsupported unicode normalization format, you may refer to ' \
             'https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize for ' \
             'more details.'
     self.__warmup()
Example #3
def normalize_file(iterator, language, processes, quiet):
    moses = MosesPunctNormalizer(
        language,
    )
    moses_normalize = partial(moses.normalize)
    return parallel_or_not(iterator, moses_normalize, processes, quiet)
Example #4
def normalize_file(
    language, processes, normalize_quote_commas, normalize_numbers, encoding
):
    moses = MosesPunctNormalizer(
        language,
        norm_quote_commas=normalize_quote_commas,
        norm_numbers=normalize_numbers,
    )
    moses_normalize = partial(moses.normalize)

    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                # TODO: Actually moses_normalize(fin.read()) gives the same output
                #       and it's a lot better but it's inconsistent with the other
                #       preprocessing interfaces, so we're doing it line by line here.
                for line in tqdm(fin.readlines()):
                    # Note: not stripping newlines, so don't need end='\n' when printing to stdout.
                    print(moses_normalize(line), end="", file=fout)
            else:
                for outline in parallelize_preprocess(
                    moses_normalize, fin.readlines(), processes, progress_bar=True
                ):
                    # Note: not stripping newlines, so don't need end='\n' when printing to stdout.
                    print(outline, end="", file=fout)
Example #5
def normalize_file(iterator, language, processes, quiet, replace_unicode_puncts):
    moses = MosesPunctNormalizer(
        language,
        pre_replace_unicode_punct=replace_unicode_puncts,
    )
    moses_normalize = partial(moses.normalize)
    return parallel_or_not(iterator, moses_normalize, processes, quiet)
Example #6
 def test_moses_normalize_documents(self):
     moses = MosesPunctNormalizer()
     # Examples from normalizing big.txt
     inputs = [
         "The United States in 1805 (color map)                 _Facing_     193",
         "=Formation of the Constitution.=--(1) The plans before the convention,",
         "directions--(1) The infective element must be eliminated. When the ulcer",
         "College of Surgeons, Edinburgh.)]",
     ]
     expected = [
         "The United States in 1805 (color map) _Facing_ 193",
         "=Formation of the Constitution.=-- (1) The plans before the convention,",
         "directions-- (1) The infective element must be eliminated. When the ulcer",
         "College of Surgeons, Edinburgh.) ]",
     ]
     for text, expect in zip(inputs, expected):
         assert moses.normalize(text) == expect
Example #7
    def test_moses_normalize_quote_comma(self):
        moses_norm_quote = MosesPunctNormalizer("en", norm_quote_commas=True)
        moses_no_norm_quote = MosesPunctNormalizer("en", norm_quote_commas=False)
        text = 'THIS EBOOK IS OTHERWISE PROVIDED TO YOU "AS-IS".'

        expected_norm_quote = 'THIS EBOOK IS OTHERWISE PROVIDED TO YOU "AS-IS."'
        assert moses_norm_quote.normalize(text) == expected_norm_quote

        expected_no_norm_quote = 'THIS EBOOK IS OTHERWISE PROVIDED TO YOU "AS-IS".'
        assert moses_no_norm_quote.normalize(text) == expected_no_norm_quote
Example #8
    def test_moses_normalize_numbers(self):
        # See https://stackoverflow.com/a/55233871/610569
        moses_norm_num = MosesPunctNormalizer("en", norm_numbers=True)
        moses_no_norm_num = MosesPunctNormalizer("en", norm_numbers=False)

        text = u"12{}123".format(u"\u00A0")
        expected = u"12.123"
        assert moses_norm_num.normalize(text) == expected

        text = expected = u"12 123"
        assert moses_no_norm_num.normalize(text) == expected
Example #9
class MosesNormalizer:
    """Normalizes the input sentence. Currently, we support the combination of the

    Moses Punctuation Normalizer 'normalize-punctuation.perl' and the
     'remove-non-printing-char.perl' in [mosesdecoder](https://github.com/moses-smt/mosesdecoder):

    Also, we will normalize the

    Parameters
    ----------
    lang
        The input language
    remove_non_printable_char
        Whether to remove the non-printable unicode characters in the input
    unicode_norm_form
        The unicode normalization format used. Supported

    """
    def __init__(self,
                 lang: str,
                 remove_non_printable_char: bool = True,
                 unicode_norm_form: Optional[str] = None):
        self._remove_non_printable_char = remove_non_printable_char
        self._moses_normalizer = MosesPunctNormalizer(lang)
        self._unicode_norm_form = unicode_norm_form
        if unicode_norm_form is not None:
            assert unicode_norm_form in ['NFC', 'NFKC', 'NFD', 'NFKD'],\
                'Unsupported unicode normalization format, you may refer to ' \
                'https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize for ' \
                'more details.'
        self.__warmup()

    def __warmup(self):
        self('hello world')

    def __call__(self, sentence: str) -> str:
        if self._unicode_norm_form:
            sentence = unicodedata.normalize(self._unicode_norm_form, sentence)
        sentence = self._moses_normalizer.normalize(sentence)
        if self._remove_non_printable_char:
            return non_printing_char_regex.sub(' ', sentence)
        else:
            return sentence
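A minimal usage sketch for the MosesNormalizer class above (hypothetical, not part of the original source); it assumes sacremoses is installed and that the surrounding module defines non_printing_char_regex:

# Hypothetical usage of the class defined above; the sample sentence mirrors Example #14.
normalizer = MosesNormalizer(lang='en', unicode_norm_form='NFKC')
print(normalizer(u"yesterday ’s reception"))  # -> "yesterday 's reception"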
Example #10
def normalize_file(
    iterator,
    language,
    processes,
    quiet,
    normalize_quote_commas,
    normalize_numbers,
    replace_unicode_puncts,
    remove_control_chars,
):
    moses = MosesPunctNormalizer(
        language,
        norm_quote_commas=normalize_quote_commas,
        norm_numbers=normalize_numbers,
        pre_replace_unicode_punct=replace_unicode_puncts,
        post_remove_control_chars=remove_control_chars,
    )
    moses_normalize = partial(moses.normalize)
    return parallel_or_not(iterator, moses_normalize, processes, quiet)
Example #11
def normalize_file(
    language, processes, encoding, quiet
):
    moses = MosesPunctNormalizer(
        language,
    )
    moses_normalize = partial(moses.normalize)

    def processor(iterator):
        if processes == 1:
            for line in list(iterator):
                yield moses_normalize(line)
        else:
            for outline in parallelize_preprocess(
                moses_normalize, list(iterator), processes, progress_bar=(not quiet)
            ):
                yield outline
    return processor
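A short sketch of how the processor closure returned above might be driven (hypothetical; the keyword arguments and sample line are illustrative only):

# Hypothetical usage of normalize_file as defined above.
process = normalize_file(language="en", processes=1, encoding="utf-8", quiet=True)
for outline in process([u"yesterday ’s reception\n"]):
    print(outline, end="")  # each yielded line has been punctuation-normalized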
Example #12
 def cleanup_transcript(language,
                        transcript,
                        lowercase=True,
                        remove_punctuation=True):
     if lowercase:
         transcript = transcript.lower()
     if language not in ["zh", "ja"]:
         if language not in TranscriptDataPipeline.PUNC_NORMERS:
             TranscriptDataPipeline.PUNC_NORMERS[
                 language] = MosesPunctNormalizer(lang=language)
         transcript = TranscriptDataPipeline.PUNC_NORMERS[
             language].normalize(transcript)
         transcript = transcript.replace("' s ", "'s ").replace(
             "' ve ",
             "'ve ").replace("' m ",
                             "'m ").replace("' t ",
                                            "'t ").replace("' re ", "'re ")
     if remove_punctuation:
         transcript = PUNC_PATTERN.sub(" ", transcript)
     transcript = " ".join(transcript.strip().split())
     return transcript
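A hedged usage sketch for cleanup_transcript (hypothetical, written here as a plain call); it assumes TranscriptDataPipeline.PUNC_NORMERS and PUNC_PATTERN exist as in the original module:

# Hypothetical call; the transcript text is illustrative only.
# Order of operations in the function above: lowercasing, Moses punctuation normalization,
# apostrophe re-joining ("' t " -> "'t "), punctuation removal, whitespace collapsing.
clean = cleanup_transcript("en", "It ’s a TEST, isn' t it?")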
Example #13
 def test_replace_unicode_punct(self):
     moses_norm_unicode = MosesPunctNormalizer()
     text = u"0《123》 456% 【789】"
     expected = u'0"123" 456% [789]'
     assert moses_norm_unicode.replace_unicode_punct(text) == expected
Example #14
 def test_moses_normalize_single_apostrophe(self):
     moses_norm_num = MosesPunctNormalizer("en")
     text = u"yesterday ’s reception"
     expected = u"yesterday 's reception"
     assert moses_norm_num.normalize(text) == expected
Example #15
def get_moses_punct_normalizer(language='en'):
    return MosesPunctNormalizer(pre_replace_unicode_punct=True, lang=language)
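A short sketch of how the helper above might be used (hypothetical); the input string is the one from Example #13, so unicode punctuation is replaced before the Moses rules run:

# Hypothetical usage of get_moses_punct_normalizer defined above.
normalizer = get_moses_punct_normalizer('en')
print(normalizer.normalize(u"0《123》 456% 【789】"))  # unicode punctuation is replaced first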
Example #16
def preprocess(src_file, mt_file, output_dir, tokenize_lang=None):
    """
        pre-process input file before post-editing
        split at <br> and remove <i> tags and music symbols.
        store everything in a codes file in output_dir

        Args:
            src_file: src_file of the translation to be preprocessed
            mt_file: output of the mt system file to be preprocessed
            output_dir: output directory to output the preprocessed files and codes file

    """

    punct_normalizer = MosesPunctNormalizer()

    # set tokenizer
    tokenizer = None
    if tokenize_lang:
        tokenizer = MosesTokenizer(lang=tokenize_lang)

    code_file = output_dir+'/codes.'+os.path.basename(mt_file)
    src_out_file = output_dir+'/'+os.path.basename(src_file)+'.pre'
    mt_out_file = output_dir+'/'+os.path.basename(mt_file)+'.pre'
    with open(src_out_file,'w') as fosrc, open(mt_out_file,'w') as fomt, open(code_file,'w') as fcodes, open(src_file) as fsrc, open(mt_file) as fmt:
        idx = 0
        for src, mt in zip(fsrc, fmt):
            src, mt = src.strip(), mt.strip()
            idx += 1

            # standardize br tags
            src = re.sub(r'<\s*br\s*/*>', '<br>', src, flags=re.IGNORECASE)
            mt = re.sub(r'<\s*br\s*/*>', '<br>', mt, flags=re.IGNORECASE)


            # if number of <br> is same, split and save it as multiple lines
            src_split = re.split(r'\s*<br>\s*',src)
            mt_split = re.split(r'\s*<br>\s*',mt)

            # if the src, mt, do not have the same number of <br>, then do not split it
            if len(src_split) != len(mt_split):
                src_split = [src]
                mt_split = [mt]

            for src_part, mt_part in zip(src_split, mt_split):
                code = "{}\t".format(idx)

                # check if they start with the hyphen
                has_hyphen = False
                if src_part.startswith('-'):
                    has_hyphen = True
                    src_part = src_part[1:].lstrip()

                if mt_part.startswith('-'):
                    has_hyphen = True
                    mt_part = mt_part[1:].lstrip()

                # check if they start with the music symbol
                music_syms = ('♫','♬','♪')
                has_music = False
                if re.search(r'\s*[{}]\s*'.format(''.join(music_syms)), src_part):
                    has_music = True
                    src_part = re.sub(r'\s*[{}]\s*'.format(''.join(music_syms)), '', src_part)

                #if mt_part.startswith(music_syms) or mt_part.endswith(music_syms):
                if re.search(r'\s*[{}]\s*'.format(''.join(music_syms)), mt_part):                
                    has_music = True
                    mt_part = re.sub(r'\s*[{}]\s*'.format(''.join(music_syms)), '', mt_part)

                # check if it has enclosing italics tags. otherwise leave it as it is
                itag = '<i>'
                eitag = '</i>'
                has_itag = False
                if src_part.startswith(itag) or src_part.endswith(eitag):
                    has_itag = True

                if mt_part.startswith(itag) or mt_part.endswith(eitag):
                    has_itag = True


                #if re.match(r'^<i>[^<]*</i>$', src_part):
                if has_hyphen:
                    code += 'HYPHENBEGIN\t'
                if has_music:
                    code += 'MUSIC\t'
                if has_itag:
                    code += 'ITALICTAGS\t'

                src_part = punct_normalizer.normalize(cleanup(src_part))
                mt_part = punct_normalizer.normalize(cleanup(mt_part))

                if tokenizer:
                    src_part = " ".join(tokenizer.tokenize(src_part, escape=False))
                    mt_part = " ".join(tokenizer.tokenize(mt_part, escape=False))

                fosrc.write(src_part.strip()+'\n')
                fomt.write(mt_part.strip()+'\n')
                fcodes.write("{}\n".format(code))
Example #17
def main(args):
    normalizer = MosesPunctNormalizer(lang=args.lang, penn=args.penn)
    for line in sys.stdin:
        print(normalizer.normalize(line.rstrip()), flush=True)
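The snippet assumes an argument parser defined elsewhere in the script; a minimal sketch of what that wiring might look like, with the option names inferred from the attributes used above:

# Hypothetical argument parsing; only --lang and --penn are implied by main() above.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Normalize punctuation read from stdin.")
    parser.add_argument("--lang", default="en", help="language passed to MosesPunctNormalizer")
    parser.add_argument("--penn", action="store_true", help="value passed to MosesPunctNormalizer's penn flag")
    main(parser.parse_args())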
Example #18
def main(args):

    print(args, file=sys.stderr)

    if args.human_scores:
        systems = []
        scores = {}
        for line in open(args.human_scores):
            system, score = line.rstrip().split()
            scores[system] = float(score)
        for system in args.systems:
            system_name = '.'.join(os.path.basename(system).split('.')[1:-1])
            if system_name not in scores:
                print(f"COULDN'T FIND SYSTEM {system_name}", file=sys.stderr)
            elif scores[system_name] > args.scope:
                systems.append(system)
    else:
        systems = args.systems

    if args.normalize:
        normalizer = MosesPunctNormalizer(lang='en', penn=False)

    if args.spm:
        sp = spm.SentencePieceProcessor()
        sp.Load(args.spm)

    # leave one out
    fds = [open(file) for file in systems]

    num_constraints = 0
    num_skipped = 0
    for lineno, (ref, *systems) in enumerate(zip(open(args.reference), *fds),
                                             1):

        def preprocess(text):
            if args.normalize:
                text = normalizer.normalize(text)
            if args.spm:
                text = ' '.join(sp.EncodeAsPieces(text))
            return ' '.join(text.split()[:args.maxlen])

        if len(ref.split()) > args.maxlen:
            continue

        ref_ngrams = sacrebleu.extract_ngrams(ref,
                                              min_order=args.ngram_min,
                                              max_order=args.ngram_max)

        ngrams = Counter()
        for system in systems:
            ngrams += sacrebleu.extract_ngrams(system,
                                               min_order=args.ngram_min,
                                               max_order=args.ngram_max)

        for ngram in ref_ngrams.keys():
            ngrams[ngram] = 0
        ngrams -= ref_ngrams
        if args.threshold <= 1:
            attested_ngrams = [
                ngram for ngram in ngrams.keys()
                if (ngrams[ngram] / len(systems)) >= args.threshold
            ]
        else:
            attested_ngrams = [
                ngram for ngram in ngrams.keys()
                if ngrams[ngram] >= args.threshold
            ]

        used_ngrams = []
        for ngram in sorted(attested_ngrams, key=len, reverse=True):
            for used in used_ngrams:
                if ngram in used:
                    #                    print(f"** {lineno} already saw '{ngram}' in '{used}', skipping", file=sys.stderr)
                    num_skipped += 1
                    break
            else:
                num_constraints += 1
                used_ngrams.append(ngram)
                j = {
                    'sentno': lineno,
                    'text': preprocess(ref),
                    'constraints': [preprocess(ngram)]
                }
                print(json.dumps(j, ensure_ascii=False), flush=True)
        #print(*attested_ngrams, sep='\t', flush=True)

    print(
        f"Created {num_constraints} constrained sentences, skipping {num_skipped} smaller ones",
        file=sys.stderr)
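As in the previous example, the args namespace comes from a parser defined elsewhere; a hedged sketch of a parser covering the attributes referenced above (option names and defaults are inferred, not taken from the original script):

# Hypothetical parser; every option below is inferred from args.* usage in main().
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Extract n-gram constraints from system outputs.")
    parser.add_argument("--reference", required=True)
    parser.add_argument("--systems", nargs="+", required=True)
    parser.add_argument("--human-scores")
    parser.add_argument("--scope", type=float, default=0.0)
    parser.add_argument("--normalize", action="store_true")
    parser.add_argument("--spm")
    parser.add_argument("--maxlen", type=int, default=100)
    parser.add_argument("--ngram-min", type=int, default=2)
    parser.add_argument("--ngram-max", type=int, default=4)
    parser.add_argument("--threshold", type=float, default=1.0)
    main(parser.parse_args())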