Example #1
 def moses_punct_norm(self, text, lang):
     if lang not in self.cache_moses_punct_normalizer:
         punct_normalizer = sm.MosesPunctNormalizer(lang=lang)
         self.cache_moses_punct_normalizer[lang] = punct_normalizer
     else:
         punct_normalizer = self.cache_moses_punct_normalizer[lang]
     return punct_normalizer.normalize(text)
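A self-contained version of the same caching pattern (the CachedNormalizer wrapper and the sample string are illustrative, not part of the original):

import sacremoses as sm

class CachedNormalizer:
    """Builds one MosesPunctNormalizer per language and reuses it."""

    def __init__(self):
        self.cache_moses_punct_normalizer = {}

    def moses_punct_norm(self, text, lang):
        # construct lazily on first use, then serve every later call from the cache
        if lang not in self.cache_moses_punct_normalizer:
            self.cache_moses_punct_normalizer[lang] = sm.MosesPunctNormalizer(lang=lang)
        return self.cache_moses_punct_normalizer[lang].normalize(text)

norm = CachedNormalizer()
print(norm.moses_punct_norm("«Hello»  ,  world", "en"))  # French quotes and stray spaces get normalized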
Example #2
    def __init__(self, config_file):
        with open(config_file) as f:
            self.__dict__.update(yaml.safe_load(f))
        assert self.type in {"cn2en", "en2cn"}
        cur_path = os.path.dirname(os.path.realpath(__file__))
        # close the codes file once BPE has read it (BPE consumes it in its constructor)
        with codecs.open(self.codes_file, encoding='utf-8') as codes:
            self.tokenizer = BPE(codes)

        if self.type == "en2cn":
            # pre_process: normalize, tokenize, substitute entities, lowercase, apply BPE
            # post_process: remove BPE markers, remove spaces
            prefixes_file = os.path.join(cur_path, self.en_tokenizer)
            self.en_normalize_punctuation = sacremoses.MosesPunctNormalizer(
                lang="en")
            self.en_tokenizer = sacremoses.MosesTokenizer(
                lang='en', custom_nonbreaking_prefixes_file=prefixes_file)
        elif self.type == "cn2en":
            # pre_process: tokenize, bpe
            # post_process: delbpe,detruecase,detokenize
            self.detruecase = sacremoses.MosesDetruecaser()
            self.detokenize = sacremoses.MosesDetokenizer(lang='en')
            self.client = aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=3600),
                connector=aiohttp.TCPConnector(limit=sys.maxsize,
                                               limit_per_host=sys.maxsize))
            self.cn2en_trans_dict = slang_dict(self.trans_dict_file)
            self.chinese_char_pattern = re.compile(u"[\u4E00-\u9FA5]+")
            self.stops = re.compile(u"[.!?！？。]+")  # ASCII and full-width sentence enders
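The en2cn pre-processing chain named in the comments can be sketched standalone like this (entity substitution and the BPE step are omitted; the sample sentence is invented):

import sacremoses

normalize_punct = sacremoses.MosesPunctNormalizer(lang="en")
tokenize = sacremoses.MosesTokenizer(lang="en")

def pre_process(text):
    # normalize -> tokenize -> lowercase, as in the pre_process comment above
    text = normalize_punct.normalize(text)
    return [token.lower() for token in tokenize.tokenize(text, escape=False)]

print(pre_process("Doesn't “this” work?"))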
Example #3
    def __init__(self,
                 special=None,
                 min_freq=0,
                 max_size=None,
                 lower_case=False,
                 delimiter=None,
                 vocab_file=None,
                 pretrained_vocab_file: str = None,
                 never_split=None,
                 unk_token="<unk>",
                 eos_token="<eos>",
                 additional_special_tokens=["<formula>"],
                 language="en",
                 **kwargs):
        super().__init__(unk_token=unk_token,
                         eos_token=eos_token,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        if never_split is None:
            never_split = self.all_special_tokens
        if special is None:
            special = []
        self.counter = Counter()
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file
        self.never_split = never_split
        self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
        self.punction_without_space_before_pattern = re.compile(
            r"[^\s][{}]".format(self.punctuation_symbols))
        self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern(
        )
        self.language = language
        self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
        self.moses_tokenizer = sm.MosesTokenizer(language)
        self.moses_detokenizer = sm.MosesDetokenizer(language)

        # This try... catch... is not beautiful but honestly this tokenizer was not made to be used
        # in a library like ours, at all.
        try:
            vocab_dict = None
            if pretrained_vocab_file is not None:
                # Priority on pickle files (support PyTorch and TF)
                with open(pretrained_vocab_file, "rb") as f:
                    vocab_dict = pickle.load(f)

                # Loading a torch-saved transfo-xl vocab dict with pickle results in an integer
                # Entering this if statement means that we tried to load a torch-saved file with pickle, and we failed.
                # We therefore load it with torch, if it's available.
                if isinstance(vocab_dict, int):
                    if not is_torch_available():
                        raise ImportError(
                            "Not trying to load dict with PyTorch as you need to install pytorch to load "
                            "from a PyTorch pretrained vocabulary, "
                            "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
                        )
                    vocab_dict = torch.load(pretrained_vocab_file)

            if vocab_dict is not None:
                for key, value in vocab_dict.items():
                    if key not in self.__dict__:
                        self.__dict__[key] = value
            elif vocab_file is not None:
                self.build_vocab()

        except Exception as e:
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizerFast,"
                "please note they are not compatible.".format(
                    pretrained_vocab_file)) from e

        if vocab_file is not None:
            self.build_vocab()
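The three Moses helpers built above are typically chained as normalize -> tokenize on the way in and detokenize on the way out; a minimal round trip (sample text invented):

import sacremoses as sm

normalizer = sm.MosesPunctNormalizer("en")
tokenizer = sm.MosesTokenizer("en")
detokenizer = sm.MosesDetokenizer("en")

text = normalizer.normalize("Hello  ,  world !")   # collapses runs of spaces, fixes ' ,' -> ','
tokens = tokenizer.tokenize(text, escape=False)    # escape=False keeps raw punctuation characters
print(tokens)                                      # e.g. ['Hello', ',', 'world', '!']
print(detokenizer.detokenize(tokens))              # e.g. 'Hello, world!'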
Example #4
    parser.add_argument('-L', '--lang', default='en')
    parser.add_argument('-o', '--overwrite', action='store_true')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()

    if os.path.isfile(args.output_tsv) and not args.overwrite:
        print(
            f'output file: {args.output_tsv} exists, use -o/--overwrite to force overwrite'
        )
        exit(1)

    verbose(args, args)

    normalizer = sacremoses.MosesPunctNormalizer(
        lang=args.lang,
        pre_replace_unicode_punct=True,
        post_remove_control_chars=True,
    )
    p_list = set(string.punctuation) - set("'-")

    lines = []
    with open(args.input_tsv, 'r') as f:
        reader = csv.DictReader(
            f,
            delimiter='\t',
            quotechar=None,
            doublequote=False,
            lineterminator='\n',
            quoting=csv.QUOTE_NONE,
        )
        for line in reader:
            lines.append(line)  # the original loop body is cut off here; collecting rows is a guess
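For reference, a self-contained sketch of what the truncated reader loop plausibly does with the normalizer (the 'text' column name and the sample row are assumptions, not from the original):

import csv
import io
import sacremoses

normalizer = sacremoses.MosesPunctNormalizer(
    lang='en',
    pre_replace_unicode_punct=True,
    post_remove_control_chars=True,
)

# stand-in for args.input_tsv; the 'text' column is an assumed schema
sample = io.StringIO('id\ttext\n1\t«Hello»  ,  world…\n')
reader = csv.DictReader(sample, delimiter='\t', quotechar=None,
                        doublequote=False, lineterminator='\n',
                        quoting=csv.QUOTE_NONE)
for row in reader:
    row['text'] = normalizer.normalize(row['text'])
    print(row)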
Example #5
    def __init__(self,
                 special=None,
                 min_freq=0,
                 max_size=None,
                 lower_case=False,
                 delimiter=None,
                 vocab_file=None,
                 pretrained_vocab_file=None,
                 never_split=None,
                 unk_token="<unk>",
                 eos_token="<eos>",
                 additional_special_tokens=["<formula>"],
                 language="en",
                 **kwargs):
        super().__init__(unk_token=unk_token,
                         eos_token=eos_token,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        if never_split is None:
            never_split = self.all_special_tokens
        if special is None:
            special = []
        self.counter = Counter()
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file
        self.never_split = never_split
        self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
        self.punction_without_space_before_pattern = re.compile(
            r"[^\s][{}]".format(self.punctuation_symbols))
        self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern(
        )
        self.language = language
        self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
        self.moses_tokenizer = sm.MosesTokenizer(language)
        self.moses_detokenizer = sm.MosesDetokenizer(language)

        try:
            if pretrained_vocab_file is not None:
                # Hack because, honestly this tokenizer was not made to be used
                # in a library like ours, at all.
                vocab_dict = torch.load(pretrained_vocab_file)
                for key, value in vocab_dict.items():
                    if key not in self.__dict__:
                        self.__dict__[key] = value

            if vocab_file is not None:
                self.build_vocab()
        except Exception as e:
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizerFast, "
                "please note they are not compatible.".format(
                    pretrained_vocab_file)) from e

        if vocab_file is not None:
            self.build_vocab()
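The vocab-restoring trick above (update __dict__ from the saved mapping without clobbering attributes the constructor already set) works like this in isolation; Dummy and the sample dict are illustrative:

class Dummy:
    def __init__(self, vocab_dict):
        self.min_freq = 0
        # copy saved state in, but never overwrite attributes set above
        for key, value in vocab_dict.items():
            if key not in self.__dict__:
                self.__dict__[key] = value

d = Dummy({"min_freq": 5, "idx2sym": ["<unk>", "<eos>"]})
print(d.min_freq)  # 0 -- the constructor's value wins over the saved one
print(d.idx2sym)   # ['<unk>', '<eos>'] -- restored from the saved dict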
Example #6
 def __init__(
     self,
     special=None,
     min_freq=0,
     max_size=None,
     lower_case=False,
     delimiter=None,
     vocab_file=None,
     pretrained_vocab_file=None,
     never_split=None,
     unk="<unk>",
     eos="<eos>",
     additional_special_tokens=["<formula>"],
     language="en",
     **kw,
 ):
     super().__init__(
         special=special,
         min_freq=min_freq,
         max_size=max_size,
         lower_case=lower_case,
         delimiter=delimiter,
         vocab_file=vocab_file,
         pretrained_vocab_file=pretrained_vocab_file,
         never_split=never_split,
         unk=unk,
         eos=eos,
         additional_special_tokens=additional_special_tokens,
         language=language,
         **kw,
     )
     if never_split is None:
         never_split = self.all_special_tokens
     if special is None:
         special = []
     self.counter = Counter()
     self.special = special
     self.min_freq = min_freq
     self.max_size = max_size
     self.lower_case = lower_case
     self.delimiter = delimiter
     self.vocab_file = vocab_file
     self.never_split = never_split
     self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
     self.punction_without_space_before_pattern = re.compile(
         rf"[^\s][{self.punctuation_symbols}]"
     )
     self.punctuation_with_space_around_pattern = (
         self._compile_space_around_punctuation_pattern()
     )
     self.language = language
     self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
     self.moses_tokenizer = sm.MosesTokenizer(language)
     self.moses_detokenizer = sm.MosesDetokenizer(language)
     try:
         vocab_dict = None
         if pretrained_vocab_file is not None:
             with open(pretrained_vocab_file, "rb") as f:
                 vocab_dict = pickle.load(f)
              if isinstance(vocab_dict, int):
                 if not is_torch_available():
                     raise ImportError(
                         "Not trying to load dict with PyTorch as you need to install pytorch to load "
                         "from a PyTorch pretrained vocabulary, "
                         "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
                     )
                 vocab_dict = torch.load(pretrained_vocab_file)
         if vocab_dict is not None:
             for key, value in vocab_dict.items():
                 if key not in self.__dict__:
                     self.__dict__[key] = value
         elif vocab_file is not None:
             self.build_vocab()
     except Exception as e:
         raise ValueError(
             f"Unable to parse file {pretrained_vocab_file}. Unknown format. "
             "If you tried to load a model saved through TokenizerFast, "
             "please note they are not compatible."
         ) from e
     if vocab_file is not None:
         self.build_vocab()
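The pickle-first, torch-fallback probe used above can be factored into a small helper (a sketch; load_vocab_dict and the 'vocab.bin' path are made up here):

import pickle

def load_vocab_dict(path):
    # a pickle-saved vocab loads directly; a torch-saved file unpickles to an
    # integer first (see the comment in Example #3), which signals that
    # torch.load is needed instead
    with open(path, "rb") as f:
        obj = pickle.load(f)
    if isinstance(obj, int):
        import torch  # assumed installed, mirroring the is_torch_available() check above
        obj = torch.load(path)
    return obj

# vocab_dict = load_vocab_dict("vocab.bin")  # hypothetical usage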