Esempio n. 1
0
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> AuxiliaryFile:
        subset = self._create_subset_file(afm, corpus)

        # Create WordPiece model with a normalizer and pre-tokenizer. Note that
        # BERT-specific normalizer and pre-tokenizer are used in this model.
        tokenizer = Tokenizer(WordPiece())
        tokenizer.normalizer = BertNormalizer(strip_accents=False)
        tokenizer.pre_tokenizer = BertPreTokenizer()

        # Train tokenizer model with subset of corpus.
        trainer = WordPieceTrainer(vocab_size=self.vocab_size,
                                   min_frequency=2,
                                   show_progress=True,
                                   limit_alphabet=self.limit_alphabet,
                                   special_tokens=[self.unk_token] +
                                   self.special_tokens,
                                   continuing_subword_prefix='##')
        tokenizer.train(trainer, [subset.name])

        # Save trained vocabulary to an auxiliary output file.
        vocab = afm.create()
        tokenizer.model.save(os.path.dirname(vocab.name))

        os.rename(os.path.join(os.path.dirname(vocab.name), 'vocab.txt'),
                  vocab.name)

        return vocab
Esempio n. 2
0
def train_custom_tokenizer(files: List[str], tokenizer_file: str,
                           **kwargs) -> BertWordPieceTokenizer:
    """ Tokenizerの学習・保存処理:custom PreTokenizer付きのTokenizerを学習・保存する。
    """

    tokenizer = BertWordPieceTokenizer(
        handle_chinese_chars=False,  # for japanese
        strip_accents=False,  # for japanese
    )
    tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(
        MecabPreTokenizer())

    # 与えられたコーパスファイル集合からサブワード分割を学習
    tokenizer.train(files, **kwargs)

    # vocab情報に加えて、前処理等パラメータ情報を含んだトークナイザ設定のJSONを保存
    # NOTE: Pythonで書かれたcustom PreTokenizerはシリアライズできないので、RustベースのPreTokenizerをダミー注入してシリアライズ
    # JSONにはダミーのPreTokenizerが記録されるので、ロード時にcustom PreTokenizerを再設定する必要がある。
    tokenizer._tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.save(tokenizer_file)

    # (Optional) .txt形式のvocabファイルは f"vocab-{filename}.txt" で保存される(外部の処理で欲しい場合)
    filename = "wordpiece"
    model_files = tokenizer._tokenizer.model.save(
        str(pathlib.Path(tokenizer_file).parent), filename)

    return tokenizer
Esempio n. 3
0
 def save_pretrained(self,
                     dir,
                     legacy_format=None,
                     pre=None,
                     push_to_hub=False,
                     **kw):
     self.backend_tokenizer.pre_tokenizer = BertPreTokenizer()
     return super().save_pretrained(dir, legacy_format, pre, push_to_hub,
                                    **kw)
Esempio n. 4
0
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        add_special_tokens: bool = True,
        unk_token: str = "[UNK]",
        sep_token: str = "[SEP]",
        cls_token: str = "[CLS]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: bool = True,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):

        if vocab_file is not None:
            tokenizer = Tokenizer(
                WordPiece.from_files(vocab_file, unk_token=unk_token))
        else:
            tokenizer = Tokenizer(WordPiece.empty())

        tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if add_special_tokens and vocab_file is not None:
            sep_token_id = tokenizer.token_to_id(sep_token)
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(cls_token)
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing(
                (sep_token, sep_token_id), (cls_token, cls_token_id))
        tokenizer.decoders = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "add_special_tokens": add_special_tokens,
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)
Esempio n. 5
0
 def save_pretrained(
     self,
     save_directory,
     legacy_format=None,
     filename_prefix=None,
     push_to_hub=False,
     **kwargs,
 ):
     self.backend_tokenizer.pre_tokenizer = BertPreTokenizer()
     return super().save_pretrained(save_directory, legacy_format,
                                    filename_prefix, push_to_hub, **kwargs)
Esempio n. 6
0
def tokenize_corpus(
        input_file: str,
        output_file: str,
        vocab_file: str,
        unk_token: str = '<unk>',
        control_tokens: List[str] = []):
    r"""Tokenize corpus sentences through trained **WordPiece** model.

    Arguments:
        input_file (str): Input corpus file path.
        output_file (str): Output file path.
        vocab_file (str): Trained vocabulary file path.
        unk_token (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.
    """
    # Create `WordPiece` model and add special tokens. Note that `unk_token`
    # is also a special token.normalizer and pre-tokenizer.
    tokenizer = Tokenizer(models.WordPiece(vocab_file, unk_token=unk_token))
    tokenizer.add_special_tokens([unk_token] + control_tokens)

    # Use BERT-specific normalizer, pre-tokenizer and **WordPiece** decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = decoders.WordPiece(prefix='##')

    with open(input_file, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as dst:
        # Count total lines in corpus.
        total_lines = 0
        for _ in src:
            total_lines += 1

        # Move the corpus file to first.
        src.seek(0)

        buffer = []
        for line in tqdm.tqdm(src,
                              desc='[*] tokenize corpus',
                              total=total_lines):
            buffer.append(line)

            # Tokenize buffered sentences and write to `output_file`.
            if len(buffer) > 10000:
                for t in tokenizer.encode_batch(buffer):
                    dst.write(' '.join(t.tokens) + '\n')
                buffer.clear()

        # Process the remained buffer.
        if buffer:
            for t in tokenizer.encode_batch(buffer):
                dst.write(' '.join(t.tokens) + '\n')
Esempio n. 7
0
 def __init__(self, pretokenizer='moses'):
     self.tagger = PerceptronTagger()
     self.pretok_type = pretokenizer
     if pretokenizer == 'bertpretokenizer':
         self.pretokenizer = BertPreTokenizer()
     elif pretokenizer == 'moses':
         self.pretokenizer = MosesTokenizer()
         self.detokenizer = MosesDetokenizer()
     elif pretokenizer == 'whitespace':
         pass
     else:
         raise ValueError(
             "pretokenizer must be 'bertpretokenizer', 'moses', or 'whitespace'."
         )
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        unk_token: Optional[str] = "<unk>",
        suffix: Optional[str] = "</w>",
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
    ):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token,
                               end_of_word_suffix=suffix))
        else:
            tokenizer = Tokenizer(BPE.empty())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        # OpenAI normalization is the same as Bert
        normalizers += [BertNormalizer()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.decoder = BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
Esempio n. 9
0
    def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
              vocab: AuxiliaryFile) -> AuxiliaryFile:
        total_lines = self._total_lines_in_file(corpus)

        # Create WordPiece model and add special tokens. Note that `unk_token`
        # is also a special token.
        tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
        tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

        # Use BERT-specific normalizer, pre-tokenizer and decoder.
        tokenizer.normalizer = BertNormalizer(strip_accents=False)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.decoder = WordPieceDecoder(prefix='##')

        tokenized = afm.create()
        with corpus.open('r') as src, tokenized.open('w') as dst:
            # Create tqdm progress bar with colorful description.
            tqdm_iter = tqdm.tqdm(src,
                                  desc=colorful.render(
                                      '<r>[*]</r> tokenize sentences with '
                                      '<g>WordPiece</g> model'),
                                  total=total_lines)

            batch_lines = []
            for line in tqdm_iter:
                batch_lines.append(line)

                # Encode the grouped batch sentences and write the tokenized
                # sentences to the auxiliary output file.
                if len(batch_lines) > self.batch_size:
                    for t in tokenizer.encode_batch(batch_lines):
                        dst.write(' '.join(t.tokens) + '\n')
                    batch_lines.clear()

            # Encode the remainders and write to the output file.
            if batch_lines:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')

        return tokenized
Esempio n. 10
0
    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: Optional[bool] = None,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):

        if vocab is not None:
            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)
Esempio n. 11
0
 def __getstate__(self):
     state = self.__dict__.copy()
     state["_tokenizer"].pre_tokenizer = BertPreTokenizer()
     return state
Esempio n. 12
0
 def test_instantiate(self):
     assert BertPreTokenizer() is not None
     assert isinstance(BertPreTokenizer(), PreTokenizer)
Esempio n. 13
0
def train_tokenizer(
        input_file: str,
        vocab_file: str,
        temporary: str,
        subset_size: int = 512000000,
        vocab_size: int = 8000,
        limit_alphabet: int = 6000,
        unk_token: str = '<unk>',
        control_tokens: List[str] = []):
    r"""Train **WordPiece** tokenizer and save trained subword vocabulary.

    Note:
        Since tokenizers_ reads whole file data in training, this function
        could occur memory errors if `input_file` is too large. Under the
        assumption that `input_file` is shuffled randomly, the subset of input
        corpus will be used in training.

    Caution:
        The subset of input corpus is saved in `temporary` directory. Please be
        careful not to delete the file while executing this function.

    Arguments:
        input_file (str): Input file path.
        vocab_file (str): Output vocabulary file path.
        temporary (str): Temporary directory where the subset of corpus would
            be saved.
        subset_size (int): The maximum number of lines in the subset.
        vocab_size (int): The number of subwords in the vocabulary.
        limit_alphabet (int): The maximum number of alphabets in vocabulary.
        unk_tokens (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.

    .. _tokenizers: https://github.com/huggingface/tokenizers
    """
    # Create **WordPiece** model and add normalizer and pre-tokenizer.
    # BERT-specific normalizer and pre-tokenizer are used.
    tokenizer = Tokenizer(models.WordPiece())

    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # Split the head of input corpus file and save in `temporary` directory.
    subset_file = random_filename(temporary)
    _split_subset_from_file(input_file, subset_file, subset_size)

    # Train the model with splitted subset of corpus.
    trainer = WordPieceTrainer(vocab_size=vocab_size,
                               min_frequency=2,
                               show_progress=True,
                               limit_alphabet=limit_alphabet,
                               special_tokens=[unk_token] + control_tokens,
                               continuing_subword_prefix='##')
    tokenizer.train(trainer, [subset_file])

    # Save trained subword vocabulary in `temporary` directory and rename to
    # `vocab_file`.
    tokenizer.model.save(temporary)
    os.rename(os.path.join(temporary, 'vocab.txt'), vocab_file)

    # Remove temporary subset corpus.
    os.remove(subset_file)
Esempio n. 14
0
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) +
                    '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        #example = ' '.join(example.split(' ')[-510:])
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels, doc_idx in zip(
                                    example_batch, labels_batch,
                                    doc_idx_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                """try:
                                    [][0]
                                    print("DOC_LEN:",len(example.overflowing)+1)
                                    mid = len(example.overflowing)//2
                                    out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                                except IndexError:
                                    out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                                if args.all_blocks or args.n_blocks > 0:
                                    blocks = [example.ids] + [
                                        blk.ids for blk in example.overflowing
                                    ]
                                    #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(
                                            json.dumps(
                                                [block, labels, doc_idx]) +
                                            '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all(
                                        [type(y) is int for y in window])
                                    out_f.write(
                                        json.dumps([window, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels, doc_idx in zip(
                            example_batch, labels_batch, doc_idx_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        """try:
                            [][0]
                            print("DOC_LEN:",len(example.overflowing)+1)
                            mid = len(example.overflowing)//2
                            out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                        except IndexError:
                            out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                        if args.all_blocks or args.n_blocks > 0:
                            blocks = [example.ids] + [
                                blk.ids for blk in example.overflowing
                            ]
                            #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                            for b, block in enumerate(blocks, 2):
                                if b > args.n_blocks and args.n_blocks > 0:
                                    break
                                out_f.write(
                                    json.dumps([block, labels, doc_idx]) +
                                    '\n')
                        else:
                            out_f.write(
                                json.dumps(
                                    [get_window(example, START_POS), labels]) +
                                '\n')
Esempio n. 15
0
 def test_instantiate(self):
     assert BertPreTokenizer() is not None
     assert isinstance(BertPreTokenizer(), PreTokenizer)
     assert isinstance(BertPreTokenizer(), BertPreTokenizer)
     assert isinstance(pickle.loads(pickle.dumps(BertPreTokenizer())), BertPreTokenizer)
Esempio n. 16
0
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(
                    json.dumps((examples_per_file[input_file],
                                len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    for example, labels in g:

                        example_batch.append(example)
                        labels_batch.append(labels)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels in zip(example_batch,
                                                       labels_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                out_f.write(
                                    json.dumps([example.ids, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels in zip(example_batch, labels_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        out_f.write(json.dumps([example.ids, labels]) + '\n')
Esempio n. 17
0
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 unk_token: Union[str, AddedToken] = "<unk>",
                 sep_token: Union[str, AddedToken] = "</s>",
                 cls_token: Union[str, AddedToken] = "<s>",
                 nl_token: Union[str, AddedToken] = "<nl>",
                 pad_token: Union[str, AddedToken] = "<pad>",
                 mask_token: Union[str, AddedToken] = "<mask>",
                 clean_text: bool = True,
                 handle_chinese_chars: bool = True,
                 separate_numbers: bool = True,
                 strip_accents: bool = True,
                 lowercase: bool = True,
                 wordpieces_prefix: str = "##",
                 special_chars: str = SPECIAL_CHARS,
                 zh_norm: bool = True,
                 handle_simpl: bool = True,
                 do_postprocess: bool = False):

        if vocab_file is not None:
            tokenizer = Tokenizer(
                WordPiece(vocab_file, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece())

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(nl_token)) is not None:
            tokenizer.add_special_tokens([str(nl_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        tokenizer.normalizer = Sequence([
            NFKC(),
            BertNormalizer(clean_text=clean_text,
                           handle_chinese_chars=handle_chinese_chars,
                           separate_numbers=separate_numbers,
                           strip_accents=strip_accents,
                           lowercase=lowercase,
                           special_chars=special_chars,
                           zh_norm=zh_norm,
                           handle_simpl=handle_simpl)
        ])
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab_file is not None and do_postprocess:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")
            tokenizer.post_processor = BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))

        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "nl_token": nl_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "separate_numbers": separate_numbers,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "special_chars": special_chars,
            "zh_norm": zh_norm,
            "handle_simpl": handle_simpl,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)