Example #1
def tokenize_corpus(
        input_file: str,
        output_file: str,
        vocab_file: str,
        unk_token: str = '<unk>',
        control_tokens: List[str] = []):
    r"""Tokenize corpus sentences through trained **WordPiece** model.

    Arguments:
        input_file (str): Input corpus file path.
        output_file (str): Output file path.
        vocab_file (str): Trained vocabulary file path.
        unk_token (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.
    """
    # Create `WordPiece` model and add special tokens. Note that `unk_token`
    # is also a special token.
    tokenizer = Tokenizer(models.WordPiece(vocab_file, unk_token=unk_token))
    tokenizer.add_special_tokens([unk_token] + control_tokens)

    # Use BERT-specific normalizer, pre-tokenizer and **WordPiece** decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = decoders.WordPiece(prefix='##')

    with open(input_file, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as dst:
        # Count total lines in corpus.
        total_lines = 0
        for _ in src:
            total_lines += 1

        # Rewind the corpus file to the beginning.
        src.seek(0)

        buffer = []
        for line in tqdm.tqdm(src,
                              desc='[*] tokenize corpus',
                              total=total_lines):
            buffer.append(line)

            # Tokenize buffered sentences and write to `output_file`.
            if len(buffer) > 10000:
                for t in tokenizer.encode_batch(buffer):
                    dst.write(' '.join(t.tokens) + '\n')
                buffer.clear()

        # Process the remaining buffer.
        if buffer:
            for t in tokenizer.encode_batch(buffer):
                dst.write(' '.join(t.tokens) + '\n')
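For context, here is a minimal invocation sketch for the function above. Every path and the control-token list are placeholders, not values taken from the original project.

# Hypothetical call of tokenize_corpus; all paths below are placeholders.
tokenize_corpus(
    input_file='corpus.txt',                 # one raw sentence per line
    output_file='corpus.tokenized.txt',      # receives space-joined WordPiece tokens
    vocab_file='wordpiece-vocab.txt',        # produced beforehand by a WordPiece trainer
    unk_token='<unk>',
    control_tokens=['<s>', '</s>', '<pad>'])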
Example #2
    def test_encode(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can encode single sequence
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name", "is", "john"]
        assert type(output.ids) == list
        assert type(output.type_ids) == list
        assert type(output.offsets) == list
        with pytest.warns(DeprecationWarning):
            assert type(output.words) == list
        assert type(output.word_ids) == list
        assert type(output.special_tokens_mask) == list
        assert type(output.attention_mask) == list
        assert type(output.overflowing) == list

        # Can encode a pair of sequences
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "name", "is", "john", "pair"]
        assert isinstance(pickle.loads(pickle.dumps(output)), Encoding)

        # Can encode a single pre-tokenized sequence
        output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
        assert output.tokens == ["my", "name", "is", "john"]

        # Can encode a batch with both a single sequence and a pair of sequences
        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
        assert len(output) == 2
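As a complement to the pair assertions above, here is a hedged sketch of attaching a template post-processor so that pairs get [CLS]/[SEP] markers and distinct type_ids. The template and the two special tokens are assumptions for illustration, not part of the original test.

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.processors import TemplateProcessing

tokenizer = Tokenizer(BPE())
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
tokenizer.add_special_tokens(["[CLS]", "[SEP]"])
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", tokenizer.token_to_id("[CLS]")),
                    ("[SEP]", tokenizer.token_to_id("[SEP]"))])
output = tokenizer.encode("my name is john", "pair")
# output.tokens -> ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
# output.type_ids marks the second segment ("pair" and its closing [SEP]) with 1.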
Example #3
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((DataTrainingArguments, CustomOthersArguments))

    (data_args, custom_args) = parser.parse_args_into_dataclasses()

    train_files = list(sorted(glob.glob(f'{data_args.train_dir}/*.{data_args.ext}')))
    validation_files = list(sorted(glob.glob(f'{data_args.eval_dir}/*.{data_args.ext}')))

    additional_special_tokens = ADDITIONAL_SPECIAL_TOKENS

    pre_tokenizer_func = PRE_TOKENIZERS_MAP.get(custom_args.pre_tokenizer_type, None)
    if pre_tokenizer_func is None:
        raise NotImplementedError
    elif custom_args.pre_tokenizer_type == 'sefr_cut':
        raise ValueError('sefr_cut is slow; use fake_sefr_cut with sefr_cut_pre_tokenizer instead')

    if not os.path.exists(custom_args.output_file) or custom_args.overwrite_output_file:
        trainer = WordLevelTrainer(pre_tokenize_func=pre_tokenizer_func,
                                   vocab_size=custom_args.vocab_size,
                                   vocab_min_freq=custom_args.vocab_min_freq,
                                   input_files=train_files,
                                   additional_special_tokens=additional_special_tokens)
        trainer.count_parallel()
        trainer.save_vocab(custom_args.output_file)
    if custom_args.pre_tokenizer_type == 'fake_sefr_cut':
        custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            FakeSefrCustomTokenizer(PRE_TOKENIZERS_MAP['fake_sefr_cut_keep_split_token']))
    else:
        custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            CustomPreTokenizer(pre_tokenizer_func))
    tokenizer = Tokenizer(models.WordLevel.from_file(custom_args.output_file, unk_token='<unk>'))
    tokenizer.pre_tokenizer = custom_pre_tokenizer

    if custom_args.debug:
        print('Tokenizing the following texts.')
        texts = ['<s>โรนัลโดเขาได้เล่นกับทีม</s>', 'โปรตุเกสมีโรนัลโด',
                 'โรนัลโดเขาได้เล่นกับทีม\nโปรตุเกสมีโรนัลโด']
        ids = [e.ids for e in tokenizer.encode_batch(texts)]
        decoded_texts = tokenizer.decode_batch(ids)
        decoded_texts = [text.replace(' ', '') for text in decoded_texts]
        for text, i, decoded_text in zip(texts, ids, decoded_texts):
            print('Text: ', text, '>>', 'Tokenized: ', i, '>>', 'Decoded: ', decoded_text)
        with open(validation_files[0], 'r') as f:
            while True:
                line = f.readline()
                if line:
                    line = line.strip()
                    if len(line) > 0 and not line.isspace():
                        encoded = tokenizer.encode(line)
                        decoded = tokenizer.decode(encoded.ids).replace(' ', '')
                        print('Text: ', line, '>>', encoded.ids, '>>', decoded)
                else:
                    break
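The PreTokenizer.custom(...) calls above wrap plain Python objects. Below is a hedged sketch of the duck-typed interface such a wrapper is expected to expose, based on the custom-components pattern in the tokenizers documentation; the project's own CustomPreTokenizer and FakeSefrCustomTokenizer may be implemented differently.

from typing import List
from tokenizers import NormalizedString, PreTokenizedString, pre_tokenizers

class SplitFuncPreTokenizer:
    """Minimal custom pre-tokenizer driven by a user-supplied split function."""

    def __init__(self, split_func):
        self.split_func = split_func  # e.g. a Thai word segmenter returning a list of strings

    def _split(self, i: int, normalized: NormalizedString) -> List[NormalizedString]:
        return [NormalizedString(tok) for tok in self.split_func(str(normalized))]

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self._split)

custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(SplitFuncPreTokenizer(str.split))
# Note: a Tokenizer carrying custom Python components cannot be serialized with save()/to_str().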
Example #4
    def __init__(self,
                 tokenizer: Tokenizer,
                 args,
                 file_paths: list,
                 block_size=512):
        assert all([os.path.isfile(file_path) for file_path in file_paths])

        block_size = block_size - 2  # Reduce by 2 to account for [CLS] and [SEP] tokens

        directory, filename = os.path.split(file_paths[0])
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size) +
            "_" + Path(filename).stem)

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Reading dataset at %s", file_paths)
            text = []
            for file_path in file_paths:
                with open(file_path, encoding="utf-8") as f:
                    text += f.readlines()

            logger.info("Creating features from dataset file at %s", directory)
            # Get all token IDs except [CLS] and [SEP] and flat map IDs
            tokenized_text = [
                t for tokenized in tokenizer.encode_batch(text)
                for t in tokenized.ids[1:-1]
            ]
            cls_token, sep_token = tokenizer.encode('').ids

            self.examples = []
            for i in range(0,
                           len(tokenized_text) - block_size + 1,
                           block_size):  # Truncate in block of block_size
                self.examples.append([cls_token] +
                                     tokenized_text[i:i + block_size] +
                                     [sep_token])
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

            logger.info("Saving features into cached file %s",
                        cached_features_file)
            Path(cached_features_file).parent.mkdir(exist_ok=True,
                                                    parents=True)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
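For completeness, a hedged sketch of the companion methods such a dataset class usually defines; they are not part of the snippet above, and torch is assumed to be imported in the surrounding module.

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # Each example is already a fixed-size block of token ids framed by [CLS]/[SEP].
        return torch.tensor(self.examples[i], dtype=torch.long)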
Example #5
    def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
              vocab: AuxiliaryFile) -> AuxiliaryFile:
        total_lines = self._total_lines_in_file(corpus)

        # Create WordPiece model and add special tokens. Note that `unk_token`
        # is also a special token.
        tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
        tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

        # Use BERT-specific normalizer, pre-tokenizer and decoder.
        tokenizer.normalizer = BertNormalizer(strip_accents=False)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.decoder = WordPieceDecoder(prefix='##')

        tokenized = afm.create()
        with corpus.open('r') as src, tokenized.open('w') as dst:
            # Create tqdm progress bar with colorful description.
            tqdm_iter = tqdm.tqdm(src,
                                  desc=colorful.render(
                                      '<r>[*]</r> tokenize sentences with '
                                      '<g>WordPiece</g> model'),
                                  total=total_lines)

            batch_lines = []
            for line in tqdm_iter:
                batch_lines.append(line)

                # Encode the grouped batch sentences and write the tokenized
                # sentences to the auxiliary output file.
                if len(batch_lines) > self.batch_size:
                    for t in tokenizer.encode_batch(batch_lines):
                        dst.write(' '.join(t.tokens) + '\n')
                    batch_lines.clear()

            # Encode the remainders and write to the output file.
            if batch_lines:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')

        return tokenized
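The helper _total_lines_in_file is referenced above but not shown. A hedged sketch of what it plausibly does (the project's real implementation may differ):

    @staticmethod
    def _total_lines_in_file(file: AuxiliaryFile) -> int:
        # Count lines so the tqdm progress bar can display a total.
        with file.open('r') as fp:
            return sum(1 for _ in fp)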
Example #6
class Wikitext2RawConverter(BaseFormatConverter):
    __provider__ = "wikitext2raw"
    annotation_types = (LanguageModelingAnnotation, )

    @classmethod
    def parameters(cls):
        configuration_parameters = super().parameters()
        configuration_parameters.update({
            'testing_file': PathField(description="Path to testing file."),
            'merges_file': PathField(description="Path to merges file."),
            'vocab_file': PathField(description='Path to vocabulary file.'),
            'max_seq_length': NumberField(
                description='The maximum total input sequence length after tokenization.',
                optional=True, default=128, value_type=int
            ),
        })

        return configuration_parameters

    def configure(self):
        self.testing_file = self.get_value_from_config('testing_file')
        self.vocab_file = self.get_value_from_config('vocab_file')
        self.merges_file = self.get_value_from_config('merges_file')
        self.max_seq_length = int(self.get_value_from_config('max_seq_length'))
        self.tokenizer = Tokenizer(BPE(str(self.vocab_file), str(self.merges_file)))
        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        self.tokenizer.decoder = decoders.ByteLevel()

    def convert(self, check_content=False, progress_callback=None, progress_interval=100, **kwargs):
        with open(self.testing_file, encoding="utf-8") as f:
            text = f.read()

        tokens = self.tokenizer.encode_batch([text])

        encoding = tokens[0]
        annotations = []
        unique_id = 1000000000
        for idx in range(0, len(encoding.ids) - self.max_seq_length + 1, self.max_seq_length):
            ids = encoding.ids[idx: idx + self.max_seq_length]
            tokens = encoding.tokens[idx:idx + self.max_seq_length]
            identifier = ['input_ids_{}'.format(idx), 'labels_{}'.format(idx)]
            annotation = LanguageModelingAnnotation(
                identifier,
                np.array(unique_id),
                np.array([ids]),
                tokens,
                labels=np.array(ids),
            )
            annotations.append(annotation)
            unique_id += 1

        return ConverterReturn(annotations, None, None)
Example #7
    def test_padding(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # By default it does nothing when encoding single sequence
        tokenizer.enable_padding()
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name"]

        # Can pad to the longest in a batch
        output = tokenizer.encode_batch(["my name", "my name is john"])
        assert all([len(encoding) == 4 for encoding in output])

        # Can pad to the specified max length otherwise
        tokenizer.enable_padding(max_length=4)
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["my", "name", "pair", "[PAD]"]
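A hedged sketch of configuring padding explicitly rather than relying on the defaults exercised above. Note that recent tokenizers releases spell the keyword `length`, whereas this older test passes `max_length`.

from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
# Explicit pad id/token and a fixed target length (`length` on newer releases).
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=8)
print(tokenizer.encode("my name").tokens)   # padded out to 8 tokens with the pad token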
Example #8
    def __init__(self,
                 tokenizer: Tokenizer,
                 args,
                 file_paths: list,
                 block_size=512):
        assert all([os.path.isfile(file_path) for file_path in file_paths])
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_paths[0])

        lines = []
        for file_path in file_paths:
            with open(file_path, encoding="utf-8") as f:
                lines += [
                    line for line in f.read().splitlines()
                    if (len(line) > 0 and not line.isspace())
                ]

        self.examples = truncate([x.ids for x in tokenizer.encode_batch(lines)])
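The truncate helper used above is not defined in the snippet. A hedged sketch of the chunking it presumably performs; the name matches the call site, but the body and the default block size are assumptions.

def truncate(id_sequences, block_size=512):
    # Flatten the per-line token ids and cut them into fixed-size blocks,
    # dropping the final partial block.
    flat = [token_id for ids in id_sequences for token_id in ids]
    return [flat[i:i + block_size]
            for i in range(0, len(flat) - block_size + 1, block_size)]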
Example #9
class HuggingFaceWordLevelTokenizer(TokenizerBase):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

        self.tokenizer = Tokenizer(
            models.WordLevel(unk_token=self.unknown_token))
        self.tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        if self.lower:
            self.tokenizer.normalizer = normalizers.Lowercase()

    def fit(self, *, texts=None, text_batch_iter=None, max_tokens=None):
        from tokenizers import trainers

        trainer = trainers.WordLevelTrainer(vocab_size=self.max_vocab_size,
                                            special_tokens=list(
                                                self.special_tokens))
        self.tokenizer.train_from_iterator(text_batch_iter, trainer=trainer)
        self.token_to_id = self.tokenizer.get_vocab()
        self.id_to_token = {
            token_id: token
            for token, token_id in self.token_to_id.items()
        }

    def encode(self, texts):
        id_seqs = self.tokenizer.encode_batch(texts)
        id_seqs = [id_seq.ids for id_seq in id_seqs]
        return self._post_process(
            id_seqs,
            pad_id=self.token_to_id[self.pad_token]
            if self.pad_token else None,
            sos_id=self.token_to_id[self.sos_token]
            if self.sos_token else None,
            eos_id=self.token_to_id[self.eos_token]
            if self.eos_token else None,
        )

    def decode(self, id_seqs):
        return self.tokenizer.decode_batch(id_seqs)
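A standalone sketch of the same tokenizers calls this wrapper composes, end to end. The vocabulary size, special tokens and texts are placeholders.

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers

tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.normalizer = normalizers.Lowercase()

trainer = trainers.WordLevelTrainer(vocab_size=10000,
                                    special_tokens=["<pad>", "<unk>"])
tokenizer.train_from_iterator(["Hello world", "Hello there"], trainer=trainer)

encodings = tokenizer.encode_batch(["hello world", "an unseen token"])
id_seqs = [enc.ids for enc in encodings]
print(id_seqs)
# Unknown words come back as "<unk>" when special tokens are not skipped.
print(tokenizer.decode_batch(id_seqs, skip_special_tokens=False))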
Example #10
    def test_encode(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can encode single sequence
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name", "is", "john"]
        assert type(output.ids) == list
        assert type(output.type_ids) == list
        assert type(output.offsets) == list
        assert type(output.words) == list
        assert type(output.special_tokens_mask) == list
        assert type(output.attention_mask) == list
        assert type(output.overflowing) == list

        # Can encode a pair of sequences
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "name", "is", "john", "pair"]

        # Can encode a batch with both a single sequence and a pair of sequences
        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
        assert len(output) == 2
Example #11
}
k = len(output_vocab)
with open("../data/res2idx.json", 'r') as f:
    for w, i in json.load(f).items():
        output_vocab[w] = k
        k += 1
with open("../data/arg2idx.json", 'r') as f:
    for w, i in json.load(f).items():
        output_vocab[w.replace('-', '_')] = k
        k += 1

output_vocab = {w: i for i, w in enumerate(output_vocab)}
output_tokenizer = Tokenizer(WordLevel(output_vocab))
output_tokenizer.pre_tokenizer = Whitespace()

t = output_tokenizer.encode_batch(
    ["SERVE MOVE_CONTENTS", "SERVE MOVE_CONTENTS PUT"])
# print (t)

csv_file = '../data/seq2seq_4335716.csv'
input_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_tokenizer.bos_token = input_tokenizer.cls_token
input_tokenizer.eos_token = input_tokenizer.sep_token

val_data = load_dataset('csv', data_files=csv_file, split='train[90%:]')
train_data = load_dataset('csv', data_files=csv_file, split='train[:90%]')
# print(val_data)
# print(train_data)

batch_size = 16  # 4 for debugging; 16 for full training
encoder_max_length = 128
decoder_max_length = 128
Example #12
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(
                    json.dumps((examples_per_file[input_file],
                                len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    for example, labels in g:

                        example_batch.append(example)
                        labels_batch.append(labels)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels in zip(example_batch,
                                                       labels_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                out_f.write(
                                    json.dumps([example.ids, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels in zip(example_batch, labels_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        out_f.write(json.dumps([example.ids, labels]) + '\n')
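A hedged sketch of reading one of the generated JSONL files back for finetuning; file_name stands for a file produced by the loop above.

import json
from xopen import xopen

with xopen(file_name, "rt") as f:
    n_examples, n_labels = json.loads(f.readline())   # shape header written first
    for line in f:
        token_ids, label_indices = json.loads(line)   # one padded example per line
        # ... feed token_ids / label_indices into the finetuning pipeline ...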
Example #13
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, got {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE.from_file(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)

    # check number inputs and outputs
    if len(model.inputs) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.
            format(len(model.inputs)))
    if len(model.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.
            format(len(model.outputs)))
    input_tensor = model.inputs[0].any_name

    if not args.dynamic_shape and (
            model.inputs[0].partial_shape.is_dynamic
            or model.inputs[0].shape[1] != args.max_seq_len):
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1),
                          Dimension(args.max_seq_len)])
        })

    if args.dynamic_shape:
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1),
                          Dimension(0, args.max_seq_len)])
        })

    # load model to the device
    compiled_model = core.compile_model(model, args.device)
    output_tensor = compiled_model.outputs[0]
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:

        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:

        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = args.max_seq_len

        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            model_input = input_ids
            if not args.dynamic_shape:
                # pad the rest of the request
                pad_len = max_length - cur_input_len
                model_input = np.concatenate(
                    (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for OpenVINO runtime
            inputs = {
                input_tensor: model_input,
            }

            # infer by OpenVINO runtime
            t_start = time.perf_counter()
            outputs = infer_request.infer(inputs)[output_tensor]
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(model_input.shape[1], 1 / (t_end - t_start),
                        t_end - t_start))

            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1],
                                           1,
                                           p=probs[0],
                                           replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info(
            "{} requests were processed in {:0.2f}sec ({:0.2}sec per request)".
            format(t_count, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
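The sampling helpers softmax and get_top_k_logits are used above but not shown. Hedged sketches of typical implementations follow; the demo's own versions may differ in detail.

import numpy as np

def softmax(logits):
    # Numerically stable softmax over the vocabulary axis.
    exp = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
    return exp / exp.sum(axis=-1, keepdims=True)

def get_top_k_logits(scores, top_k):
    # Keep the top_k largest logits per row and push everything else to -inf.
    filtered = np.full_like(scores, -np.inf)
    top_idx = np.argpartition(scores, -top_k, axis=-1)[..., -top_k:]
    np.put_along_axis(filtered, top_idx,
                      np.take_along_axis(scores, top_idx, axis=-1), axis=-1)
    return filtered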
Example #14
class SentencePieceBPETokenizer:
    """Custom SentencePiece tokenizer"""
    unk_token = '<unk>'
    pad_token = '<pad>'

    def __init__(self,
                 vocab: Dict[str, int] = None,
                 merges: List[Tuple[str, str]] = None,
                 dropout: float = None,
                 max_length: Optional[int] = 64) -> None:
        """Constructor

        Args:
            vocab (Dict[str, int]): A dictionary of string keys and their ids.
            merges (List[Tuple[str, str]]): A list of pairs of tokens.
            dropout (float): BPE dropout
            max_length (int, optional): The max length at which to truncate.
                Defaults to `64`.
        """
        self.tokenizer = Tokenizer(
            BPE(vocab, merges, dropout=dropout, unk_token=self.unk_token))
        self.tokenizer.normalizer = BertNormalizer()  # noqa
        self.tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()  # noqa
        self.tokenizer.decoder = decoders.Metaspace()  # noqa
        self.tokenizer.add_special_tokens([self.pad_token, self.unk_token])

        self.tokenizer.enable_padding(pad_token=self.pad_token)
        self.tokenizer.enable_truncation(max_length)

    @classmethod
    def train(cls,
              dataset: Sequence[str],
              vocab_size: int = 1000,
              min_frequency: int = 2,
              dropout: float = 0.0,
              max_length: Optional[int] = 64) -> 'SentencePieceBPETokenizer':
        instance = cls(dropout=dropout, max_length=max_length)
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=[cls.pad_token, cls.unk_token])
        instance.tokenizer.train_from_iterator(dataset, trainer=trainer)
        instance.tokenizer.model.dropout = None
        return instance

    @property
    def vocab_size(self):
        return len(self.tokenizer.get_vocab())

    def serialize(self):
        return self.tokenizer.to_str()

    @classmethod
    def deserialize(cls, s: str) -> 'SentencePieceBPETokenizer':
        tokenizer = cls()
        tokenizer.tokenizer = Tokenizer.from_str(s)
        return tokenizer

    def encode(self, text: str) -> Dict[str, Any]:
        encoding = self.tokenizer.encode(text)
        outputs = {
            'ids': torch.tensor(encoding.ids),
            'mask': torch.tensor(encoding.attention_mask),
            'spans': encoding.offsets,
        }
        return outputs

    def encode_batch(self, batch: List[str]):
        encodings = self.tokenizer.encode_batch(batch)
        outputs = {
            'ids': torch.tensor([e.ids for e in encodings]),
            'mask': torch.tensor([e.attention_mask for e in encodings]),
            'spans': [e.offsets for e in encodings],
        }
        return outputs
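A hypothetical usage sketch of the class above. The training sentences are placeholders, and dropout is passed as None to sidestep the BPE dropout-range check present in some tokenizers versions.

sp_tokenizer = SentencePieceBPETokenizer.train(
    ["the quick brown fox", "jumps over the lazy dog", "the lazy fox"],
    vocab_size=100, min_frequency=1, dropout=None, max_length=16)

batch = sp_tokenizer.encode_batch(["the quick fox", "a lazy dog"])
print(batch['ids'].shape, batch['mask'].shape)   # both padded to a common length

restored = SentencePieceBPETokenizer.deserialize(sp_tokenizer.serialize())
print(restored.vocab_size)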
Example #15
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, got {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info('Reading model {}'.format(args.model))
    ie_net = ie.read_network(model=model_xml, weights=model_bin)

    # check input and output names
    if len(ie_net.input_info) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.
            format(len(ie_net.input_info)))
    if len(ie_net.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.
            format(len(ie_net.outputs)))
    input_names = next(iter(ie_net.input_info))
    output_names = next(iter(ie_net.outputs))

    # load model to the device
    ie_net_exec = ie.load_network(network=ie_net, device_name=args.device)
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:

        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:

        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = ie_net.input_info[input_names].input_data.shape[1]

        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            # pad the rest of the request
            pad_len = max_length - cur_input_len
            model_input = np.concatenate(
                (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for IE
            inputs = {
                input_names: model_input,
            }

            # infer by IE
            t_start = time.perf_counter()
            res = ie_net_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(max_length, 1 / (t_end - t_start), t_end - t_start))

            outputs = res[output_names]
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1],
                                           1,
                                           p=probs[0],
                                           replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info(
            "{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)"
            .format(t_count, max_length, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
Example #16
"""
# Load a pre-trained tokenizer
merges = "./saved_tokenizer/wiki_sunyang/merges.txt"
vocab = "./saved_tokenizer/wiki_sunyang/vocab.json"
bpe = models.BPE.from_files(vocab, merges)

# Initialize a tokenizer
tokenizer = Tokenizer(bpe)

# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()

# And then encode
encoded = tokenizer.encode(
    "In 2012, Sun became the first Chinese man to win an Olympic gold medal in swimming."
)
print(encoded.ids)
print(encoded.tokens)

# Or tokenize multiple sentences at once:
encoded = tokenizer.encode_batch([
    "In 2012, Sun became the first Chinese man to win an Olympic gold medal in swimming.",
    "In 2012, Sun became the first Chinese man to win an Olympic gold medal in swimming."
])
print(encoded)
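models.BPE.from_files comes from an older tokenizers release. A hedged sketch of the equivalent calls on current versions:

# tokenizers >= 0.9: load the model directly from the two files ...
bpe = models.BPE.from_file(vocab, merges)

# ... or read them explicitly and construct the model yourself.
vocab_dict, merges_list = models.BPE.read_file(vocab, merges)
bpe = models.BPE(vocab_dict, merges_list)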
Example #17
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) +
                    '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        #example = ' '.join(example.split(' ')[-510:])
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels, doc_idx in zip(
                                    example_batch, labels_batch,
                                    doc_idx_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                """try:
                                    [][0]
                                    print("DOC_LEN:",len(example.overflowing)+1)
                                    mid = len(example.overflowing)//2
                                    out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                                except IndexError:
                                    out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                                if args.all_blocks or args.n_blocks > 0:
                                    blocks = [example.ids] + [
                                        blk.ids for blk in example.overflowing
                                    ]
                                    #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(
                                            json.dumps(
                                                [block, labels, doc_idx]) +
                                            '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all(
                                        [type(y) is int for y in window])
                                    out_f.write(
                                        json.dumps([window, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels, doc_idx in zip(
                            example_batch, labels_batch, doc_idx_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        """try:
                            [][0]
                            print("DOC_LEN:",len(example.overflowing)+1)
                            mid = len(example.overflowing)//2
                            out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                        except IndexError:
                            out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                        if args.all_blocks or args.n_blocks > 0:
                            blocks = [example.ids] + [
                                blk.ids for blk in example.overflowing
                            ]
                            #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                            for b, block in enumerate(blocks, 2):
                                if b > args.n_blocks and args.n_blocks > 0:
                                    break
                                out_f.write(
                                    json.dumps([block, labels, doc_idx]) +
                                    '\n')
                        else:
                            out_f.write(
                                json.dumps(
                                    [get_window(example, START_POS), labels]) +
                                '\n')
Example #18

import tflex_utils
import tqdm
import time
start = time.time()
optional_pair_sequence = None
tokens = []
if args.batch:
    with open(args.in_text) as f:
        print('Reading...')
        lines = f.readlines()
        print(repr(lines[0]))
    batches = list(group(args.step, lines, fillvalue='\n'))
    for batch in tqdm.tqdm(batches):
        for encoding in tokenizer.encode_batch(list(batch)):
            tokens.extend(encoding.ids)
            elapsed = time.time() - start
            print('%d tokens in %.4fs (%.4f tokens/sec)' %
                  (len(tokens), elapsed, len(tokens) / elapsed))
else:
    for i, line in tflex_utils.for_each_line(args.in_text):
        encoding = tokenizer.encode(line, optional_pair_sequence)
        tokens.extend(encoding.ids)
        if i % args.step == 0:
            elapsed = time.time() - start
            print('%d tokens in %.4fs (%.4f tokens/sec)' %
                  (len(tokens), elapsed, len(tokens) / elapsed))
elapsed = time.time() - start
print('%d tokens in %.4fs (%.4f tokens/sec)' %
      (len(tokens), elapsed, len(tokens) / elapsed))
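The group helper used for batching above is not shown. A hedged sketch matching its call signature, based on the classic itertools grouper recipe:

from itertools import zip_longest

def group(n, iterable, fillvalue=None):
    # Batch an iterable into tuples of length n, padding the last one with fillvalue.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)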