def tokenize_corpus(
        input_file: str,
        output_file: str,
        vocab_file: str,
        unk_token: str = '<unk>',
        control_tokens: List[str] = []):
    r"""Tokenize corpus sentences through trained **WordPiece** model.

    Arguments:
        input_file (str): Input corpus file path.
        output_file (str): Output file path.
        vocab_file (str): Trained vocabulary file path.
        unk_token (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.
    """
    # Create `WordPiece` model and add special tokens. Note that `unk_token`
    # is also a special token.
    tokenizer = Tokenizer(models.WordPiece(vocab_file, unk_token=unk_token))
    tokenizer.add_special_tokens([unk_token] + control_tokens)

    # Use BERT-specific normalizer, pre-tokenizer and **WordPiece** decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = decoders.WordPiece(prefix='##')

    with open(input_file, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as dst:
        # Count total lines in corpus.
        total_lines = 0
        for _ in src:
            total_lines += 1

        # Move the file pointer back to the start of the corpus.
        src.seek(0)

        buffer = []
        for line in tqdm.tqdm(src,
                              desc='[*] tokenize corpus',
                              total=total_lines):
            buffer.append(line)

            # Tokenize buffered sentences and write to `output_file`.
            if len(buffer) > 10000:
                for t in tokenizer.encode_batch(buffer):
                    dst.write(' '.join(t.tokens) + '\n')
                buffer.clear()

        # Process the remaining buffer.
        if buffer:
            for t in tokenizer.encode_batch(buffer):
                dst.write(' '.join(t.tokens) + '\n')
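# --- Usage sketch (not part of the original source) ---
# A minimal, hedged example of calling `tokenize_corpus` above. The file paths and
# control tokens are hypothetical placeholders; the imports mirror what the function
# body appears to rely on (`tokenizers`, `tqdm`, `typing.List`).
from typing import List

import tqdm
from tokenizers import Tokenizer, decoders, models
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer

if __name__ == '__main__':
    tokenize_corpus(
        input_file='corpus.txt',          # placeholder: one sentence per line
        output_file='corpus.tokenized',   # placeholder: space-joined WordPiece tokens
        vocab_file='vocab.txt',           # placeholder: trained WordPiece vocabulary
        unk_token='<unk>',
        control_tokens=['<s>', '</s>', '<pad>'],
    )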
def test_encode(self): tokenizer = Tokenizer(BPE()) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) # Can encode single sequence output = tokenizer.encode("my name is john") assert output.tokens == ["my", "name", "is", "john"] assert type(output.ids) == list assert type(output.type_ids) == list assert type(output.offsets) == list with pytest.warns(DeprecationWarning): assert type(output.words) == list assert type(output.word_ids) == list assert type(output.special_tokens_mask) == list assert type(output.attention_mask) == list assert type(output.overflowing) == list # Can encode a pair of sequences output = tokenizer.encode("my name is john", "pair") assert output.tokens == ["my", "name", "is", "john", "pair"] assert isinstance(pickle.loads(pickle.dumps(output)), Encoding) # Can encode a single pre-tokenized sequence output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True) assert output.tokens == ["my", "name", "is", "john"] # Can encode a batch with both a single sequence and a pair of sequences output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")]) assert len(output) == 2
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((DataTrainingArguments, CustomOthersArguments))

    (data_args, custom_args) = parser.parse_args_into_dataclasses()

    train_files = list(sorted(glob.glob(f'{data_args.train_dir}/*.{data_args.ext}')))
    validation_files = list(sorted(glob.glob(f'{data_args.eval_dir}/*.{data_args.ext}')))

    additional_special_tokens = ADDITIONAL_SPECIAL_TOKENS

    pre_tokenizer_func = PRE_TOKENIZERS_MAP.get(custom_args.pre_tokenizer_type, None)
    if pre_tokenizer_func is None:
        raise NotImplementedError
    elif custom_args.pre_tokenizer_type == 'sefr_cut':
        raise ValueError('sefr_cut is slow; use fake_sefr_cut with sefr_cut_pre_tokenizer instead')

    if not os.path.exists(custom_args.output_file) or custom_args.overwrite_output_file:
        trainer = WordLevelTrainer(pre_tokenize_func=pre_tokenizer_func,
                                   vocab_size=custom_args.vocab_size,
                                   vocab_min_freq=custom_args.vocab_min_freq,
                                   input_files=train_files,
                                   additional_special_tokens=additional_special_tokens)
        trainer.count_parallel()
        trainer.save_vocab(custom_args.output_file)

    if custom_args.pre_tokenizer_type == 'fake_sefr_cut':
        custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            FakeSefrCustomTokenizer(PRE_TOKENIZERS_MAP['fake_sefr_cut_keep_split_token']))
    else:
        custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            CustomPreTokenizer(pre_tokenizer_func))

    tokenizer = Tokenizer(models.WordLevel.from_file(custom_args.output_file, unk_token='<unk>'))
    tokenizer.pre_tokenizer = custom_pre_tokenizer

    if custom_args.debug:
        print('Tokenize the following texts.')

        texts = ['<s>โรนัลโดเขาได้เล่นกับทีม</s>', 'โปรตุเกสมีโรนัลโด',
                 'โรนัลโดเขาได้เล่นกับทีม\nโปรตุเกสมีโรนัลโด']
        ids = [e.ids for e in tokenizer.encode_batch(texts)]
        decoded_texts = tokenizer.decode_batch(ids)
        decoded_texts = [text.replace(' ', '') for text in decoded_texts]

        for text, i, decoded_text in zip(texts, ids, decoded_texts):
            print('Text: ', text, '>>', 'Tokenized: ', i, '>>', 'Decoded: ', decoded_text)

        with open(validation_files[0], 'r') as f:
            while True:
                line = f.readline()
                if line:
                    line = line.strip()
                    if len(line) > 0 and not line.isspace():
                        encoded = tokenizer.encode(line)
                        decoded = tokenizer.decode(encoded.ids).replace(' ', '')
                        print('Text: ', line, '>>', encoded.ids, '>>', decoded)
                else:
                    break
def __init__(self, tokenizer: Tokenizer, args, file_paths: str, block_size=512):
    assert all([os.path.isfile(file_path) for file_path in file_paths])
    block_size = block_size - 2  # Reduce by 2 to account for [CLS] and [SEP] tokens

    directory, filename = os.path.split(file_paths[0])
    cached_features_file = os.path.join(
        directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + Path(filename).stem)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, "rb") as handle:
            self.examples = pickle.load(handle)
    else:
        logger.info("Reading dataset at %s", file_paths)
        text = []
        for file_path in file_paths:
            with open(file_path, encoding="utf-8") as f:
                text += f.readlines()

        logger.info("Creating features from dataset file at %s", directory)
        # Get all token IDs except [CLS] and [SEP] and flat-map the IDs
        tokenized_text = [
            t for tokenized in tokenizer.encode_batch(text) for t in tokenized.ids[1:-1]
        ]
        cls_token, sep_token = tokenizer.encode('').ids

        self.examples = []
        # Truncate in blocks of block_size
        for i in range(0, len(tokenized_text) - block_size + 1, block_size):
            self.examples.append([cls_token] + tokenized_text[i:i + block_size] + [sep_token])
        # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
        # If your dataset is small, first you should look for a bigger one :-) and second you
        # can change this behavior by adding (model-specific) padding.

        logger.info("Saving features into cached file %s", cached_features_file)
        Path(cached_features_file).parent.mkdir(exist_ok=True, parents=True)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
          vocab: AuxiliaryFile) -> AuxiliaryFile:
    total_lines = self._total_lines_in_file(corpus)

    # Create WordPiece model and add special tokens. Note that `unk_token`
    # is also a special token.
    tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
    tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

    # Use BERT-specific normalizer, pre-tokenizer and decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = WordPieceDecoder(prefix='##')

    tokenized = afm.create()
    with corpus.open('r') as src, tokenized.open('w') as dst:
        # Create tqdm progress bar with colorful description.
        tqdm_iter = tqdm.tqdm(src,
                              desc=colorful.render('<r>[*]</r> tokenize sentences with '
                                                   '<g>WordPiece</g> model'),
                              total=total_lines)

        batch_lines = []
        for line in tqdm_iter:
            batch_lines.append(line)

            # Encode the grouped batch sentences and write the tokenized
            # sentences to the auxiliary output file.
            if len(batch_lines) > self.batch_size:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')
                batch_lines.clear()

        # Encode the remainders and write to the output file.
        if batch_lines:
            for t in tokenizer.encode_batch(batch_lines):
                dst.write(' '.join(t.tokens) + '\n')

    return tokenized
class Wikitext2RawConverter(BaseFormatConverter):
    __provider__ = "wikitext2raw"
    annotation_types = (LanguageModelingAnnotation, )

    @classmethod
    def parameters(cls):
        configuration_parameters = super().parameters()
        configuration_parameters.update({
            'testing_file': PathField(description="Path to testing file."),
            'merges_file': PathField(description="Path to merges file."),
            'vocab_file': PathField(description='Path to vocabulary file.'),
            'max_seq_length': NumberField(
                description='The maximum total input sequence length after tokenization.',
                optional=True, default=128, value_type=int
            ),
        })
        return configuration_parameters

    def configure(self):
        self.testing_file = self.get_value_from_config('testing_file')
        self.vocab_file = self.get_value_from_config('vocab_file')
        self.merges_file = self.get_value_from_config('merges_file')
        self.max_seq_length = int(self.get_value_from_config('max_seq_length'))
        self.tokenizer = Tokenizer(BPE(str(self.vocab_file), str(self.merges_file)))
        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        self.tokenizer.decoder = decoders.ByteLevel()

    def convert(self, check_content=False, progress_callback=None, progress_interval=100, **kwargs):
        with open(self.testing_file, encoding="utf-8") as f:
            text = f.read()

        tokens = self.tokenizer.encode_batch([text])
        encoding = tokens[0]

        annotations = []
        unique_id = 1000000000
        for idx in range(0, len(encoding.ids) - self.max_seq_length + 1, self.max_seq_length):
            ids = encoding.ids[idx: idx + self.max_seq_length]
            tokens = encoding.tokens[idx:idx + self.max_seq_length]
            identifier = ['input_ids_{}'.format(idx), 'labels_{}'.format(idx)]
            annotation = LanguageModelingAnnotation(
                identifier,
                np.array(unique_id),
                np.array([ids]),
                tokens,
                labels=np.array(ids),
            )
            annotations.append(annotation)
            unique_id += 1

        return ConverterReturn(annotations, None, None)
def test_padding(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # By default it does nothing when encoding single sequence
    tokenizer.enable_padding()
    output = tokenizer.encode("my name")
    assert output.tokens == ["my", "name"]

    # Can pad to the longest in a batch
    output = tokenizer.encode_batch(["my name", "my name is john"])
    assert all([len(encoding) == 4 for encoding in output])

    # Can pad to the specified max length otherwise
    tokenizer.enable_padding(max_length=4)
    output = tokenizer.encode("my name")
    assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["my", "name", "pair", "[PAD]"]
def __init__(self, tokenizer: Tokenizer, args, file_paths: str, block_size=512):
    assert all([os.path.isfile(file_path) for file_path in file_paths])
    # Here, we do not cache the features, operating under the assumption
    # that we will soon use fast multithreaded tokenizers from the
    # `tokenizers` repo everywhere =)
    logger.info("Creating features from dataset file at %s", file_paths[0])

    lines = []
    for file_path in file_paths:
        with open(file_path, encoding="utf-8") as f:
            lines += [
                line for line in f.read().splitlines()
                if (len(line) > 0 and not line.isspace())
            ]

    # Keep only the token ids (`Encoding.ids`) from each encoding.
    self.examples = truncate([x.ids for x in tokenizer.encode_batch(lines)])
class HuggingFaceWordLevelTokenizer(TokenizerBase):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

        self.tokenizer = Tokenizer(models.WordLevel(unk_token=self.unknown_token))
        self.tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        if self.lower:
            self.tokenizer.normalizer = normalizers.Lowercase()

    def fit(self, *, texts=None, text_batch_iter=None, max_tokens=None):
        from tokenizers import trainers

        trainer = trainers.WordLevelTrainer(vocab_size=self.max_vocab_size,
                                            special_tokens=list(self.special_tokens))
        self.tokenizer.train_from_iterator(text_batch_iter, trainer=trainer)

        self.token_to_id = self.tokenizer.get_vocab()
        self.id_to_token = {
            token_id: token for token, token_id in self.token_to_id.items()
        }

    def encode(self, texts):
        id_seqs = self.tokenizer.encode_batch(texts)
        id_seqs = [id_seq.ids for id_seq in id_seqs]
        return self._post_process(
            id_seqs,
            pad_id=self.token_to_id[self.pad_token] if self.pad_token else None,
            sos_id=self.token_to_id[self.sos_token] if self.sos_token else None,
            eos_id=self.token_to_id[self.eos_token] if self.eos_token else None,
        )

    def decode(self, id_seqs):
        return self.tokenizer.decode_batch(id_seqs)
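# --- Usage sketch (not part of the original source) ---
# The wrapper above defers to the `tokenizers` library; this hedged sketch shows the
# same WordLevel + Whitespace + WordLevelTrainer flow using the library directly, so
# the attributes TokenizerBase is expected to provide (unknown_token, lower,
# max_vocab_size, special_tokens, ...) do not have to be guessed here.
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers

tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.normalizer = normalizers.Lowercase()

trainer = trainers.WordLevelTrainer(vocab_size=1000, special_tokens=["<pad>", "<unk>"])
tokenizer.train_from_iterator(["hello world", "hello tokenizers"], trainer=trainer)

# encode_batch returns Encoding objects; `.ids` is what the class's encode() keeps.
print([enc.ids for enc in tokenizer.encode_batch(["hello world", "unseen token"])])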
def test_encode(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # Can encode single sequence
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["my", "name", "is", "john"]
    assert type(output.ids) == list
    assert type(output.type_ids) == list
    assert type(output.offsets) == list
    assert type(output.words) == list
    assert type(output.special_tokens_mask) == list
    assert type(output.attention_mask) == list
    assert type(output.overflowing) == list

    # Can encode a pair of sequences
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["my", "name", "is", "john", "pair"]

    # Can encode a batch with both a single sequence and a pair of sequences
    output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
    assert len(output) == 2
}
k = len(output_vocab)
with open("../data/res2idx.json", 'r') as f:
    for w, i in json.load(f).items():
        output_vocab[w] = k
        k += 1
with open("../data/arg2idx.json", 'r') as f:
    for w, i in json.load(f).items():
        output_vocab[w.replace('-', '_')] = k
        k += 1
output_vocab = {w: i for i, w in enumerate(output_vocab)}

output_tokenizer = Tokenizer(WordLevel(output_vocab))
output_tokenizer.pre_tokenizer = Whitespace()
t = output_tokenizer.encode_batch(["SERVE MOVE_CONTENTS", "SERVE MOVE_CONTENTS PUT"])
# print(t)

csv_file = '../data/seq2seq_4335716.csv'

input_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_tokenizer.bos_token = input_tokenizer.cls_token
input_tokenizer.eos_token = input_tokenizer.sep_token

val_data = load_dataset('csv', data_files=csv_file, split='train[90%:]')
train_data = load_dataset('csv', data_files=csv_file, split='train[:90%]')
# print(val_data)
# print(train_data)

batch_size = 16  # 4, but change to 16 for full training
encoder_max_length = 128
decoder_max_length = 128
def preprocess_data(args):
    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_, mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab, lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:

                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(json.dumps((examples_per_file[input_file], len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file], args.processes * 100)
                example_batch = []
                labels_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    for example, labels in g:
                        example_batch.append(example)
                        labels_batch.append(labels)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels in zip(example_batch, labels_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                labels = labels.nonzero()[1].tolist()
                                out_f.write(json.dumps([example.ids, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                # Write out whatever is left in the last smaller batch.
                example_batch = tokenizer.encode_batch(example_batch)
                labels_batch = mlb.transform(labels_batch)

                for example, labels in zip(example_batch, labels_batch):
                    # Convert sparse arrays to python lists for json dumping.
                    labels = labels.nonzero()[1].tolist()
                    out_f.write(json.dumps([example.ids, labels]) + '\n')
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, got {} tokens".format(args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE.from_file(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)

    # check number of inputs and outputs
    if len(model.inputs) != 1:
        raise RuntimeError('The demo expects model with single input, while provided {}'.format(
            len(model.inputs)))
    if len(model.outputs) != 1:
        raise RuntimeError('The demo expects model with single output, while provided {}'.format(
            len(model.outputs)))
    input_tensor = model.inputs[0].any_name

    if not args.dynamic_shape and (model.inputs[0].partial_shape.is_dynamic
                                   or model.inputs[0].shape[1] != args.max_seq_len):
        model.reshape({input_tensor: PartialShape([Dimension(1), Dimension(args.max_seq_len)])})

    if args.dynamic_shape:
        model.reshape({input_tensor: PartialShape([Dimension(1), Dimension(0, args.max_seq_len)])})

    # load model to the device
    compiled_model = core.compile_model(model, args.device)
    output_tensor = compiled_model.outputs[0]
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:
        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:
        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = args.max_seq_len

        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            model_input = input_ids
            if not args.dynamic_shape:
                # pad the rest of the request
                pad_len = max_length - cur_input_len
                model_input = np.concatenate((input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for OpenVINO runtime
            inputs = {
                input_tensor: model_input,
            }

            # infer by OpenVINO runtime
            t_start = time.perf_counter()
            outputs = infer_request.infer(inputs)[output_tensor]
            t_end = time.perf_counter()
            t_count += 1
            log.info("Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)".format(
                model_input.shape[1], 1 / (t_end - t_start), t_end - t_start))

            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits, eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores, args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores, args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1], 1, p=probs[0], replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num), eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info("{} requests were processed in {:0.2f}sec ({:0.2}sec per request)".format(
            t_count, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
class SentencePieceBPETokenizer:
    """Custom SentencePiece tokenizer"""
    unk_token = '<unk>'
    pad_token = '<pad>'

    def __init__(self,
                 vocab: Dict[str, int] = None,
                 merges: List[Tuple[str, str]] = None,
                 dropout: float = None,
                 max_length: Optional[int] = 64) -> None:
        """Constructor

        Args:
            vocab (Dict[str, int]): A dictionary of string keys and their ids.
            merges (List[Tuple[str, str]]): A list of pairs of tokens.
            dropout (float): BPE dropout
            max_length (int, optional): The max length at which to truncate.
                Defaults to `64`.
        """
        self.tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=self.unk_token))
        self.tokenizer.normalizer = BertNormalizer()  # noqa
        self.tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()  # noqa
        self.tokenizer.decoder = decoders.Metaspace()  # noqa
        self.tokenizer.add_special_tokens([self.pad_token, self.unk_token])

        self.tokenizer.enable_padding(pad_token=self.pad_token)
        self.tokenizer.enable_truncation(max_length)

    @classmethod
    def train(cls,
              dataset: Sequence[str],
              vocab_size: int = 1000,
              min_frequency: int = 2,
              dropout: float = 0.0,
              max_length: Optional[int] = 64) -> 'SentencePieceBPETokenizer':
        instance = cls(dropout=dropout, max_length=max_length)
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=[cls.pad_token, cls.unk_token])
        instance.tokenizer.train_from_iterator(dataset, trainer=trainer)
        instance.tokenizer.model.dropout = None
        return instance

    @property
    def vocab_size(self):
        return len(self.tokenizer.get_vocab())

    def serialize(self):
        return self.tokenizer.to_str()

    @classmethod
    def deserialize(cls, s: str) -> 'SentencePieceBPETokenizer':
        tokenizer = cls()
        tokenizer.tokenizer = Tokenizer.from_str(s)
        return tokenizer

    def encode(self, text: str) -> Dict[str, Any]:
        encoding = self.tokenizer.encode(text)
        outputs = {
            'ids': torch.tensor(encoding.ids),
            'mask': torch.tensor(encoding.attention_mask),
            'spans': encoding.offsets,
        }
        return outputs

    def encode_batch(self, batch: List[str]):
        encodings = self.tokenizer.encode_batch(batch)
        outputs = {
            'ids': torch.tensor([e.ids for e in encodings]),
            'mask': torch.tensor([e.attention_mask for e in encodings]),
            'spans': [e.offsets for e in encodings],
        }
        return outputs
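# --- Usage sketch (not part of the original source) ---
# A small, hedged example of using the class above: train on an in-memory corpus,
# encode a batch, and round-trip through serialize/deserialize. The sample sentences
# and hyperparameters are made up for illustration.
corpus = [
    "the quick brown fox jumps over the lazy dog",
    "a fast brown fox leaps over a sleepy dog",
]

# Train a small BPE vocabulary directly from the in-memory corpus
# (dropout=None so no BPE dropout is configured on the model).
sp_tok = SentencePieceBPETokenizer.train(corpus, vocab_size=200, min_frequency=1, dropout=None)

# Batch encoding returns padded/truncated id and mask tensors plus character offsets.
batch = sp_tok.encode_batch(["the quick brown fox", "a sleepy dog"])
print(batch['ids'].shape, batch['mask'].shape)

# The tokenizer can be round-tripped through its JSON string form.
restored = SentencePieceBPETokenizer.deserialize(sp_tok.serialize())
print(restored.vocab_size == sp_tok.vocab_size)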
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, got {} tokens".format(args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info('Reading model {}'.format(args.model))
    ie_net = ie.read_network(model=model_xml, weights=model_bin)

    # check input and output names
    if len(ie_net.input_info) != 1:
        raise RuntimeError('The demo expects model with single input, while provided {}'.format(
            len(ie_net.input_info)))
    if len(ie_net.outputs) != 1:
        raise RuntimeError('The demo expects model with single output, while provided {}'.format(
            len(ie_net.outputs)))
    input_names = next(iter(ie_net.input_info))
    output_names = next(iter(ie_net.outputs))

    # load model to the device
    ie_net_exec = ie.load_network(network=ie_net, device_name=args.device)
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:
        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:
        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = ie_net.input_info[input_names].input_data.shape[1]

        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            # pad the rest of the request
            pad_len = max_length - cur_input_len
            model_input = np.concatenate((input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for IE
            inputs = {
                input_names: model_input,
            }

            # infer by IE
            t_start = time.perf_counter()
            res = ie_net_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info("Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)".format(
                max_length, 1 / (t_end - t_start), t_end - t_start))

            outputs = res[output_names]
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits, eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores, args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores, args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1], 1, p=probs[0], replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num), eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info("{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)".format(
            t_count, max_length, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
""" # Load a pre-trained tokenizer # 读取一个预训练的分词器 merges = "./saved_tokenizer/wiki_sunyang/merges.txt" vocab = "./saved_tokenizer/wiki_sunyang/vocab.json" bpe = models.BPE.from_files(vocab, merges) # Initialize a tokenizer # 初始化一个分词器 tokenizer = Tokenizer(bpe) # Customize pre-tokenization and decoding # 定制一个预训练分词器和解码器 tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True) tokenizer.decoder = decoders.ByteLevel() # And then encode # 然后就可以编码了 encoded = tokenizer.encode( "In 2012, Sun became the first Chinese man to win an Olympic gold medal in swimming." ) print(encoded.ids) print(encoded.tokens) # Or tokenize multiple sentences at once: # 可以一次性编码一批句子 encoded = tokenizer.encode_batch([ "In 2012, Sun became the first Chinese man to win an Olympic gold medal in swimming.", "In 2012, Sun became the first Chinese man to win an Olympic gold medal in swimming." ]) print(encoded)
def preprocess_data(args):
    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_, mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab, lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:

                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(json.dumps((examples_per_file[input_file], n_labels)) + '\n')

                batch_size = min(examples_per_file[input_file], args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        # example = ' '.join(example.split(' ')[-510:])
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels, doc_idx in zip(example_batch, labels_batch,
                                                                doc_idx_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                labels = labels.nonzero()[1].tolist()

                                if args.all_blocks or args.n_blocks > 0:
                                    blocks = [example.ids] + [blk.ids for blk in example.overflowing]
                                    # print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(json.dumps([block, labels, doc_idx]) + '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all([type(y) is int for y in window])
                                    out_f.write(json.dumps([window, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                # Write out whatever is left in the last smaller batch.
                example_batch = tokenizer.encode_batch(example_batch)
                labels_batch = mlb.transform(labels_batch)

                for example, labels, doc_idx in zip(example_batch, labels_batch,
                                                    doc_idx_batch):
                    # Convert sparse arrays to python lists for json dumping.
                    labels = labels.nonzero()[1].tolist()

                    if args.all_blocks or args.n_blocks > 0:
                        blocks = [example.ids] + [blk.ids for blk in example.overflowing]
                        # print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                        for b, block in enumerate(blocks, 2):
                            if b > args.n_blocks and args.n_blocks > 0:
                                break
                            out_f.write(json.dumps([block, labels, doc_idx]) + '\n')
                    else:
                        out_f.write(json.dumps([get_window(example, START_POS), labels]) + '\n')
import tflex_utils
import tqdm
import time

start = time.time()
optional_pair_sequence = None
tokens = []

if args.batch:
    with open(args.in_text) as f:
        print('Reading...')
        lines = f.readlines()
        print(repr(lines[0]))
        batches = [x for x in group(args.step, lines, fillvalue='\n')]
        for batch in tqdm.tqdm(batches):
            for encoding in tokenizer.encode_batch([x for x in batch]):
                tokens.extend(encoding.ids)
        elapsed = time.time() - start
        print('%d tokens in %.4fs (%.4f tokens/sec)' % (len(tokens), elapsed, len(tokens) / elapsed))
else:
    for i, line in tflex_utils.for_each_line(args.in_text):
        encoding = tokenizer.encode(line, optional_pair_sequence)
        tokens.extend(encoding.ids)
        if i % args.step == 0:
            elapsed = time.time() - start
            print('%d tokens in %.4fs (%.4f tokens/sec)' % (len(tokens), elapsed, len(tokens) / elapsed))

elapsed = time.time() - start
print('%d tokens in %.4fs (%.4f tokens/sec)' % (len(tokens), elapsed, len(tokens) / elapsed))