def get_word_labels_from_token_labels(
    # The Hugging Face model architecture (e.g., "bert", "canine")
    hf_arch: str,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A list of tuples, where each represents a token and its label (e.g., [('ĠHug', B-ORG), ('ging', B-ORG), ('ĠFace', I-ORG), ...])
    tok_labels,
) -> List[Tuple[str, str]]:
    """
    Given a list of tuples where each tuple defines a token and its label, return a list of tuples
    whereby each tuple defines the "word" and its label. This method assumes that model inputs are a
    list of words and, in conjunction with the `align_labels_with_tokens` method, allows the user to
    reconstruct the original raw inputs and labels.
    """
    # recreate the raw words list (we assume for token classification that the input is a list of words)
    words = hf_tokenizer.convert_tokens_to_string([tok_label[0] for tok_label in tok_labels]).split()

    if hf_arch == "canine":
        word_list = [f"{word} " for word in words]
    else:
        word_list = [word for word in words]

    # align "words" with labels
    word_labels, idx = [], 0
    for word in word_list:
        word_labels.append((word, tok_labels[idx][1]))
        idx += len(hf_tokenizer.tokenize(word))

    return word_labels
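
# A minimal usage sketch (bert-base-cased used purely for illustration): the (token, label)
# pairs below mimic aligned token-classification output, not real model predictions.
from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tok_labels = [("Hu", "B-ORG"), ("##gging", "B-ORG"), ("Face", "I-ORG")]
print(get_word_labels_from_token_labels("bert", hf_tokenizer, tok_labels))
# [('Hugging', 'B-ORG'), ('Face', 'I-ORG')]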
def __init__(self,
             vocab_file,
             tokenizer_file=None,
             eos_token="</s>",
             unk_token="<unk>",
             pad_token="<pad>",
             extra_ids=100,
             vis_extra_ids=100,
             additional_special_tokens=None,
             **kwargs):
    # Add extra_ids to the special token list
    if extra_ids > 0 and additional_special_tokens is None:
        additional_special_tokens = ["<extra_id_{}>".format(i) for i in range(extra_ids)]
    elif extra_ids > 0 and additional_special_tokens is not None:
        # Check that we have the right number of extra_id special tokens
        extra_tokens = len(set(filter(lambda x: "extra_id" in x, additional_special_tokens)))
        if extra_tokens != extra_ids:
            raise ValueError(
                f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are provided to T5Tokenizer. "
                "In this case the additional_special_tokens must include the extra_ids tokens")

    if vis_extra_ids > 0:
        # Guard against `additional_special_tokens` still being None (e.g., when extra_ids == 0)
        if additional_special_tokens is None:
            additional_special_tokens = []
        additional_special_tokens.extend(["<vis_extra_id_{}>".format(i) for i in range(vis_extra_ids)])

    slow_tokenizer = self.slow_tokenizer_class(
        vocab_file,
        tokenizer_file=tokenizer_file,
        eos_token=eos_token,
        unk_token=unk_token,
        pad_token=pad_token,
        extra_ids=extra_ids,
        vis_extra_ids=vis_extra_ids,
        # additional_special_tokens=additional_special_tokens,
        **kwargs)
    fast_tokenizer = convert_slow_vlt5tokenizer(slow_tokenizer)
    self._tokenizer = fast_tokenizer

    PreTrainedTokenizerBase.__init__(
        self,
        tokenizer_file=tokenizer_file,
        eos_token=eos_token,
        unk_token=unk_token,
        pad_token=pad_token,
        extra_ids=extra_ids,
        vis_extra_ids=vis_extra_ids,
        additional_special_tokens=additional_special_tokens,
        **kwargs,
    )

    self.vocab_file = vocab_file
    self._extra_ids = extra_ids
    self._vis_extra_ids = vis_extra_ids
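
# Hypothetical usage sketch: assumes this `__init__` belongs to a `VLT5TokenizerFast` class
# (a fast tokenizer for VL-T5 with extra visual sentinel tokens) and that `spiece.model` is
# a local T5-style SentencePiece vocab file; both names are illustrative.
tokenizer = VLT5TokenizerFast("spiece.model", extra_ids=100, vis_extra_ids=100)
print(tokenizer.convert_tokens_to_ids("<vis_extra_id_0>"))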
def batch_encode_pretokenized(tokenizer: transformers.PreTrainedTokenizerBase,
                              tokenized_inputs: List[List[str]],
                              tokenized_pair_inputs: Optional[List[List[str]]] = None,
                              tensor_type="tf",
                              **kw) -> transformers.BatchEncoding:
    """Batch encode pre-tokenized text, without further splitting.

    This is necessary because tokenizer(..., is_split_into_words=True) doesn't
    guarantee that tokens will stay intact - only that the final tokens will
    not span the given boundaries. If the tokenizer is called directly, you'll
    get things like: "foo" "##bar" -> "foo" "#" "#" "bar"

    Based on the implementation of batch_encode_plus in
    https://github.com/huggingface/transformers/blob/v4.1.1/src/transformers/tokenization_utils_base.py#L2489

    Args:
      tokenizer: Transformers tokenizer
      tokenized_inputs: list of tokenized inputs
      tokenized_pair_inputs: (optional) list of tokenized second-segment inputs
      tensor_type: tensor type to return
      **kw: additional args, forwarded to tokenizer.prepare_for_model

    Returns:
      BatchEncoding, suitable for model input
    """
    encoded_input = {}
    tokenized_pair_inputs = (tokenized_pair_inputs or
                             [None] * len(tokenized_inputs))
    for tokens, pair_tokens in zip(tokenized_inputs, tokenized_pair_inputs):
        ids = tokenizer.convert_tokens_to_ids(tokens)
        pair_ids = (tokenizer.convert_tokens_to_ids(pair_tokens)
                    if pair_tokens is not None else None)
        encoded = tokenizer.prepare_for_model(ids,
                                              pair_ids=pair_ids,
                                              add_special_tokens=True,
                                              padding="do_not_pad",
                                              truncation="longest_first",
                                              return_attention_mask=False,
                                              pad_to_multiple_of=False,
                                              **kw)
        for k, v in encoded.items():
            encoded_input.setdefault(k, []).append(v)
    encoded_input = tokenizer.pad(encoded_input,
                                  padding="longest",
                                  return_attention_mask=True)
    return transformers.BatchEncoding(encoded_input, tensor_type=tensor_type)
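
# A minimal sketch: encode WordPiece tokens that are already split, so "##bar"-style pieces
# pass through intact instead of being re-tokenized. tensor_type="np" avoids requiring TF;
# the checkpoint and tokens are illustrative (OOV tokens map to the unknown-token id).
import transformers

tok = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
batch = batch_encode_pretokenized(tok, [["foo", "##bar"], ["hello"]], tensor_type="np")
print(batch["input_ids"])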
def convert_to_features(
    examples: Any,
    tokenizer: PreTrainedTokenizerBase,
    padding: str,
    max_source_length: int,
    max_target_length: int,
    src_text_column_name: str,
    tgt_text_column_name: str,
):
    # Extract the translations from the dict
    translations = examples["translation"]

    def extract_text(lang):
        return [text[lang] for text in translations]

    src_texts = extract_text(src_text_column_name)
    src_texts = ["Translate from source text: " + src for src in src_texts]

    encoded_results = tokenizer.prepare_seq2seq_batch(
        src_texts=src_texts,
        tgt_texts=extract_text(tgt_text_column_name),
        max_length=max_source_length,
        max_target_length=max_target_length,
        padding=padding,
    )
    return encoded_results
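
# A hedged usage sketch for a batch in the nested `{"translation": [{...}]}` format used by
# many Hugging Face translation datasets; the checkpoint, language codes, and texts are
# illustrative. Note that `prepare_seq2seq_batch` is deprecated in recent transformers releases.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
examples = {"translation": [{"en": "Hello world", "de": "Hallo Welt"}]}
features = convert_to_features(
    examples, tokenizer, padding="max_length",
    max_source_length=32, max_target_length=32,
    src_text_column_name="en", tgt_text_column_name="de")
print(features["input_ids"][0][:8])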
def get_token_labels_from_input_ids(
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # List of input_ids for the tokens in a single piece of processed text
    input_ids: List[int],
    # List of label indices for each token
    token_label_ids: List[int],
    # List of label names from which the label indices can be used to find the name of the label
    vocab: List[str],
    # The token ID that should be ignored when calculating the loss
    ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
    # The token used to identify ignored tokens (default: [xIGNx])
    ignore_token: str = "[xIGNx]",
) -> List[Tuple[str, str]]:
    """
    Given a list of input IDs, the label ID associated with each, and the labels vocab, this method
    will return a list of tuples whereby each tuple defines the "token" and its label name. For example:
    [('ĠWay', B-PER), ('de', B-PER), ('ĠGill', I-PER), ('iam', I-PER), ('Ġloves', O), ('ĠHug', B-ORG), ('ging', B-ORG), ('ĠFace', I-ORG)]
    """
    # convert ids to tokens
    toks = hf_tokenizer.convert_ids_to_tokens(input_ids)

    # align "tokens" with labels
    tok_labels = [
        (tok, ignore_token if label_id == ignore_token_id else vocab[label_id])
        for tok_id, tok, label_id in zip(input_ids, toks, token_label_ids)
        if tok_id not in hf_tokenizer.all_special_ids
    ]
    return tok_labels
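
# Illustrative call (the fastai/blurr context above is assumed for the defaults); the label
# ids are toy values, and -100 is passed explicitly as the ignored label id.
from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
vocab = ["O", "B-ORG", "I-ORG"]
input_ids = hf_tokenizer("Hugging Face")["input_ids"]
token_label_ids = [1, 1, 2]  # one label id per token
print(get_token_labels_from_input_ids(
    hf_tokenizer, input_ids, token_label_ids, vocab, ignore_token_id=-100))
# [('Hug', 'B-ORG'), ('ging', 'B-ORG'), ('ĠFace', 'I-ORG')]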
def get_tokens_and_offsets(
        text: str,
        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[Any, int, int]]:
    tokens = tokenizer.tokenize(text)
    token_lens = [len(token) for token in tokens]
    # The first token carries a leading SentencePiece underline ("▁") that has no
    # corresponding character in `text`, so ignore it when computing offsets
    token_lens[0] -= 1
    token_ends = np.cumsum(token_lens)
    token_starts = [0] + token_ends[:-1].tolist()
    tokens_and_offsets = list(zip(tokens, token_starts, token_ends))
    return tokens_and_offsets
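
# A short sketch with a SentencePiece tokenizer (ALBERT used for illustration); the returned
# offsets are character positions into `text`.
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
print(get_tokens_and_offsets("hello world", tokenizer))
# [('▁hello', 0, 5), ('▁world', 5, 11)]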
def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """This method is used by the `SortedDL` to ensure your dataset is sorted *after* tokenization"""
    txt = example[0]["text"] if isinstance(example[0], dict) else example[0]
    return len(txt) if is_split_into_words else len(hf_tokenizer.tokenize(txt, **tok_kwargs))
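
# Hedged sketch: pass this to fastai's `SortedDL` via `functools.partial` so batches are
# grouped by post-tokenization length rather than raw character length (checkpoint and
# dataset are illustrative).
from functools import partial
from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sort_func = partial(blurr_sort_func, hf_tokenizer=hf_tokenizer)
# dls = SortedDL(dataset, sort_func=sort_func)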
def get_hf_objects(
    pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
    model_cls: PreTrainedModel,
    config: Union[PretrainedConfig, str, os.PathLike] = None,
    tokenizer_cls: PreTrainedTokenizerBase = None,
    config_kwargs: dict = {},
    tokenizer_kwargs: dict = {},
    model_kwargs: dict = {},
    cache_dir: Union[str, os.PathLike] = None,
) -> Tuple[str, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel]:
    """
    Given at minimum a `pretrained_model_name_or_path` and `model_cls` (such as
    `AutoModelForSequenceClassification`), this method returns all the Hugging Face objects
    you need to train a model using Blurr
    """
    # config
    if config is None:
        config = AutoConfig.from_pretrained(pretrained_model_name_or_path,
                                            cache_dir=cache_dir,
                                            **config_kwargs)

    # tokenizer (gpt2, roberta, bart (and maybe others) tokenizers require a prefix space)
    if any(s in pretrained_model_name_or_path for s in ["gpt2", "roberta", "bart", "longformer"]):
        tokenizer_kwargs = {**{"add_prefix_space": True}, **tokenizer_kwargs}

    if tokenizer_cls is None:
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                  cache_dir=cache_dir,
                                                  **tokenizer_kwargs)
    else:
        tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name_or_path,
                                                  cache_dir=cache_dir,
                                                  **tokenizer_kwargs)

    # model
    model = model_cls.from_pretrained(pretrained_model_name_or_path,
                                      config=config,
                                      cache_dir=cache_dir,
                                      **model_kwargs)

    # arch (e.g., "bert" from "transformers.models.bert.modeling_bert")
    try:
        arch = model.__module__.split(".")[2]
    except Exception:
        arch = "unknown"

    return (arch, config, tokenizer, model)
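
# Minimal sketch: fetch the architecture name, config, tokenizer, and model in one call
# (the distilbert checkpoint is used purely for illustration).
from transformers import AutoModelForSequenceClassification

arch, config, tokenizer, model = get_hf_objects(
    "distilbert-base-uncased", model_cls=AutoModelForSequenceClassification)
print(arch)  # "distilbert"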
def convert_to_features(
    examples: Any,
    tokenizer: PreTrainedTokenizerBase,
    padding: str,
    max_source_length: int,
    max_target_length: int,
    src_text_column_name: str,
    tgt_text_column_name: str,
):
    encoded_results = tokenizer.prepare_seq2seq_batch(
        src_texts=examples[src_text_column_name],
        tgt_texts=examples[tgt_text_column_name],
        max_length=max_source_length,
        max_target_length=max_target_length,
        padding=padding,
    )
    return encoded_results
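
# Same pattern as the translation variant above, but for datasets with flat source/target
# columns (e.g. summarization); the column names and texts are illustrative, and
# `prepare_seq2seq_batch` is likewise deprecated in recent transformers releases.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
examples = {"document": ["A long article to summarize ..."], "summary": ["A short summary."]}
features = convert_to_features(
    examples, tokenizer, padding="longest",
    max_source_length=128, max_target_length=32,
    src_text_column_name="document", tgt_text_column_name="summary")
print(list(features.keys()))  # ['input_ids', 'attention_mask', 'labels']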
def __init__(self,
             tokenizer: PreTrainedTokenizerBase,
             file_path: str,
             block_size: int = 512,
             overwrite_cache=False):
    super(TextDataset, self).__init__()
    self.path = file_path
    assert os.path.isfile(file_path)
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory,
        "cached_lm_{}_{}_{}".format(
            tokenizer.__class__.__name__,
            str(block_size),
            filename,
        ),
    )

    # Make sure only the first process in distributed training processes the dataset,
    # and the others will use the cache.
    lock_path = cached_features_file + ".lock"
    with FileLock(lock_path):
        if os.path.exists(cached_features_file) and not overwrite_cache:
            start = time.time()
            with open(cached_features_file, "rb") as handle:
                self.data = pickle.load(handle)
            logger.info(
                "Loading features from cached file %s [took %.3f s]",
                cached_features_file,
                time.time() - start)
        else:
            logger.info(f"Creating features from dataset file at {directory}")
            self.data = []
            with open(file_path, encoding="utf-8") as f:
                for each_line in f:
                    obj = json.loads(each_line)
                    tokenized_source = tokenizer.encode(obj['source'],
                                                        truncation=True,
                                                        max_length=block_size,
                                                        padding=True)
                    tokenized_target = tokenizer.encode(obj['target'],
                                                        truncation=True,
                                                        max_length=block_size,
                                                        padding=True)
                    self.data.append((tokenized_source, tokenized_target))

            # Note that we are losing the last truncated example here for the sake of simplicity (no padding).
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.
            start = time.time()
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.data, handle, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info(
                "Saving features into cached file %s [took %.3f s]",
                cached_features_file,
                time.time() - start)
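
# Hedged usage sketch: assumes this `__init__` belongs to a `TextDataset(torch.utils.data.Dataset)`
# subclass and that the input file is JSON Lines with "source" and "target" fields per line
# (the path below is illustrative; the tokenizer must define a pad token).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = TextDataset(tokenizer, "data/train.jsonl", block_size=512)
print(len(dataset.data))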