def test_all_tokenizer_on_special_cases(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"]

    tokenizers = []
    for lang_name in lang_names:
        if "roberta" in lang_name:
            add_prefix_space = True
        else:
            add_prefix_space = False
        t = Tokenizer.load(lang_name, lower_case=False, add_prefix_space=add_prefix_space)
        tokenizers.append(t)

    texts = [
        "This is a sentence",
        "Der entscheidende Pass",
        "力加勝北区ᴵᴺᵀᵃছজটডণত",
        "Thiso text is included tolod makelio sure Unicodeel is handled properly:",
        "This is a sentence...",
        "Let's see all on this text and. !23# neverseenwordspossible"  # no trailing comma: concatenated with the next string
        "This is a sentence    with multiple spaces",
        """This is a sentence.
        With linebreak""",
        """Sentence with multiple


        newlines
        """,
        "and another one\n\n\nwithout space",
        "This is a sentence\t\t\twith multiple tabs",
    ]

    expected_to_fail = [(1, 1), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (2, 1), (2, 5)]

    for i_tok, tokenizer in enumerate(tokenizers):
        for i_text, text in enumerate(texts):
            # Important: we don't assume to preserve whitespace after tokenization.
            # This means: \t, \n, " " etc. will all resolve to a single " ".
            # This doesn't make a difference for BERT + XLNet, but it does for RoBERTa.
            test_passed = True

            # 1. Original tokenize function from the transformers repo on the full sentence
            standardized_whitespace_text = " ".join(text.split())  # remove multiple whitespaces
            tokenized = tokenizer.tokenize(standardized_whitespace_text)

            # 2. Our tokenization method using a pre-tokenizer which can normalize multiple whitespaces.
            # This approach is used in NER.
            pre_tokenizer = WhitespaceSplit()
            words_and_spans = pre_tokenizer.pre_tokenize_str(text)
            words = [x[0] for x in words_and_spans]
            word_spans = [x[1] for x in words_and_spans]

            encoded = tokenizer.encode_plus(words, is_split_into_words=True, add_special_tokens=False).encodings[0]

            # Verify that tokenization of the full sequence equals tokenization of the whitespace-split words
            if encoded.tokens != tokenized:
                test_passed = False

            # Token offsets are originally relative to the beginning of the word.
            # These lines convert them so they are relative to the beginning of the sentence.
            token_offsets = []
            for (start, end), w_index in zip(encoded.offsets, encoded.words):
                word_start_ch = word_spans[w_index][0]
                token_offsets.append((start + word_start_ch, end + word_start_ch))
            if getattr(tokenizer, "add_prefix_space", None):
                token_offsets = [(start - 1, end) for start, end in token_offsets]

            # Verify that offsets align back to the original text
            if text == "力加勝北区ᴵᴺᵀᵃছজটডণত":
                # contains [UNK] tokens that are impossible to match back to the original text
                continue
            for tok, (start, end) in zip(encoded.tokens, token_offsets):
                # Subword tokens carry special chars depending on the model type.
                # To align with the original text we need to strip them.
                tok = re.sub(r"^(##|Ġ|▁)", "", tok)
                # tok = tokenizer.decode(tokenizer.convert_tokens_to_ids(tok))
                original_tok = text[start:end]
                if tok != original_tok:
                    test_passed = False

            if (i_tok, i_text) in expected_to_fail:
                assert not test_passed, f"Behaviour of {tokenizer.__class__.__name__} has changed on text {text}"
            else:
                assert test_passed, f"Behaviour of {tokenizer.__class__.__name__} has changed on text {text}"
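
# The offset-alignment logic exercised above can be factored into a small standalone helper.
# This is only a sketch, not part of FARM: the helper name `tokens_with_sentence_offsets` is
# made up here, and it assumes a HuggingFace *fast* tokenizer plus the `tokenizers`
# pre-tokenizer API already used in the test.
def tokens_with_sentence_offsets(tokenizer, text):
    """Tokenize `text` and return (tokens, offsets) with offsets relative to the whole sentence."""
    from tokenizers.pre_tokenizers import WhitespaceSplit  # local import keeps the sketch self-contained

    words_and_spans = WhitespaceSplit().pre_tokenize_str(text)
    words = [word for word, _ in words_and_spans]
    word_spans = [span for _, span in words_and_spans]

    encoded = tokenizer.encode_plus(words, is_split_into_words=True, add_special_tokens=False).encodings[0]

    offsets = []
    for (start, end), w_index in zip(encoded.offsets, encoded.words):
        word_start_ch = word_spans[w_index][0]  # shift word-relative offsets to sentence-relative ones
        offsets.append((start + word_start_ch, end + word_start_ch))
    if getattr(tokenizer, "add_prefix_space", False):
        # RoBERTa-style tokenizers prepend a space to every word; undo it in the start offsets
        offsets = [(start - 1, end) for start, end in offsets]
    return encoded.tokens, offsets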
class MTLProcessor(Processor):
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        train_filename,
        test_filename,
        delimiter,
        dev_split=0.0,
        dev_filename=None,
        label_list=None,
        metric=None,
        proxies=None,
        **kwargs,
    ):
        self.delimiter = delimiter

        super(MTLProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,
        )

    def file_to_dicts(self, file: str) -> [dict]:
        # Each CSV row holds a tokenized sentence, a document-level label and per-token trigger
        # labels; the sentence and trigger columns are string-encoded Python lists.
        dicts = list()
        df = pd.read_csv(file)
        for text, label, tokens in zip(df.sentence.values, df.label.values, df.trigger.values):
            columns = dict()
            text = ast.literal_eval(text)
            tokens = ast.literal_eval(tokens)
            columns["text"] = " ".join(text)
            columns["document_level_task_label"] = label  # key hard-coded
            columns["token_level_task_label"] = list(map(str, tokens))  # key hard-coded
            dicts.append(columns)
        return dicts

    @staticmethod
    def _get_start_of_word(word_ids):
        # Mark the first sub-token of every word with 1, all other positions with 0.
        words = np.array(word_ids)
        words[words == None] = -1  # word_ids contains None for special tokens; map them to -1
        start_of_word_single = [0] + list(np.ediff1d(words) > 0)
        start_of_word_single = [int(x) for x in start_of_word_single]
        return start_of_word_single

    # Most of the code is copied from NERProcessor.dataset_from_dicts()
    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
        self.baskets = []
        self.pre_tokenizer = WhitespaceSplit()

        texts = [x["text"] for x in dicts]
        words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
        words = [[x[0] for x in y] for y in words_and_spans]
        word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

        tokenized_batch = self.tokenizer.batch_encode_plus(
            words,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True,
            max_length=self.max_seq_len,
            padding="max_length",
            is_split_into_words=True,
        )

        for i in range(len(dicts)):
            tokenized = tokenized_batch[i]
            d = dicts[i]
            id_external = self._id_from_dict(d)
            if indices:
                id_internal = indices[i]
            else:
                id_internal = i

            input_ids = tokenized.ids
            segment_ids = tokenized.type_ids

            # Mask marking the first token of each word; needed to expand word labels to tokens
            initial_mask = self._get_start_of_word(tokenized.words)
            assert len(initial_mask) == len(input_ids)

            padding_mask = tokenized.attention_mask

            if return_baskets:
                token_to_word_map = tokenized.words
                word_spans = word_spans_batch[i]
                tokenized_dict = {
                    "tokens": tokenized.tokens,
                    "word_spans": word_spans,
                    "token_to_word_map": token_to_word_map,
                    "start_of_word": initial_mask,
                }
            else:
                tokenized_dict = {}

            feature_dict = {
                "input_ids": input_ids,
                "padding_mask": padding_mask,
                "segment_ids": segment_ids,
                "initial_mask": initial_mask,
            }

            for task_name, task in self.tasks.items():
                try:
                    label_name = task["label_name"]
                    labels_word = d[label_name]
                    label_list = task["label_list"]
                    label_tensor_name = task["label_tensor_name"]

                    if task["task_type"] == "classification":
                        label_ids = [label_list.index(labels_word)]
                    elif task["task_type"] == "ner":
                        labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                        label_ids = [label_list.index(lt) for lt in labels_token]
                except ValueError:
                    label_ids = None
                    if task["task_type"] == "ner":
                        problematic_labels = set(labels_token).difference(set(label_list))
                    else:
                        problematic_labels = {labels_word}
                    print(
                        f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                        f"\nWe found a problem with labels {str(problematic_labels)}"
                    )
                except KeyError:
                    label_ids = None
                    # print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                    #       "\nIf you are running in *inference* mode: Don't worry!"
                    #       "\nIf you are running in *training* mode: Verify you are supplying a proper label list"
                    #       " to your processor and check that labels in input data are correct.")

                if label_ids:
                    feature_dict[label_tensor_name] = label_ids

            curr_sample = Sample(id=None, clear_text=d, tokenized=tokenized_dict, features=[feature_dict])
            curr_basket = SampleBasket(id_internal=id_internal, raw=d, id_external=id_external, samples=[curr_sample])
            self.baskets.append(curr_basket)

        if indices and 0 not in indices:
            pass
        else:
            self._log_samples(1)

        dataset, tensor_names = self._create_dataset()
        ret = [dataset, tensor_names, self.problematic_sample_ids]
        if return_baskets:
            ret.append(self.baskets)
        return tuple(ret)