def samples_to_features_ner(
    sample, tasks, max_seq_len, tokenizer, non_initial_token="X", **kwargs
):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by an NER model.

    :param sample: Sample object that contains human readable text and label fields from a single NER data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of the task (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :param non_initial_token: Token that is inserted into the label sequence in positions where there is a
                              non-word-initial token. This is done since the default NER performs prediction
                              only on word initial tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask", "segment_ids", "initial_mask"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    tokens = sample.tokenized["tokens"]

    if tokenizer.is_fast:
        text = sample.clear_text["text"]
        # Here, we tokenize the sample for the second time to get all relevant ids
        # This should change once we get rid of FARM's tokenize_with_metadata()
        inputs = tokenizer(text,
                           return_token_type_ids=True,
                           truncation=True,
                           truncation_strategy="longest_first",
                           max_length=max_seq_len,
                           return_special_tokens_mask=True)

        if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
            logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                         f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
                         f"from number of tokens produced in tokenize_with_metadata().\n"
                         f"Further processing is likely to be wrong!")
    else:
        inputs = tokenizer.encode_plus(text=tokens,
                                       text_pair=None,
                                       add_special_tokens=True,
                                       truncation=False,
                                       return_special_tokens_mask=True,
                                       return_token_type_ids=True,
                                       is_pretokenized=False
                                       )

    input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"]

    # We construct a mask to identify the first token of a word. We will later only use them for predicting entities.
    # Special tokens don't count as initial tokens => we add 0 at the positions of special tokens
    # For BERT we add a 0 in the start and end (for CLS and SEP)
    initial_mask = [int(x) for x in sample.tokenized["start_of_word"]]
    initial_mask = insert_at_special_tokens_pos(initial_mask, special_tokens_mask, insert_element=0)
    assert len(initial_mask) == len(input_ids)

    for task_name, task in tasks.items():
        try:
            label_list = task["label_list"]
            label_name = task["label_name"]
            label_tensor_name = task["label_tensor_name"]
            labels_word = sample.clear_text[label_name]
            labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
            # labels_token = add_cls_sep(labels_token, cls_token, sep_token)
            label_ids = [label_list.index(lt) for lt in labels_token]
        except ValueError:
            label_ids = None
            problematic_labels = set(labels_token).difference(set(label_list))
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           f"\nWe found a problem with labels {str(problematic_labels)}")
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           "\nIf you are running in *inference* mode: Don't worry!"
                           "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your "
                           "processor and check that labels in input data are correct.")

        # This mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        padding_mask = [1] * len(input_ids)

        # Padding up to the sequence length.
        # Normal case: adding multiple 0 to the right
        # Special cases:
        # a) xlnet pads on the left and uses "4" for padding token_type_ids
        if tokenizer.__class__.__name__ == "XLNetTokenizer":
            pad_on_left = True
            segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
        else:
            pad_on_left = False
            segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

        input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
        padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
        initial_mask = pad(initial_mask, max_seq_len, 0, pad_on_left=pad_on_left)
        if label_ids:
            label_ids = pad(label_ids, max_seq_len, 0, pad_on_left=pad_on_left)

        feature_dict = {
            "input_ids": input_ids,
            "padding_mask": padding_mask,
            "segment_ids": segment_ids,
            "initial_mask": initial_mask,
        }

        if label_ids:
            feature_dict[label_tensor_name] = label_ids

    return [feature_dict]
def samples_to_features_ner(sample, label_list, max_seq_len, tokenizer, cls_token="[CLS]",
                            pad_token="[PAD]", sep_token="[SEP]", non_initial_token="X", **kwargs):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by an NER model.

    :param sample: Sample object that contains human readable text and label fields from a single NER data sample
    :type sample: Sample
    :param label_list: A list of all unique labels
    :type label_list: list
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :param cls_token: Token used to represent the beginning of the sequence
    :type cls_token: str
    :param pad_token: Token used to represent sequence padding
    :type pad_token: str
    :param sep_token: Token used to represent the border between two sequences
    :type sep_token: str
    :param non_initial_token: Token that is inserted into the label sequence in positions where there is a
                              non-word-initial token. This is done since the default NER performs prediction
                              only on word initial tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask", "segment_ids", "initial_mask"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    # Tokenize words and extend the labels so they are aligned with the tokens
    # words = sample.clear_text["text"].split(" ")
    # tokens, initial_mask = words_to_tokens(words, tokenizer, max_seq_len)

    tokens = sample.tokenized["tokens"]
    initial_mask = [int(x) for x in sample.tokenized["start_of_word"]]
    # initial_mask =

    # Add CLS and SEP tokens
    tokens = add_cls_sep(tokens, cls_token, sep_token)
    initial_mask = [0] + initial_mask + [0]  # CLS and SEP don't count as initial tokens
    padding_mask = [1] * len(tokens)

    # Convert input and labels to ids, generate masks
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    if "label" in sample.clear_text:
        labels_word = sample.clear_text["label"]
        labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
        # labels_token = add_cls_sep(labels_token, cls_token, sep_token)
        label_ids = [label_list.index(lt) for lt in labels_token]
    # Inference mode
    else:
        label_ids = None
    segment_ids = [0] * max_seq_len

    # Pad
    input_ids = pad(input_ids, max_seq_len, 0)
    if label_ids:
        label_ids = pad(label_ids, max_seq_len, 0)
    initial_mask = pad(initial_mask, max_seq_len, 0)
    padding_mask = pad(padding_mask, max_seq_len, 0)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "initial_mask": initial_mask,
    }

    if label_ids:
        feature_dict["label_ids"] = label_ids

    return [feature_dict]
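# A simplified sketch (for illustration; the actual FARM expand_labels helper may differ
# in details) of the label-expansion step used above: word-level labels are spread over
# the token sequence so that word-initial tokens keep the word's label while all other
# positions (sub-word continuations, CLS/SEP) receive the non_initial_token placeholder.
def _expand_labels_sketch(labels_word, initial_mask, non_initial_token="X"):
    labels_token = []
    word_index = 0
    for is_initial in initial_mask:
        if is_initial:
            # word-initial token: take the label of the next word
            labels_token.append(labels_word[word_index])
            word_index += 1
        else:
            # continuation or special token: fill with the placeholder label
            labels_token.append(non_initial_token)
    return labels_token

# Example: labels ["B-LOC", "O"] with initial_mask [0, 1, 0, 1, 0]
# (CLS, "Par", "##is", "is", SEP) -> ["X", "B-LOC", "X", "O", "X"]
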
def samples_to_features_ner(sample, tasks, max_seq_len, tokenizer, cls_token="[CLS]",
                            sep_token="[SEP]", non_initial_token="X", **kwargs):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by an NER model.

    :param sample: Sample object that contains human readable text and label fields from a single NER data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of the task (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :param cls_token: Token used to represent the beginning of the sequence
    :type cls_token: str
    :param sep_token: Token used to represent the border between two sequences
    :type sep_token: str
    :param non_initial_token: Token that is inserted into the label sequence in positions where there is a
                              non-word-initial token. This is done since the default NER performs prediction
                              only on word initial tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask", "segment_ids", "initial_mask",
             "custom_data" (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    # Tokenize words and extend the labels so they are aligned with the tokens
    # words = sample.clear_text["text"].split(" ")
    # tokens, initial_mask = words_to_tokens(words, tokenizer, max_seq_len)

    tokens = sample.tokenized["tokens"]
    custom_data = sample.tokenized["custom_data"]
    initial_mask = [int(x) for x in sample.tokenized["start_of_word"]]
    # initial_mask =

    # Add CLS and SEP tokens
    tokens = add_cls_sep(tokens, cls_token, sep_token)
    custom_data = [5] + custom_data + [4]
    initial_mask = [0] + initial_mask + [0]  # CLS and SEP don't count as initial tokens
    padding_mask = [1] * len(tokens)

    # Convert input and labels to ids, generate masks
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    for task_name, task in tasks.items():
        try:
            label_list = task["label_list"]
            label_name = task["label_name"]
            label_tensor_name = task["label_tensor_name"]
            labels_word = sample.clear_text[label_name]
            labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
            # labels_token = add_cls_sep(labels_token, cls_token, sep_token)
            # label_ids = [label_list.index(lt) for lt in labels_token]
            label_ids = [label_list.index(lt) for lt in sample.tokenized['ner_label']]
        except ValueError:
            label_ids = None
            problematic_labels = set(labels_token).difference(set(label_list))
            logger.warning(
                f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                f"\nWe found a problem with labels {str(problematic_labels)}")
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
            logger.warning(
                f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                "\nIf you are running in *inference* mode: Don't worry!"
                "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your "
                "processor and check that labels in input data are correct.")

        if label_ids is not None:
            # Hard-coded label ids for the CLS/SEP positions (mirrors the custom_data handling above).
            # Guard against inference mode, where no labels are available.
            label_ids = [5] + label_ids + [4]

        # Build segment ids: 102 is the [SEP] token id in standard BERT vocabularies,
        # everything after the first [SEP] is assigned to the second segment.
        segment_ids = []
        next_sent = False
        for x in input_ids:
            if x == 102:
                segment_ids.append(0)
                next_sent = True
            elif next_sent:
                segment_ids.append(1)
            else:
                segment_ids.append(0)
        # The last position is always assigned to the second segment
        segment_ids[-1] = 1

        # Pad
        input_ids = pad(input_ids, max_seq_len, 0)
        if label_ids:
            label_ids = pad(label_ids, max_seq_len, 0)
        initial_mask = pad(initial_mask, max_seq_len, 0)
        padding_mask = pad(padding_mask, max_seq_len, 0)
        custom_data = pad(custom_data, max_seq_len, 0)
        segment_ids = pad(segment_ids, max_seq_len, 0)

        feature_dict = {
            "input_ids": input_ids,
            "padding_mask": padding_mask,
            "segment_ids": segment_ids,
            "initial_mask": initial_mask,
            "custom_data": custom_data,
        }

        if label_ids:
            feature_dict[label_tensor_name] = label_ids

    return [feature_dict]
def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
    self.baskets = []
    self.pre_tokenizer = WhitespaceSplit()

    texts = [x["text"] for x in dicts]
    words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
    words = [[x[0] for x in y] for y in words_and_spans]
    word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

    tokenized_batch = self.tokenizer.batch_encode_plus(
        words,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        truncation=True,
        max_length=self.max_seq_len,
        padding="max_length",
        is_split_into_words=True,
    )

    for i in range(len(dicts)):
        tokenized = tokenized_batch[i]
        d = dicts[i]
        id_external = self._id_from_dict(d)
        if indices:
            id_internal = indices[i]
        else:
            id_internal = i

        input_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        initial_mask = self._get_start_of_word(tokenized.words)
        assert len(initial_mask) == len(input_ids)

        padding_mask = tokenized.attention_mask

        if return_baskets:
            token_to_word_map = tokenized.words
            word_spans = word_spans_batch[i]
            tokenized_dict = {
                "tokens": tokenized.tokens,
                "word_spans": word_spans,
                "token_to_word_map": token_to_word_map,
                "start_of_word": initial_mask
            }
        else:
            tokenized_dict = {}

        feature_dict = {
            "input_ids": input_ids,
            "padding_mask": padding_mask,
            "segment_ids": segment_ids,
            "initial_mask": initial_mask,
        }

        for task_name, task in self.tasks.items():
            try:
                label_name = task["label_name"]
                labels_word = d[label_name]
                label_list = task["label_list"]
                label_tensor_name = task["label_tensor_name"]

                if task["task_type"] == "classification":
                    label_ids = [label_list.index(labels_word)]
                elif task["task_type"] == "ner":
                    labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                    label_ids = [label_list.index(lt) for lt in labels_token]
            except ValueError:
                label_ids = None
                problematic_labels = set(labels_token).difference(set(label_list))
                print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                      f"\nWe found a problem with labels {str(problematic_labels)}")
            except KeyError:
                # For inference mode we don't expect labels
                label_ids = None
                # print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                #       "\nIf you are running in *inference* mode: Don't worry!"
                #       "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your "
                #       "processor and check that labels in input data are correct.")

            if label_ids:
                feature_dict[label_tensor_name] = label_ids

        curr_sample = Sample(id=None,
                             clear_text=d,
                             tokenized=tokenized_dict,
                             features=[feature_dict])
        curr_basket = SampleBasket(id_internal=id_internal,
                                   raw=d,
                                   id_external=id_external,
                                   samples=[curr_sample])
        self.baskets.append(curr_basket)

    if indices and 0 not in indices:
        pass
    else:
        self._log_samples(1)

    dataset, tensor_names = self._create_dataset()
    ret = [dataset, tensor_names, self.problematic_sample_ids]
    if return_baskets:
        ret.append(self.baskets)
    return tuple(ret)
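# A hedged sketch of what a helper like _get_start_of_word could look like: it maps the
# per-token word ids produced by a fast tokenizer (None for special tokens and padding)
# to a 0/1 mask that marks the first sub-token of every word. This is an illustration,
# not the implementation used by the processor above.
def _get_start_of_word_sketch(word_ids):
    start_of_word = []
    previous = None
    for word_id in word_ids:
        if word_id is None:
            start_of_word.append(0)   # special tokens and padding never start a word
        elif word_id != previous:
            start_of_word.append(1)   # first sub-token of a new word
        else:
            start_of_word.append(0)   # continuation sub-token
        previous = word_id
    return start_of_word

# Example: word_ids [None, 0, 1, 1, 2, None] -> [0, 1, 1, 0, 1, 0]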