def test_pack_to_length_text_higher(self):
    tokenized_text = ["i", "am", "going", "to", "write", "tests"]
    length = 3
    tokenized_text_padded = pack_to_length(
        tokenized_text=tokenized_text,
        max_length=length,
        pad_token="<PAD>",
        add_start_end_token=False,
    )
    assert tokenized_text_padded == ["i", "am", "going"]
def test_pack_to_length_text_equal_with_start_end_token(self):
    tokenized_text = ["i", "am", "going", "to", "write", "tests"]
    length = 6
    tokenized_text_padded = pack_to_length(
        tokenized_text=tokenized_text,
        max_length=length,
        pad_token="<PAD>",
        add_start_end_token=True,
        start_token="<SOS>",
        end_token="<EOS>",
    )
    assert tokenized_text_padded == ["<SOS>", "i", "am", "going", "to", "<EOS>"]
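# The two tests above pin down the contract of ``pack_to_length``: text longer
# than ``max_length`` is truncated, shorter text is right-padded with
# ``pad_token``, and the optional start/end tokens count against the length
# budget. A minimal sketch of an implementation consistent with that contract;
# the real helper and its defaults may differ:
from typing import List

def pack_to_length(
    tokenized_text: List[str],
    max_length: int,
    pad_token: str = "<PAD>",
    add_start_end_token: bool = False,
    start_token: str = "<SOS>",
    end_token: str = "<EOS>",
) -> List[str]:
    # reserve two slots for the start and end tokens if they are requested
    budget = max_length - 2 if add_start_end_token else max_length
    packed = tokenized_text[:budget]
    if add_start_end_token:
        packed = [start_token] + packed + [end_token]
    # right-pad up to max_length
    packed = packed + [pad_token] * (max_length - len(packed))
    return packed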
def setup_bow_elmo_encoder(request):
    layer_aggregation = request.param
    instances = [
        "I like to eat carrot",
        "I like to go out on long drives in a car",
    ]
    padded_instances = []
    for instance in instances:
        padded_inst = pack_to_length(tokenized_text=instance.split(), max_length=10)
        padded_instances.append(" ".join(padded_inst))
    iter_dict = {"instance": padded_instances}
    bow_elmo_embedder = BowElmoEmbedder(layer_aggregation=layer_aggregation)
    return bow_elmo_embedder, iter_dict
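# ``setup_bow_elmo_encoder`` reads ``request.param``, which suggests it is meant
# to back a parametrized pytest fixture. A hypothetical registration and a
# consuming test; the decorator params and the embedder call convention are
# assumptions, not taken from the source:
import pytest

@pytest.fixture(params=["sum", "average"])
def bow_elmo_encoder(request):
    return setup_bow_elmo_encoder(request)

def test_bow_elmo_embedder_runs(bow_elmo_encoder):
    embedder, iter_dict = bow_elmo_encoder
    embeddings = embedder(iter_dict)  # assumed: the embedder is callable on iter_dict
    assert embeddings is not None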
def get_iter_dict(self, line: str, label: Optional[List[str]] = None):
    word_instance = self.word_tokenizer.tokenize(line)
    len_instance = len(word_instance)
    classnames2idx = ScienceIEDataset.get_classname2idx()
    idx2classname = {idx: classname for classname, idx in classnames2idx.items()}

    padded_word_instance = pack_to_length(
        tokenized_text=word_instance,
        max_length=self.max_instance_length,
        pad_token=self.word_vocab.pad_token,
        add_start_end_token=self.word_add_start_end_token,
        start_token=self.word_vocab.start_token,
        end_token=self.word_vocab.end_token,
    )
    tokens = self.word_numericalizer.numericalize_instance(padded_word_instance)
    tokens = torch.LongTensor(tokens)
    len_tokens = torch.LongTensor([len_instance])

    character_tokens = []
    # 1. For every word we get characters in the word
    # 2. Pad the characters to max_char_length
    # 3. Convert them into numbers
    # 4. Add them to character_tokens
    for word in padded_word_instance:
        char_instance = self.char_tokenizer.tokenize(word)
        padded_character_instance = pack_to_length(
            tokenized_text=char_instance,
            max_length=self.max_char_length,
            pad_token=" ",
            add_start_end_token=False,
        )
        padded_character_tokens = self.char_numericalizer.numericalize_instance(
            padded_character_instance
        )
        character_tokens.append(padded_character_tokens)
    character_tokens = torch.LongTensor(character_tokens)

    instance_dict = {
        "tokens": tokens,
        "len_tokens": len_tokens,
        "instance": " ".join(padded_word_instance),
        "raw_instance": " ".join(word_instance),
        "char_tokens": character_tokens,
    }

    if label is not None:
        assert len_instance == len(label)
        task_labels = []
        process_labels = []
        material_labels = []
        for string in label:
            task_label, process_label, material_label = string.split(":")
            task_labels.append(task_label)
            process_labels.append(process_label)
            material_labels.append(material_label)

        assert len_instance == len(task_labels)
        assert len_instance == len(process_labels)
        assert len_instance == len(material_labels)

        padded_task_labels = pack_to_length(
            tokenized_text=task_labels,
            max_length=self.max_instance_length,
            pad_token="padding-Task",
            add_start_end_token=self.word_add_start_end_token,
            start_token="starting-Task",
            end_token="ending-Task",
        )
        padded_process_labels = pack_to_length(
            tokenized_text=process_labels,
            max_length=self.max_instance_length,
            pad_token="padding-Process",
            add_start_end_token=self.word_add_start_end_token,
            start_token="starting-Process",
            end_token="ending-Process",
        )
        padded_material_labels = pack_to_length(
            tokenized_text=material_labels,
            max_length=self.max_instance_length,
            pad_token="padding-Material",
            add_start_end_token=self.word_add_start_end_token,
            start_token="starting-Material",
            end_token="ending-Material",
        )
        assert (
            len(padded_task_labels)
            == len(padded_process_labels)
            == len(padded_material_labels)
        )

        # Ugly offsetting because we are using continuous numbers for classes in
        # all entity types but science ie dataset requires 0
        padded_task_labels = [classnames2idx[label] for label in padded_task_labels]
        padded_process_labels = [
            classnames2idx[label] for label in padded_process_labels
        ]
        padded_material_labels = [
            classnames2idx[label] for label in padded_material_labels
        ]

        # mask is 1 for labels that should be ignored downstream
        mask_task_label = [
            1 if idx2classname[class_idx] in self.ignore_labels else 0
            for class_idx in padded_task_labels
        ]
        mask_process_label = [
            1 if idx2classname[class_idx] in self.ignore_labels else 0
            for class_idx in padded_process_labels
        ]
        mask_material_label = [
            1 if idx2classname[class_idx] in self.ignore_labels else 0
            for class_idx in padded_material_labels
        ]

        task_label = torch.LongTensor(padded_task_labels)
        process_label = torch.LongTensor(padded_process_labels)
        material_label = torch.LongTensor(padded_material_labels)
        mask_task_label = torch.ByteTensor(mask_task_label)
        mask_process_label = torch.ByteTensor(mask_process_label)
        mask_material_label = torch.ByteTensor(mask_material_label)

        label = torch.cat([task_label, process_label, material_label], dim=0)
        label_mask = torch.cat(
            [mask_task_label, mask_process_label, mask_material_label], dim=0
        )
        instance_dict["label"] = label
        instance_dict["label_mask"] = label_mask

    return instance_dict
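# ``label`` packs the task, process, and material label sequences one after the
# other along dim 0, so a consumer can split them back apart with ``torch.chunk``.
# A tiny illustrative sketch with synthetic values (the numbers are hypothetical):
import torch

packed = torch.LongTensor([2, 5, 7, 1, 4, 0])  # task | process | material, 2 each
task, process, material = torch.chunk(packed, chunks=3, dim=0)
assert task.tolist() == [2, 5]
assert process.tolist() == [7, 1]
assert material.tolist() == [4, 0]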
def get_iter_dict(self, line: str, label: Optional[List[str]] = None):
    word_instance = self.word_tokenizer.tokenize(line)
    len_instance = len(word_instance)
    classnames2idx = ParscitDataset.get_classname2idx()
    idx2classname = {idx: classname for classname, idx in classnames2idx.items()}

    if self.instance_preprocessor is not None:
        word_instance = self.instance_preprocessor.lowercase(word_instance)

    padded_word_instance = pack_to_length(
        tokenized_text=word_instance,
        max_length=self.max_instance_length,
        pad_token=self.word_vocab.pad_token,
        add_start_end_token=self.word_add_start_end_token,
        start_token=self.word_vocab.start_token,
        end_token=self.word_vocab.end_token,
    )
    tokens = self.word_numericalizer.numericalize_instance(padded_word_instance)
    tokens = torch.LongTensor(tokens)
    len_tokens = torch.LongTensor([len_instance])

    character_tokens = []
    # 1. For every word we get characters in the word
    # 2. Pad the characters to max_char_length
    # 3. Convert them into numbers
    # 4. Add them to character_tokens
    for word in padded_word_instance:
        char_instance = self.char_tokenizer.tokenize(word)
        padded_character_instance = pack_to_length(
            tokenized_text=char_instance,
            max_length=self.max_char_length,
            pad_token=" ",
            add_start_end_token=False,
        )
        padded_character_tokens = self.char_numericalizer.numericalize_instance(
            padded_character_instance
        )
        character_tokens.append(padded_character_tokens)
    character_tokens = torch.LongTensor(character_tokens)

    instance_dict = {
        "tokens": tokens,
        "len_tokens": len_tokens,
        "instance": " ".join(padded_word_instance),
        "raw_instance": " ".join(word_instance),
        "char_tokens": character_tokens,
    }

    if label is not None:
        assert len_instance == len(label)
        padded_labels = pack_to_length(
            tokenized_text=label,
            max_length=self.max_instance_length,
            pad_token="padding",
            add_start_end_token=self.word_add_start_end_token,
            start_token="starting",
            end_token="ending",
        )
        padded_labels = [classnames2idx[label] for label in padded_labels]

        # mask is 1 for labels that should be ignored downstream
        labels_mask = []
        for class_idx in padded_labels:
            if idx2classname[class_idx] in self.ignored_labels:
                labels_mask.append(1)
            else:
                labels_mask.append(0)

        label = torch.LongTensor(padded_labels)
        labels_mask = torch.ByteTensor(labels_mask)
        instance_dict["label"] = label
        instance_dict["label_mask"] = labels_mask

    return instance_dict
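# In both datasets ``label_mask`` is 1 exactly at positions whose label should be
# ignored. One plausible way downstream training code could consume it; a sketch,
# not the repository's actual loss computation:
import torch

label_mask = torch.ByteTensor([0, 1, 0, 1])  # positions 1 and 3 are ignored
per_token_loss = torch.rand(4)  # stand-in for a real per-token loss
loss = per_token_loss.masked_fill(label_mask.bool(), 0.0).sum()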
def get_iter_dict(
    self,
    lines: Union[List[str], str],
    labels: Optional[Union[str, List[str]]] = None,
):
    if isinstance(lines, str):
        lines = [lines]

    word_instances = self.word_tokenizer.tokenize_batch(lines)
    len_instances = [len(instance) for instance in word_instances]
    classnames2idx = SectLabelDataset.get_classname2idx()

    padded_instances = []
    for word_instance in word_instances:
        padded_instance = pack_to_length(
            tokenized_text=word_instance,
            max_length=self.max_instance_length,
            pad_token=self.word_vocab.pad_token,
            add_start_end_token=True,
            start_token=self.word_vocab.start_token,
            end_token=self.word_vocab.end_token,
        )
        padded_instances.append(padded_instance)

    tokens = self.word_numericalizer.numericalize_batch_instances(padded_instances)
    tokens = torch.LongTensor(tokens)
    tokens = tokens.squeeze(0)

    instances = [" ".join(instance) for instance in padded_instances]
    raw_instances = [" ".join(instance) for instance in word_instances]
    len_tokens = torch.LongTensor(len_instances)

    # unwrap the lists to plain strings when there is only a single instance
    if len(instances) == 1:
        instances = instances[0]
        raw_instances = raw_instances[0]

    instance_dict = {
        "tokens": tokens,
        "len_tokens": len_tokens,
        "instance": instances,
        "raw_instance": raw_instances,
    }

    if labels is not None:
        if isinstance(labels, str):
            labels = [labels]
        labels = [classnames2idx[label] for label in labels]
        label = torch.LongTensor(labels)
        instance_dict["label"] = label

    return instance_dict
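# ``get_iter_dict`` accepts a single line or a batch of lines; with a single line
# the "instance" and "raw_instance" entries are unwrapped to plain strings. A
# hedged usage sketch, where ``dataset`` stands for a hypothetical
# SectLabelDataset instance:
single = dataset.get_iter_dict("related work")
assert isinstance(single["instance"], str)

batch = dataset.get_iter_dict(["first line", "second line"])
assert isinstance(batch["instance"], list) and len(batch["instance"]) == 2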
def forward(self, iter_dict: Dict[str, Any]) -> torch.Tensor:
    """
    Parameters
    ----------
    iter_dict : Dict[str, Any]
        It expects "raw_instance" to be present in the iter dict.
        "raw_instance" is the instance that is not padded

    Returns
    -------
    torch.Tensor
        The bert embeddings for all the words in the instances
        The size of the returned embedding is ``[batch_size, num_time_steps, emb_dim]``
    """
    # word_tokenize all the text strings in the batch
    x = iter_dict["raw_instance"]
    tokenized_text = list(map(self.bert_tokenizer.tokenize, x))
    lengths = [len(tokenized) for tokenized in tokenized_text]
    max_len = max(lengths)

    # pad the tokenized text to a maximum length; the +2 reserves room for the
    # [CLS] and [SEP] tokens so that the longest sequence is not truncated
    padded_tokenized_text = []
    for tokens in tokenized_text:
        padded_tokens = pack_to_length(
            tokenized_text=tokens,
            max_length=max_len + 2,
            pad_token="[PAD]",
            add_start_end_token=True,
            start_token="[CLS]",
            end_token="[SEP]",
        )
        padded_tokenized_text.append(padded_tokens)

    # convert them to ids based on the bert vocab
    indexed_tokens = list(
        map(self.bert_tokenizer.convert_tokens_to_ids, padded_tokenized_text)
    )
    segment_ids = list(
        map(lambda tokens_list: [0] * len(tokens_list), indexed_tokens)
    )

    tokens_tensor = torch.tensor(indexed_tokens)
    segment_tensor = torch.tensor(segment_ids)

    tokens_tensor = tokens_tensor.to(self.device)
    segment_tensor = segment_tensor.to(self.device)

    with torch.no_grad():
        encoded_layers, _ = self.model(tokens_tensor, segment_tensor)

    if "base" in self.bert_type:
        assert len(encoded_layers) == 12
    elif "large" in self.bert_type:
        assert len(encoded_layers) == 24

    # num_bert_layers, batch_size, sequence_length, bert_hidden_dimension
    all_layers = torch.stack(encoded_layers, dim=0)

    if self.aggregation_type == "sum":
        sum_layers = torch.sum(all_layers, dim=0)
        return sum_layers
    elif self.aggregation_type == "average":
        average_layers = torch.mean(all_layers, dim=0)
        return average_layers
    else:
        # guard against silently returning None for an unknown aggregation type
        raise ValueError(f"Unknown aggregation_type: {self.aggregation_type}")
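# Both aggregation modes reduce the stacked ``[num_layers, batch, seq_len, dim]``
# tensor over the layer axis, so the output keeps the per-token shape. A tiny
# shape check on synthetic data (the sizes here are illustrative):
import torch

all_layers = torch.randn(12, 2, 5, 768)  # 12 BERT-base layers
assert torch.sum(all_layers, dim=0).shape == (2, 5, 768)
assert torch.mean(all_layers, dim=0).shape == (2, 5, 768)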