def ensure_batch_predictions_are_consistent(self):
    self.model.eval()
    single_predictions = []
    for i, instance in enumerate(self.instances):
        dataset = Batch([instance])
        tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(), for_training=False)
        result = self.model(**tensors)
        single_predictions.append(result)
    full_dataset = Batch(self.instances)
    batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths(), for_training=False)
    batch_predictions = self.model(**batch_tensors)
    for i, instance_predictions in enumerate(single_predictions):
        for key, single_predicted in instance_predictions.items():
            tolerance = 1e-6
            if key == 'loss':
                # Loss is particularly unstable; we'll just be satisfied if everything else is
                # close.
                continue
            single_predicted = single_predicted[0]
            batch_predicted = batch_predictions[key][i]
            if isinstance(single_predicted, torch.autograd.Variable):
                if single_predicted.size() != batch_predicted.size():
                    slices = tuple(slice(0, size) for size in single_predicted.size())
                    batch_predicted = batch_predicted[slices]
                assert_allclose(single_predicted.data.numpy(),
                                batch_predicted.data.numpy(),
                                atol=tolerance,
                                err_msg=key)
            else:
                assert single_predicted == batch_predicted, key
def ensure_batch_predictions_are_consistent(self):
    self.model.eval()
    single_predictions = []
    for i, instance in enumerate(self.instances):
        dataset = Batch([instance])
        tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
        result = self.model(**tensors)
        single_predictions.append(result)
    full_dataset = Batch(self.instances)
    batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
    batch_predictions = self.model(**batch_tensors)
    for i, instance_predictions in enumerate(single_predictions):
        for key, single_predicted in instance_predictions.items():
            tolerance = 1e-6
            if key == 'loss':
                # Loss is particularly unstable; we'll just be satisfied if everything else is
                # close.
                continue
            single_predicted = single_predicted[0]
            batch_predicted = batch_predictions[key][i]
            if isinstance(single_predicted, torch.Tensor):
                if single_predicted.size() != batch_predicted.size():
                    slices = tuple(slice(0, size) for size in single_predicted.size())
                    batch_predicted = batch_predicted[slices]
                assert_allclose(single_predicted.data.numpy(),
                                batch_predicted.data.numpy(),
                                atol=tolerance,
                                err_msg=key)
            else:
                assert single_predicted == batch_predicted, key
def forward_on_instance(self, instance: SyncedFieldsInstance) -> str:
    """
    Takes an :class:`~allennlp.data.instance.Instance`, which typically has raw text in it,
    converts that text into arrays using this model's :class:`Vocabulary`, runs beam-search
    decoding on it, and returns the decoded output as a single string.
    """
    cuda_device = self._get_prediction_device()
    dataset = Batch([instance])
    dataset.index_instances(self.vocab)
    # Track whether the gold target contains OOV tokens.
    gt_has_oov = False
    dataset_tensor_dict = dataset.as_tensor_dict()
    if self.OOV_ID in dataset_tensor_dict["target_tokens"]["ids_with_unks"]:
        gt_has_oov = True
    model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
    output_ids = self.beam_search_decode(**model_input)
    output_words = []
    for _id in output_ids:
        if _id < self.vocab_size:
            output_words.append(self.vocab.get_token_from_index(_id))
        else:
            output_words.append(instance.oov_list[_id - self.vocab_size])
    assert output_words[0] == START_SYMBOL, "somehow the first symbol is not the START symbol; might be a bug"
    output_words = output_words[1:]
    if output_words[-1] == END_SYMBOL:
        output_words = output_words[:-1]
    return " ".join(output_words)
def forward_on_instances(self,
                         instances: List[Instance],
                         cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
    model_input = {}
    dataset = Batch(instances)
    dataset.index_instances(self.vocab)
    if self._pointer_gen:
        model_input.update({'raw': dataset.as_tensor_dict(cuda_device=cuda_device,
                                                          for_training=False)})
        # Extend the vocabulary with tokens from these instances, then re-index.
        extend_vocab = Vocabulary.from_instances(dataset.instances)
        self.vocab.extend_from(extend_vocab)
        dataset.index_instances(self.vocab)
        model_input.update({'extended': dataset.as_tensor_dict(cuda_device=cuda_device,
                                                               for_training=False)})
    else:
        model_input = dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)
    # Extra inputs used at prediction time.
    model_input.update({'instances': instances})
    model_input.update({'predict': True})
    outputs = self.decode(self(**model_input))
    instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
    for name, output in list(outputs.items()):
        if isinstance(output, torch.autograd.Variable):
            output = output.data.cpu().numpy()
        outputs[name] = output
        for instance_output, batch_element in zip(instance_separated_output, output):
            instance_output[name] = batch_element
    return instance_separated_output
def collate_fn(data):
    if isinstance(data[0], Instance):
        batch = Batch(data)
        td = batch.as_tensor_dict()
        return td
    else:
        images, instances = zip(*data)
        images = torch.stack(images, 0)
        batch = Batch(instances)
        td = batch.as_tensor_dict()
        td['box_mask'] = torch.all(td['boxes'] >= 0, -1).long()
        td['images'] = images
        return td
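# A minimal sketch of how a collate_fn like the one above is usually wired into a
# torch DataLoader. `VcrStyleDataset` is a hypothetical dataset whose __getitem__
# returns either an Instance or an (image_tensor, Instance) pair, matching the two
# branches handled by collate_fn.
from torch.utils.data import DataLoader

loader = DataLoader(VcrStyleDataset(), batch_size=32, collate_fn=collate_fn)
for tensor_dict in loader:
    pass  # tensor_dict carries 'images', 'box_mask', and the indexed text fields.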
def forward(self, tree: Tree, label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    str_phase_holder = []
    self.collect_phase(tree, str_phase_holder)
    # Tokenize (including ELMo tokenization) and batch the phrases.
    instances = [self.text_to_instance(phase) for phase in str_phase_holder]
    idx, instances = sort_by_padding(instances, [("tokens", "num_tokens")], self.vocab)
    batch = Batch(instances)
    pad_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(pad_lengths)
    tensor_dict = move_to_device(tensor_dict, 0)
    output = self.biattentive_cell(**tensor_dict)
    # Reshape the result to [batch, label, component, gaussian].
    batch_size, labels = output['weight'].size()
    labels = labels // self.component_num
    output['weight'] = output['weight'].reshape(batch_size, labels, self.component_num)
    output['mu'] = output['mu'].reshape(batch_size, labels, self.component_num, self.gaussian_dim)
    output['var'] = output['var'].reshape(batch_size, labels, self.component_num, self.gaussian_dim)
    # Restore the original (pre-sorting) instance order.
    new_idx = [i for i in range(len(instances))]
    for pos, name in enumerate(idx):
        new_idx[name] = pos
    for name, tensor in output.items():
        output[name] = torch.stack([tensor[i] for i in new_idx])
    return output
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded
    characters (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
    A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
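# Usage sketch for batch_to_ids above: two tokenized sentences of different lengths
# come back as one zero-padded character-id tensor. ELMo pads each word to 50
# characters, so the shape is (batch_size, max_sentence_length, 50).
character_ids = batch_to_ids([["I", "ate", "an", "apple"],
                              ["Another", "sentence"]])
print(character_ids.shape)  # torch.Size([2, 4, 50])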
def read_squad_word_char(file_path):
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars")
    }
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    # print(word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
def remove_tokens(self, attentions, metadata, threshold, labels):
    attentions_cpu = attentions.cpu().data.numpy()
    sentences = [x["tokens"] for x in metadata]
    instances = []
    for b in range(attentions_cpu.shape[0]):
        sentence = [x for x in sentences[b]]
        always_keep_mask = metadata[b]['always_keep_mask']
        # Push always-keep positions to the bottom of the attention ranking.
        attn = attentions_cpu[b][:len(sentence)] + always_keep_mask * -10000
        max_length = math.ceil((1 - always_keep_mask).sum() * threshold)
        # Drop the `max_length` most-attended tokens; keep the rest.
        top_ind = np.argsort(attn)[:-max_length]
        new_tokens = [
            x for i, x in enumerate(sentence)
            if i in top_ind or always_keep_mask[i] == 1
        ]
        instances += metadata[0]["convert_tokens_to_instance"](new_tokens, None)
    batch = Batch(instances)
    batch.index_instances(self._vocabulary)
    padding_lengths = batch.get_padding_lengths()
    batch = batch.as_tensor_dict(padding_lengths)
    return {k: v.to(attentions.device) for k, v in batch["document"].items()}
def read_squad_allennlp(file_path):
    '''read data, build vocab, batch, padding, to idx
    Args:
        file_path -- raw squad json file
    Returns:
        None
    '''
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars")}
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    for instance in instances:
        question = instance.fields['question']
        print(question)
        print(type(question))
        break
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    # print(word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
def collate_fn(data, to_gpu=False):
    """Creates mini-batch tensors."""
    images, instances = zip(*data)
    # images = torch.stack(images, 0)
    batch = Batch(instances)
    td = batch.as_tensor_dict()
    # For VL embedding.
    if 'question' in td:
        td['question_mask'] = get_text_field_mask(td['question'], num_wrapping_dims=1)
        td['question_tags'][td['question_mask'] == 0] = -2  # Padding

    td['answer_mask'] = get_text_field_mask(td['answers'], num_wrapping_dims=1)
    td['answer_tags'][td['answer_mask'] == 0] = -2

    td['box_mask'] = torch.all(td['boxes'] >= 0, -1).long()
    # td['images'] = images  # Deprecated
    # if to_gpu:
    #     for k in td:
    #         if k != 'metadata':
    #             td[k] = {k2: v.cuda(non_blocking=True) for k2, v in td[k].items()} \
    #                 if isinstance(td[k], dict) else td[k].cuda(non_blocking=True)
    #     # No nested dicts
    #     for k in sorted(td.keys()):
    #         if isinstance(td[k], dict):
    #             for k2 in sorted(td[k].keys()):
    #                 td['{}_{}'.format(k, k2)] = td[k].pop(k2)
    #             td.pop(k)
    return td
def data_instance_to_model_input(instance, model):
    dataset = Batch([instance])
    dataset.index_instances(model.vocab)
    cuda_device = model._get_prediction_device()
    model_input = move_to_device(dataset.as_tensor_dict(), cuda_device=cuda_device)
    return model_input
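# Sketch of the helper above in use. `reader` and `model` stand in for an
# already-constructed DatasetReader and trained Model (assumptions, not part of
# the original snippet).
instance = reader.text_to_instance("an example sentence")
model_input = data_instance_to_model_input(instance, model)
output_dict = model(**model_input)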
def get_answer():
    # Take user input and convert to Instance.
    user_context = request.args.get("context", "", type=str)
    user_question = request.args.get("question", "", type=str)
    input_instance = squad_reader.text_to_instance(
        question_text=user_question,
        passage_text=user_context)
    # Make a dataset from the instance.
    dataset = Batch([input_instance])
    dataset.index_instances(train_vocab)
    batch = dataset.as_tensor_dict()
    batch = move_to_device(batch, cuda_device=0 if cuda else -1)
    # Extract relevant data from batch.
    passage = batch["passage"]["tokens"]
    question = batch["question"]["tokens"]
    metadata = batch.get("metadata", {})
    # Run data through model to get start and end logits.
    output_dict = model(passage, question)
    start_logits = output_dict["start_logits"]
    end_logits = output_dict["end_logits"]
    # Compute the best span.
    best_span = get_best_span(start_logits, end_logits)
    # Get the string corresponding to the best span.
    passage_str = metadata[0]['original_passage']
    offsets = metadata[0]['token_offsets']
    predicted_span = tuple(best_span[0].data.cpu().numpy())
    start_offset = offsets[predicted_span[0]][0]
    end_offset = offsets[predicted_span[1]][1]
    best_span_string = passage_str[start_offset:end_offset]
    # Return the best string back to the GUI.
    return jsonify(answer=best_span_string)
def test_forward(self):
    batch_dialogues = Batch(self.instances)
    res = self.model.forward(
        **batch_dialogues.as_tensor_dict(batch_dialogues.get_padding_lengths()))
    print(res)
def test_offsets_with_tokenized_text_base(self, transformer_name):
    token_indexer = TransformerIndexer(model_name=transformer_name, do_lowercase=False)
    sent0 = "the quickest quick brown fox jumped over the lazy dog"
    sent1 = "the quick brown fox jumped over the laziest lazy elmo"
    sent0 = sent0.split()
    sent1 = sent1.split()
    tokens0 = [Token(token) for token in sent0]
    tokens1 = [Token(token) for token in sent1]
    vocab = Vocabulary()
    instance1 = Instance({"tokens": TextField(tokens0, {"transformer": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens1, {"transformer": token_indexer})})
    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    # Each token should be represented by exactly one sub-word piece, chosen as
    # that token's feature.
    assert len(tokens['transformer-offsets'][0]) == len(tokens0)
    assert len(tokens['transformer-offsets'][1]) == len(tokens1)
def test_encode_decode_with_raw_text_base(self, transformer_name):
    token_indexer = TransformerIndexer(model_name=transformer_name, do_lowercase=False)
    sent0 = "the quickest quick brown fox jumped over the lazy dog"
    sent1 = "the quick brown fox jumped over the laziest lazy elmo"
    vocab = Vocabulary()
    instance1 = Instance({
        "tokens": TextField([Token(sent0)], {"transformer": token_indexer})
    })
    instance2 = Instance({
        "tokens": TextField([Token(sent1)], {"transformer": token_indexer})
    })
    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    input_ids = tokens['transformer']
    input_ids_0 = [id.item() for id in input_ids[0]]
    input_ids_1 = [id.item() for id in input_ids[1]]
    # The original sentences should round-trip through the indexer unchanged.
    assert sent0 == token_indexer.tokenizer.decode(input_ids_0, skip_special_tokens=True)
    assert sent1 == token_indexer.tokenizer.decode(input_ids_1, skip_special_tokens=True)
def _regenerate_tokens(self, metadata, sample_z):
    sample_z_cpu = sample_z.cpu().data.numpy()
    tokens = [m["tokens"] for m in metadata]
    assert len(tokens) == len(sample_z_cpu)
    assert max([len(x) for x in tokens]) == sample_z_cpu.shape[1]
    instances = []
    new_tokens = []
    for words, mask, meta in zip(tokens, sample_z_cpu, metadata):
        mask = mask[:len(words)]
        new_words = [
            w for i, (w, m) in enumerate(zip(words, mask))
            if i == 0 or m == 1
        ]
        new_tokens.append(new_words)
        meta["new_tokens"] = new_tokens
        instance = metadata[0]["convert_tokens_to_instance"](new_words, None)
        instances += instance
    batch = Batch(instances)
    batch.index_instances(self._vocabulary)
    padding_lengths = batch.get_padding_lengths()
    batch = batch.as_tensor_dict(padding_lengths)
    return {k: v.to(sample_z.device) for k, v in batch["document"].items()}
def forward_on_instances(self,
                         instances: List[Instance],
                         cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
    """
    Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into arrays
    using this model's :class:`Vocabulary`, passes those arrays through :func:`self.forward()`
    and :func:`self.decode()` (which by default does nothing) and returns the result.  Before
    returning the result, we convert any ``torch.autograd.Variables`` or ``torch.Tensors``
    into numpy arrays and separate the batched output into a list of individual dicts per
    instance.  Note that typically this will be faster on a GPU (and conditionally, on a CPU)
    than repeated calls to :func:`forward_on_instance`.
    """
    dataset = Batch(instances)
    dataset.index_instances(self.vocab)
    model_input = dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)
    outputs = self.decode(self(**model_input))

    instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
    for name, output in list(outputs.items()):
        if isinstance(output, torch.autograd.Variable):
            output = output.data.cpu().numpy()
        outputs[name] = output
        for instance_output, batch_element in zip(instance_separated_output, output):
            instance_output[name] = batch_element
    return instance_separated_output
def test_forward_pass_runs_correctly(self):
    """
    Check to make sure a forward pass on an ensemble of two identical copies of a model yields
    the same results as the model itself.
    """
    bidaf_ensemble = BidafEnsemble([self.model, self.model])
    batch = Batch(self.instances)
    batch.index_instances(self.vocab)
    training_tensors = batch.as_tensor_dict()
    bidaf_output_dict = self.model(**training_tensors)
    ensemble_output_dict = bidaf_ensemble(**training_tensors)
    metrics = self.model.get_metrics(reset=True)
    # We've set up the data such that there's a fake answer that consists of the whole
    # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
    # zero, while if we somehow haven't been able to load the evaluation data, or there was an
    # error with using the evaluation script, this will fail.  This makes sure that we've
    # loaded the evaluation data correctly and have hooked things up to the official evaluation
    # script.
    assert metrics['f1'] > 0
    assert torch.equal(ensemble_output_dict['best_span'], bidaf_output_dict['best_span'])
    assert ensemble_output_dict['best_span_str'] == bidaf_output_dict['best_span_str']
def collate_fn(data):
    if isinstance(data[0], dict):
        for index, i in enumerate(data):
            if "image_feat_variable" in i:
                i["image_feat_variable"] = ArrayTensorField(i["image_feat_variable"])
                i["image_dim_variable"] = IntArrayTensorField(i["image_dim_variable"])
                i["visual_embeddings_type"] = IntArrayTensorField(i["visual_embeddings_type"])
            i["bert_input_ids"] = IntArrayTensorField(i["bert_input_ids"])
            i["bert_input_mask"] = IntArrayTensorField(i["bert_input_mask"])
            i["bert_input_type_ids"] = IntArrayTensorField(i["bert_input_type_ids"])
            if "masked_lm_labels" in i:
                i["masked_lm_labels"] = IntArrayTensorField(i["masked_lm_labels"],
                                                            padding_value=-1)
            if "is_random_next" in i:
                i["is_random_next"] = IntArrayTensorField(i["is_random_next"])
            i['label'] = IntArrayTensorField(i['label'])
            data[index] = Instance(i)
    batch = Batch(data)
    td = batch.as_tensor_dict()
    td["label"] = td["label"].squeeze(-1)
    return td
def elmo(ll):
    # `w`, `re`, `options_file`, and `weight_file` are assumed to be module-level globals.
    count = 0
    for k in ll:
        sen_list = w[k]
        count += 1
        sen_s = []
        for s in sen_list:
            sen_s.append(s.split())
        elmo = Elmo(options_file, weight_file, 1)
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        for sen in sen_s:
            tokens = [Token(token) for token in sen]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)
        dataset = Batch(instances)
        voca = Vocabulary()
        dataset.index_instances(voca)
        dic = {'elmo': {'num_tokens': 15}}
        character_ids = dataset.as_tensor_dict(dic)['elmo']['character_ids']
        # Take the single ELMo layer and split the batch into one tensor per sentence.
        result = elmo(character_ids)['elmo_representations'][0]
        sth = list(torch.chunk(result, result.shape[0], 0))
        re[k] = sth
def predict(instances: List[Instance]) -> List[float]:
    """Output BERT NSP next sentence probability for a list of instances.

    Parameters
    ----------
    instances : List[Instance]

    Returns
    -------
    List[float]
        BERT NSP scores in range [0, 1].
    """
    scores = []
    for batch_instance in tqdm(batch(instances, batch_size=args.batch_size),
                               total=math.ceil(len(instances) / args.batch_size),
                               desc='Predicting'):
        batch_ins = Batch(batch_instance)
        batch_ins.index_instances(VOCAB)
        tensor_dict = batch_ins.as_tensor_dict(batch_ins.get_padding_lengths())
        tokens = tensor_dict["tokens"]
        input_ids = tokens['bert'].to(torch.device(f'cuda:{GPU_ID}'))
        token_type_ids = tokens['bert-type-ids'].to(torch.device(f'cuda:{GPU_ID}'))
        input_mask = (input_ids != 0).long()
        cls_out = BERT_NEXT_SENTENCE.forward(input_ids=input_ids,
                                             token_type_ids=token_type_ids,
                                             attention_mask=input_mask)
        probs = F.softmax(cls_out, dim=-1)
        next_sentence_score = probs[:, 0].detach().cpu().numpy().tolist()
        scores += next_sentence_score
    return scores
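# predict() above relies on a `batch` helper that is not shown; a minimal sketch
# of what it is assumed to do (chunk a list of instances into fixed-size slices):
from typing import Iterator, List
from allennlp.data import Instance

def batch(items: List[Instance], batch_size: int) -> Iterator[List[Instance]]:
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]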
def instances_to_batch(instances, model, for_training, cuda_device=0):
    batch = Batch(instances)
    batch.index_instances(model.vocab)
    padding_lengths = batch.get_padding_lengths()
    return batch.as_tensor_dict(padding_lengths,
                                cuda_device=cuda_device,
                                for_training=for_training)
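# Sketch of the helper above in use at evaluation time; `model` and
# `dev_instances` are assumed to exist already.
tensors = instances_to_batch(dev_instances, model, for_training=False, cuda_device=-1)
output_dict = model(**tensors)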
def test_forward_pass_runs_correctly(self):
    u"""
    Check to make sure a forward pass on an ensemble of two identical copies of a model yields
    the same results as the model itself.
    """
    bidaf_ensemble = BidafEnsemble([self.model, self.model])
    batch = Batch(self.instances)
    batch.index_instances(self.vocab)
    training_tensors = batch.as_tensor_dict()
    bidaf_output_dict = self.model(**training_tensors)
    ensemble_output_dict = bidaf_ensemble(**training_tensors)
    metrics = self.model.get_metrics(reset=True)
    # We've set up the data such that there's a fake answer that consists of the whole
    # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
    # zero, while if we somehow haven't been able to load the evaluation data, or there was an
    # error with using the evaluation script, this will fail.  This makes sure that we've
    # loaded the evaluation data correctly and have hooked things up to the official evaluation
    # script.
    assert metrics[u'f1'] > 0
    assert torch.equal(ensemble_output_dict[u'best_span'], bidaf_output_dict[u'best_span'])
    assert ensemble_output_dict[u'best_span_str'] == bidaf_output_dict[u'best_span_str']
def test_forward_pass_runs_correctly(self):
    batch = Batch(self.instances)
    batch.index_instances(self.vocab)
    training_tensors = batch.as_tensor_dict()
    output_dict = self.model(**training_tensors)
    metrics = self.model.get_metrics(reset=True)
    # We've set up the data such that there's a fake answer that consists of the whole
    # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
    # zero, while if we somehow haven't been able to load the evaluation data, or there was an
    # error with using the evaluation script, this will fail.  This makes sure that we've
    # loaded the evaluation data correctly and have hooked things up to the official evaluation
    # script.
    assert metrics["per_instance_f1"] > 0
    span_start_probs = output_dict["span_start_probs"][0].data.numpy()
    span_end_probs = output_dict["span_end_probs"][0].data.numpy()
    assert_almost_equal(numpy.sum(span_start_probs, -1), 1, decimal=6)
    assert_almost_equal(numpy.sum(span_end_probs, -1), 1, decimal=6)
    span_start, span_end = tuple(output_dict["best_span"][0].data.numpy())
    assert span_start >= 0
    assert span_start <= span_end
    assert span_end < self.instances[0].fields["question_with_context"].sequence_length()
    assert isinstance(output_dict["best_span_str"][0], str)
def test_padding_for_equal_length_indices(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    #            2   3     5     6   8      9    2   14   12
    sentence = "the quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)
    vocab = Vocabulary()
    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})
    batch = Batch([instance])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    assert tokens["bert"].tolist() == [
        [16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]
    ]
    assert tokens["bert-offsets"].tolist() == [
        [1, 2, 3, 4, 5, 6, 7, 8, 9]
    ]
def batch_to_ids(self, stories_tokenized: List[List[str]]):
    """
    Simple wrapper around _elmo_batch_to_ids.

    :param stories_tokenized: A list of tokenized stories.
    :return: A tensor of padded character ids.
    """
    batch = Batch([
        Instance({
            'story': TextField(
                [Token('@@bos@@')] + [Token(x) for x in story] + [Token('@@eos@@')],
                token_indexers={
                    'tokens': SingleIdTokenIndexer(namespace='tokens', lowercase_tokens=True)
                })
        }) for story in stories_tokenized
    ])
    batch.index_instances(self.vocab)
    words = {
        k: v['tokens']
        for k, v in batch.as_tensor_dict(for_training=self.training).items()
    }['story'].cuda(non_blocking=True)  # `async` is a reserved word on Python 3.7+
    return words
def my_collate(batch, vocab):
    questions = Batch([x[0] for x in batch])
    questions.index_instances(vocab)
    rest = [x[1:] for x in batch]
    question_batch = questions.as_tensor_dict()["question"]["tokens"]
    image_batch, answer_batch = default_collate(rest)
    return [(question_batch, image_batch), answer_batch]
def test_read(self, lazy):
    reader = GLUESST2DatasetReader(
        tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
        token_indexers={'bert': PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)},
        skip_label_indexing=False)
    instances = reader.read(str(self.FIXTURES_ROOT / 'dev.tsv'))
    instances = ensure_list(instances)
    example = instances[0]
    tokens = [t.text for t in example.fields['tokens']]
    label = example.fields['label'].label
    print(label)
    print(tokens)
    batch = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    print(tokens['mask'].tolist()[0])
    print(tokens["bert"].tolist()[0])
    print([vocab.get_token_from_index(i, "bert") for i in tokens["bert"].tolist()[0]])
    print(len(tokens['bert'][0]))
    print(tokens["bert-offsets"].tolist()[0])
    print(tokens['bert-type-ids'].tolist()[0])
def _strings_to_batch(self,
                      source_tokens: List[List[str]],
                      target_tokens: Dict[str, torch.Tensor],
                      target_golden: Dict[str, torch.Tensor],
                      lang_pair: str):
    """
    Converts a list of sentences, each of which is itself a list of strings, into a Batch
    suitable for passing into the model's forward function.

    TODO: Make sure the right device (CPU/GPU) is used. Predicted tokens might get copied on
    CPU in `self.decode` method...
    """
    # Convert source tokens into a source tensor_dict.
    instances = []
    lang_pairs = []
    for sentence in source_tokens:
        sentence = " ".join(sentence)
        instances.append(self._reader.string_to_instance(sentence))
        lang_pairs.append(lang_pair)

    source_batch = Batch(instances)
    source_batch.index_instances(self.vocab)
    source_batch = source_batch.as_tensor_dict()
    model_input = {
        "source_tokens": source_batch["tokens"],
        "target_golden": target_golden,
        "target_tokens": target_tokens,
        "lang_pair": lang_pairs
    }
    return model_input
def test_sliding_window_with_batch(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)
    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=False,
                                          max_pieces=8)
    config_path = self.FIXTURES_ROOT / 'bert' / 'config.json'
    config = BertConfig(str(config_path))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)
    instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens + tokens + tokens,
                                              {"bert": token_indexer})})
    batch = Batch([instance, instance2])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert bert_vectors is not None
def test_padding_for_equal_length_indices(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    #            2   3     5     6   8      9    2   14   12
    sentence = "the quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)
    vocab = Vocabulary()
    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})
    batch = Batch([instance])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    assert tokens["bert"].tolist() == [
        [2, 3, 5, 6, 8, 9, 2, 14, 12]
    ]
    assert tokens["bert-offsets"].tolist() == [
        [0, 1, 2, 3, 4, 5, 6, 7, 8]
    ]
def forward_on_instances(self,
                         instances: List[Instance],
                         cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
    dataset = Batch(instances)
    dataset.index_instances(self.vocab)
    # Use the requested device rather than hard-coding GPU 0.
    model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
    # model_input.update({'instances': instances})
    model_input.update({'predict': True})
    # del model_input["source_tokens_raw"]
    # del model_input["source_tokens"]
    # del model_input["instances"]
    outputs = self.decode(self(**model_input))
    # print(outputs)
    instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
    for name, output in list(outputs.items()):
        outputs[name] = output
        for instance_output, batch_element in zip(instance_separated_output, output):
            instance_output[name] = batch_element
    return instance_separated_output
def batch_to_ids(batch):
    u"""
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded
    characters (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
    A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {u'character_ids': indexer})
        instance = Instance({u"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()[u'elmo'][u'character_ids']
def ensure_batch_predictions_are_consistent(self, keys_to_ignore: Iterable[str] = ()):
    """
    Ensures that the model performs the same on a batch of instances as on individual instances.
    Ignores metrics matching the regexp .*loss.* and those specified explicitly.

    Parameters
    ----------
    keys_to_ignore : ``Iterable[str]``, optional (default=())
        Names of metrics that should not be taken into account, e.g. "batch_weight".
    """
    self.model.eval()
    single_predictions = []
    for i, instance in enumerate(self.instances):
        dataset = Batch([instance])
        tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
        result = self.model(**tensors)
        single_predictions.append(result)
    full_dataset = Batch(self.instances)
    batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
    batch_predictions = self.model(**batch_tensors)
    for i, instance_predictions in enumerate(single_predictions):
        for key, single_predicted in instance_predictions.items():
            tolerance = 1e-6
            if 'loss' in key:
                # Loss is particularly unstable; we'll just be satisfied if everything else is
                # close.
                continue
            if key in keys_to_ignore:
                continue
            single_predicted = single_predicted[0]
            batch_predicted = batch_predictions[key][i]
            if isinstance(single_predicted, torch.Tensor):
                if single_predicted.size() != batch_predicted.size():
                    slices = tuple(slice(0, size) for size in single_predicted.size())
                    batch_predicted = batch_predicted[slices]
                assert_allclose(single_predicted.data.numpy(),
                                batch_predicted.data.numpy(),
                                atol=tolerance,
                                err_msg=key)
            else:
                assert single_predicted == batch_predicted, key
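# Typical use of the consistency check above from an AllenNLP ModelTestCase
# subclass; the fixture paths below are placeholders, not paths from the
# original code.
from allennlp.common.testing import ModelTestCase

class MyModelTest(ModelTestCase):
    def setUp(self):
        super().setUp()
        self.set_up_model("fixtures/experiment.json", "fixtures/data.jsonl")

    def test_batch_predictions_are_consistent(self):
        self.ensure_batch_predictions_are_consistent(keys_to_ignore=["batch_weight"])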
def test_as_tensor_dict(self):
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    padding_lengths = dataset.get_padding_lengths()
    tensors = dataset.as_tensor_dict(padding_lengths)
    text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
    text2 = tensors["text2"]["tokens"].detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                [1, 3, 4, 5, 6]]))
    numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                [2, 3, 1, 0, 0, 0]]))
def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    training_tensors = dataset.as_tensor_dict()
    output_dict = self.model(**training_tensors)
    tags = output_dict['tags']
    assert len(tags) == 2
    assert len(tags[0]) == 7
    assert len(tags[1]) == 7
    for example_tags in tags:
        for tag_id in example_tags:
            tag = self.model.vocab.get_token_from_index(tag_id, namespace="labels")
            assert tag in {'O', 'I-ORG', 'I-PER', 'I-LOC'}
def test_squad_with_unwordpieceable_passage(self):
    # pylint: disable=line-too-long
    tokenizer = WordTokenizer()
    token_indexer = PretrainedBertIndexer("bert-base-uncased")

    passage1 = ("There were four major HDTV systems tested by SMPTE in the late 1970s, "
                "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:")
    question1 = "Who released A Study of High Definition Television Systems?"

    passage2 = ("Broca, being what today would be called a neurosurgeon, "
                "had taken an interest in the pathology of speech. He wanted "
                "to localize the difference between man and the other animals, "
                "which appeared to reside in speech. He discovered the speech "
                "center of the human brain, today called Broca's area after him. "
                "His interest was mainly in Biological anthropology, but a German "
                "philosopher specializing in psychology, Theodor Waitz, took up the "
                "theme of general and social anthropology in his six-volume work, "
                "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
                """soon translated as "The Anthropology of Primitive Peoples". """
                "The last two volumes were published posthumously.")
    question2 = "What did Broca discover in the human brain?"

    from allennlp.data.dataset_readers.reading_comprehension.util import make_reading_comprehension_instance

    instance1 = make_reading_comprehension_instance(tokenizer.tokenize(question1),
                                                    tokenizer.tokenize(passage1),
                                                    {"bert": token_indexer},
                                                    passage1)
    instance2 = make_reading_comprehension_instance(tokenizer.tokenize(question2),
                                                    tokenizer.tokenize(passage2),
                                                    {"bert": token_indexer},
                                                    passage2)
    vocab = Vocabulary()
    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    qtokens = tensor_dict["question"]
    ptokens = tensor_dict["passage"]

    config = BertConfig(len(token_indexer.vocab))
    model = BertModel(config)
    embedder = BertEmbedder(model)

    _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
    _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
def forward_on_instances(self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
    """
    Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into arrays
    using this model's :class:`Vocabulary`, passes those arrays through :func:`self.forward()`
    and :func:`self.decode()` (which by default does nothing) and returns the result.  Before
    returning the result, we convert any ``torch.Tensors`` into numpy arrays and separate the
    batched output into a list of individual dicts per instance.  Note that typically this
    will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
    :func:`forward_on_instance`.

    Parameters
    ----------
    instances : List[Instance], required
        The instances to run the model on.

    Returns
    -------
    A list of the model's output for each instance.
    """
    batch_size = len(instances)
    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.Tensor):
                # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
                # This occurs with batch size 1, because we still want to include the loss in
                # that case.
                if output.dim() == 0:
                    output = output.unsqueeze(0)
                if output.size(0) != batch_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue
                output = output.detach().cpu().numpy()
            elif len(output) != batch_size:
                self._maybe_warn_for_unseparable_batches(name)
                continue
            outputs[name] = output
            for instance_output, batch_element in zip(instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
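# Usage sketch for forward_on_instances above; `reader` and `model` are assumed
# to be an already-built DatasetReader and trained Model. Each element of the
# returned list is a dict of per-instance numpy arrays (e.g. 'logits', 'loss').
instances = [reader.text_to_instance("first input"),
             reader.text_to_instance("second input")]
for instance_output in model.forward_on_instances(instances):
    print(sorted(instance_output.keys()))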
def _sentences_to_ids(self, sentences):
    indexer = ELMoTokenCharactersIndexer()
    # For each sentence, first create a TextField, then create an Instance.
    instances = []
    for sentence in sentences:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({'elmo': field})
        instances.append(instance)
    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
def test_end_to_end(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    #             2   3    4     3     5     6   8      9    2   14   12
    sentence1 = "the quickest quick brown fox jumped over the lazy dog"
    tokens1 = tokenizer.tokenize(sentence1)
    #             2   3     5     6   8      9    2   15 10 11  14   1
    sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
    tokens2 = tokenizer.tokenize(sentence2)
    vocab = Vocabulary()
    instance1 = Instance({"tokens": TextField(tokens1, {"bert": self.token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": self.token_indexer})})
    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    assert tokens["bert"].tolist() == [
        [2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 0],
        [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
    ]
    assert tokens["bert-offsets"].tolist() == [
        [0, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        [0, 1, 2, 3, 4, 5, 6, 9, 10, 11]
    ]
    # No offsets, should get 12 vectors back.
    bert_vectors = self.token_embedder(tokens["bert"])
    assert list(bert_vectors.shape) == [2, 12, 12]
    # Offsets, should get 10 vectors back.
    bert_vectors = self.token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]
    # Now try top_layer_only = True.
    tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
    bert_vectors = tlo_embedder(tokens["bert"])
    assert list(bert_vectors.shape) == [2, 12, 12]
    bert_vectors = tlo_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]
def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    indexer2 = SingleIdTokenIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer, 'tokens': indexer2})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()["elmo"]
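# Usage sketch for the helper above: index the same sentences with both ELMo
# character ids and single-id token ids.
vocab, tensor_dict = get_vocab_and_both_elmo_indexed_ids([["a", "sentence"],
                                                          ["another", "one"]])
print(tensor_dict["character_ids"].shape)  # (2, 2, 50)
print(tensor_dict["tokens"].shape)         # (2, 2)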
def test_max_length(self):
    config = BertConfig(len(self.token_indexer.vocab))
    model = BertModel(config)
    embedder = BertEmbedder(model)

    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the " * 1000
    tokens = tokenizer.tokenize(sentence)
    vocab = Vocabulary()
    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})
    batch = Batch([instance])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    embedder(tokens["bert"], tokens["bert-offsets"])
def test_forward_pass_runs_correctly(self):
    batch = Batch(self.instances)
    batch.index_instances(self.vocab)
    training_tensors = batch.as_tensor_dict()
    output_dict = self.model(**training_tensors)
    metrics = self.model.get_metrics(reset=True)
    # We've set up the data such that there's a fake answer that consists of the whole
    # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
    # zero, while if we somehow haven't been able to load the evaluation data, or there was an
    # error with using the evaluation script, this will fail.  This makes sure that we've
    # loaded the evaluation data correctly and have hooked things up to the official evaluation
    # script.
    assert metrics['f1'] > 0
    span_start_probs = output_dict['span_start_probs'][0].data.numpy()
    span_end_probs = output_dict['span_end_probs'][0].data.numpy()
    assert_almost_equal(numpy.sum(span_start_probs, -1), 1, decimal=6)
    assert_almost_equal(numpy.sum(span_end_probs, -1), 1, decimal=6)
    span_start, span_end = tuple(output_dict['best_span'][0].data.numpy())
    assert span_start >= 0
    assert span_start <= span_end
    assert span_end < self.instances[0].fields['passage'].sequence_length()
    assert isinstance(output_dict['best_span_str'][0], str)
                                          char_spans=char_spans)
    print("Keys instance: ", instance.fields.keys())
    # Batch instances and convert to indices using the vocabulary.
    instances = [instance]
else:
    instances = [train_dataset[0], train_dataset[1]]

## Create the batch ready to be used.
dataset = Batch(instances)
dataset.index_instances(vocab)

print("-------------- DATASET EXAMPLE ---------------")
# Compute the tensor dict once and reuse it, instead of re-padding for every field.
tensor_dict = dataset.as_tensor_dict()
character_ids_passage = tensor_dict['passage']['character_ids']
character_ids_question = tensor_dict['question']['character_ids']
question = tensor_dict['question']
passage = tensor_dict['passage']
span_start = tensor_dict['span_start']
span_end = tensor_dict['span_end']
metadata = tensor_dict['metadata']

print("Shape of characters ids passage: ", character_ids_passage.shape)
print("Shape of characters ids question: ", character_ids_question.shape)
print("Batch size: ", character_ids_passage.shape[0])
print("Maximum num words in batch: ", character_ids_passage.shape[1])
print("Maximum word length in dictionary: ", character_ids_passage.shape[2])
def _get_training_tensors(self):
    dataset = Batch(self.instances)
    dataset.index_instances(self.vocab)
    return dataset.as_tensor_dict()
if create_video_training:
    pf.create_image_weights_epoch(model, video_fotograms_folder2, i)
    pf.create_Bayesian_analysis_charts_simplified(model,
                                                  train_dataset, validation_dataset,
                                                  tr_data_loss, val_data_loss,
                                                  KL_loss,
                                                  video_fotograms_folder4, i + 1)

# output = model(tensor_dict["text_field"], tensor_dict["tags_field"])
# loss = output["loss"]  # We can get the loss because we gave the labels as input:
#                        # gradient and everything.

"""
############## Use the trained model ######################
We use an already implemented predictor that takes the model and how to
preprocess the data.
"""
name_example = "Eat my motherfucking jeans"
name_example = "Carlos Sanchez"

tokens_list = [name_example[i] for i in range(len(name_example))]
Instance_test = reader.generate_instance(tokens_list, None)

batch = Batch([Instance_test])
batch.index_instances(vocab)
padding_lengths = batch.get_padding_lengths()
tensor_dict = batch.as_tensor_dict(padding_lengths)

model.eval()
tag_logits = model(tensor_dict["text_field"])['tag_logits'].detach().cpu().numpy()
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'tags_country') for i in tag_ids])
############ Propagate an instance text #############
"""
instance = dataset_reader.text_to_instance(
    "What kind of test succeeded on its first attempt?",
    "One time I was writing a unit test, and it succeeded on the first attempt.",
    char_spans=[(6, 10)])
print("Keys instance: ", instance.fields.keys())

# Batch instances and convert to indices using the vocabulary.
instances = [instance]
dataset = Batch(instances)
dataset.index_instances(model.vocab)

# Create the index tensors from the vocabulary.
cuda_device = model._get_prediction_device()
model_input = dataset.as_tensor_dict(cuda_device=cuda_device)

# Propagate the sample and obtain the loss (since we passed labels).
outputs = model(**model_input)
print(outputs["loss"].requires_grad)
## Create an empty vocabulary! We do not need to build one from the dataset;
## ELMo relies entirely on its indexer.
vocab = Vocabulary()

## Index the instances in the batch; this will be used later by ELMo.
dataset.index_instances(vocab)

"""
IMPORTANT: ELMo only uses a character vocabulary at its interface; it computes
the rest internally. ELMo words are padded to length 50!
"""
character_ids = dataset.as_tensor_dict()['elmo']['character_ids']

print("Shape of characters ids: ", character_ids.shape)
print("Batch size: ", character_ids.shape[0])
print("Maximum num words in batch: ", character_ids.shape[1])
print("Maximum word length in dictionary: ", character_ids.shape[2])

# character_ids = batch_to_ids(sentences)

"""
Compute the embeddings from the character ids.
"""
embeddings = elmo(character_ids)
layer_1_values = embeddings["elmo_representations"][0]
layer_2_values = embeddings["elmo_representations"][1]

print("Layer 1 representations: ", layer_1_values.shape)
print("Layer 2 representations: ", layer_2_values.shape)