def ensure_batch_predictions_are_consistent(self):
     self.model.eval()
     single_predictions = []
     for i, instance in enumerate(self.instances):
         dataset = Batch([instance])
         tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(), for_training=False)
         result = self.model(**tensors)
         single_predictions.append(result)
     full_dataset = Batch(self.instances)
     batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths(), for_training=False)
     batch_predictions = self.model(**batch_tensors)
     for i, instance_predictions in enumerate(single_predictions):
         for key, single_predicted in instance_predictions.items():
             tolerance = 1e-6
             if key == 'loss':
                 # Loss is particularly unstable; we'll just be satisfied if everything else is
                 # close.
                 continue
             single_predicted = single_predicted[0]
             batch_predicted = batch_predictions[key][i]
             if isinstance(single_predicted, torch.autograd.Variable):
                 if single_predicted.size() != batch_predicted.size():
                     slices = tuple(slice(0, size) for size in single_predicted.size())
                     batch_predicted = batch_predicted[slices]
                 assert_allclose(single_predicted.data.numpy(),
                                 batch_predicted.data.numpy(),
                                 atol=tolerance,
                                 err_msg=key)
             else:
                 assert single_predicted == batch_predicted, key
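A hedged sketch of how a consistency check like the one above is typically invoked; ModelTestCase and the fixture paths are assumptions, not part of the original snippet.

# Minimal sketch, assuming an AllenNLP-style test harness; fixture paths are placeholders.
from allennlp.common.testing import ModelTestCase

class MyModelTest(ModelTestCase):
    def setUp(self):
        super().setUp()
        # set_up_model(param_file, dataset_file) populates self.model, self.vocab
        # and self.instances used by ensure_batch_predictions_are_consistent().
        self.set_up_model("fixtures/experiment.json", "fixtures/data.jsonl")

    def test_batch_predictions_are_consistent(self):
        self.ensure_batch_predictions_are_consistent()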
Example #2
 def ensure_batch_predictions_are_consistent(self):
     self.model.eval()
     single_predictions = []
     for i, instance in enumerate(self.instances):
         dataset = Batch([instance])
         tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
         result = self.model(**tensors)
         single_predictions.append(result)
     full_dataset = Batch(self.instances)
     batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
     batch_predictions = self.model(**batch_tensors)
     for i, instance_predictions in enumerate(single_predictions):
         for key, single_predicted in instance_predictions.items():
             tolerance = 1e-6
             if key == 'loss':
                 # Loss is particularly unstable; we'll just be satisfied if everything else is
                 # close.
                 continue
             single_predicted = single_predicted[0]
             batch_predicted = batch_predictions[key][i]
             if isinstance(single_predicted, torch.Tensor):
                 if single_predicted.size() != batch_predicted.size():
                     slices = tuple(slice(0, size) for size in single_predicted.size())
                     batch_predicted = batch_predicted[slices]
                 assert_allclose(single_predicted.data.numpy(),
                                 batch_predicted.data.numpy(),
                                 atol=tolerance,
                                 err_msg=key)
             else:
                 assert single_predicted == batch_predicted, key
Example #3
    def forward_on_instance(self, instance: SyncedFieldsInstance) -> str:
        """
        Takes an :class:`~allennlp.data.instance.Instance`, which typically has raw text in it,
        converts that text into arrays using this model's :class:`Vocabulary`, runs beam search
        decoding over those arrays, and returns the decoded output as a single string with the
        START and END symbols stripped.
        """
        cuda_device = self._get_prediction_device()
        dataset = Batch([instance])
        dataset.index_instances(self.vocab)

        gt_has_oov = False
        dataset_tensor_dict = dataset.as_tensor_dict()
        if self.OOV_ID in dataset_tensor_dict["target_tokens"]["ids_with_unks"]:
            # Note: gt_has_oov is computed but not used further in this method.
            gt_has_oov = True

        model_input = util.move_to_device(dataset_tensor_dict, cuda_device)
        output_ids = self.beam_search_decode(**model_input)

        output_words = []
        for _id in output_ids:
            if _id < self.vocab_size:
                output_words.append(self.vocab.get_token_from_index(_id))
            else:
                output_words.append(instance.oov_list[_id - self.vocab_size])

        assert output_words[0] == START_SYMBOL, "the first decoded symbol is not the START symbol; this might be a bug"
        output_words = output_words[1:]
        if output_words[-1] == END_SYMBOL:
            output_words = output_words[:-1]
        return " ".join(output_words)
    def forward_on_instances(self,
                             instances: List[Instance],
                             cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
        model_input = {}
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        if self._pointer_gen:
            model_input.update({'raw':dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)})
            #extend
            extend_vocab = Vocabulary.from_instances(dataset.instances)
            self.vocab.extend_from(extend_vocab)
            dataset.index_instances(self.vocab)
            model_input.update({'extended':dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)})
        else:
            model_input = dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)
        #input
        model_input.update({'instances':instances})
        model_input.update({'predict':True})
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.autograd.Variable):
                output = output.data.cpu().numpy()
            outputs[name] = output
            for instance_output, batch_element in zip(instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
Example #5
    def collate_fn(data):
        if isinstance(data[0], Instance):
            batch = Batch(data)
            td = batch.as_tensor_dict()
            return td
        else:
            images, instances = zip(*data)
            images = torch.stack(images, 0)

            batch = Batch(instances)
            td = batch.as_tensor_dict()
            td['box_mask'] = torch.all(td['boxes'] >= 0, -1).long()
            td['images'] = images
            return td
 def forward(self, tree: Tree,
             label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
     str_phase_holder = []
     self.collect_phase(tree, str_phase_holder)
     # tokenize and elmo tokenize
     instances = [self.text_to_instance(phase) for phase in str_phase_holder]
     idx, instances = sort_by_padding(instances, [("tokens", "num_tokens")], self.vocab)
     batch = Batch(instances)
     pad_lengths = batch.get_padding_lengths()
     tensor_dict = batch.as_tensor_dict(pad_lengths)
     tensor_dict = move_to_device(tensor_dict, 0)
     output = self.biattentive_cell(**tensor_dict)
     # alert reshape the result to [length, comp, gaussian]
     # alert here is ugly
     batch_size, labels = output['weight'].size()
     labels = labels // self.component_num
     output['weight'] = output['weight'].reshape(batch_size, labels, self.component_num)
     output['mu'] = output['mu'].reshape(batch_size, labels, self.component_num, self.gaussian_dim)
     output['var'] = output['var'].reshape(batch_size, labels, self.component_num, self.gaussian_dim)
     # resort output result
     new_idx = [i for i in range(len(instances))]
     for pos, name in enumerate(idx):
         new_idx[name] = pos
     for name, tensor in output.items():
         output[name] = torch.stack([tensor[i] for i in new_idx])
     return output
Example #7
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens,
                          {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
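A minimal usage sketch for batch_to_ids, assuming locally available ELMo options/weights files (the paths below are placeholders, not real files).

# Hedged usage sketch; paths are placeholders.
from allennlp.modules.elmo import Elmo

options_file = "elmo_options.json"
weight_file = "elmo_weights.hdf5"
elmo = Elmo(options_file, weight_file, num_output_representations=1)

character_ids = batch_to_ids([["The", "quick", "brown", "fox"],
                              ["A", "lazy", "dog"]])
# character_ids has shape (batch_size, max_sentence_length, 50).
embeddings = elmo(character_ids)["elmo_representations"][0]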
Example #8
def read_squad_word_char(file_path):
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars")
    }
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    #print (word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
Example #9
    def remove_tokens(self, attentions, metadata, threshold, labels):
        attentions_cpu = attentions.cpu().data.numpy()
        sentences = [x["tokens"] for x in metadata]
        instances = []
        for b in range(attentions_cpu.shape[0]):
            sentence = [x for x in sentences[b]]
            always_keep_mask = metadata[b]['always_keep_mask']
            attn = attentions_cpu[b][:len(sentence)] + always_keep_mask * -10000
            max_length = math.ceil((1 - always_keep_mask).sum() * threshold)

            top_ind = np.argsort(attn)[:-max_length]
            new_tokens = [
                x for i, x in enumerate(sentence)
                if i in top_ind or always_keep_mask[i] == 1
            ]
            instances += metadata[0]["convert_tokens_to_instance"](new_tokens,
                                                                   None)

        batch = Batch(instances)
        batch.index_instances(self._vocabulary)
        padding_lengths = batch.get_padding_lengths()

        batch = batch.as_tensor_dict(padding_lengths)
        return {
            k: v.to(attentions.device)
            for k, v in batch["document"].items()
        }
Example #10
def read_squad_allennlp(file_path):
    '''read data, build vocab, batch, padding, to idx
    Args:
        file_path -- raw squad json file
    Returns:
        None
    '''
    token_indexers = {
            "tokens": SingleIdTokenIndexer(namespace="token_ids"),
            "chars": TokenCharactersIndexer(namespace="token_chars")}
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    for instance in instances:
        question = instance.fields['question']
        print (question)
        print (type(question))
        break
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    #print (word2idx)
    print (len(word2idx))
    print (len(char2idx))
    print (char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print (padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print (tensor_dict['passage']['tokens'].shape)
    print (tensor_dict['passage']['chars'].shape)
    print (tensor_dict['question']['tokens'].shape)
    print (tensor_dict['question']['chars'].shape)
    print (tensor_dict['span_start'].shape)
    print (tensor_dict['span_end'].shape)
Example #11
def collate_fn(data, to_gpu=False):
    """Creates mini-batch tensors
    """
    images, instances = zip(*data)
    # images = torch.stack(images, 0)
    batch = Batch(instances)
    td = batch.as_tensor_dict()

    #for vl embedding
    if 'question' in td:
        td['question_mask'] = get_text_field_mask(td['question'],
                                                  num_wrapping_dims=1)
        td['question_tags'][td['question_mask'] == 0] = -2  # Padding

    td['answer_mask'] = get_text_field_mask(td['answers'], num_wrapping_dims=1)
    td['answer_tags'][td['answer_mask'] == 0] = -2

    td['box_mask'] = torch.all(td['boxes'] >= 0, -1).long()
    # td['images'] = images

    # Deprecated
    # if to_gpu:
    #     for k in td:
    #         if k != 'metadata':
    #             td[k] = {k2: v.cuda(non_blocking=True) for k2, v in td[k].items()} if isinstance(td[k], dict) else td[k].cuda(
    #             non_blocking=True)

    # # No nested dicts
    # for k in sorted(td.keys()):
    #     if isinstance(td[k], dict):
    #         for k2 in sorted(td[k].keys()):
    #             td['{}_{}'.format(k, k2)] = td[k].pop(k2)
    #         td.pop(k)

    return td
Example #12
def data_instance_to_model_input(instance, model):
    dataset = Batch([instance])
    dataset.index_instances(model.vocab)
    cuda_device = model._get_prediction_device()
    model_input = move_to_device(dataset.as_tensor_dict(),
                                 cuda_device=cuda_device)
    return model_input
Example #13
    def get_answer():
        # Take user input and convert to Instance
        user_context = request.args.get("context", "", type=str)
        user_question = request.args.get("question", "", type=str)
        input_instance = squad_reader.text_to_instance(
            question_text=user_question, passage_text=user_context)
        # Make a dataset from the instance
        dataset = Batch([input_instance])
        dataset.index_instances(train_vocab)
        batch = dataset.as_tensor_dict()
        batch = move_to_device(batch, cuda_device=0 if cuda else -1)
        # Extract relevant data from batch.
        passage = batch["passage"]["tokens"]
        question = batch["question"]["tokens"]
        metadata = batch.get("metadata", {})

        # Run data through model to get start and end logits.
        output_dict = model(passage, question)
        start_logits = output_dict["start_logits"]
        end_logits = output_dict["end_logits"]

        # Compute the best span
        best_span = get_best_span(start_logits, end_logits)

        # Get the string corresponding to the best span
        passage_str = metadata[0]['original_passage']
        offsets = metadata[0]['token_offsets']
        predicted_span = tuple(best_span[0].data.cpu().numpy())
        start_offset = offsets[predicted_span[0]][0]
        end_offset = offsets[predicted_span[1]][1]
        best_span_string = passage_str[start_offset:end_offset]

        # Return the best string back to the GUI
        return jsonify(answer=best_span_string)
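A hedged sketch of how an endpoint like get_answer might be registered with Flask; the app object and the model, reader, and vocabulary used inside get_answer are assumed to be created elsewhere in the original script.

# Sketch only; `get_answer` closes over model, squad_reader, train_vocab and cuda.
from flask import Flask

app = Flask(__name__)
app.add_url_rule("/answer", view_func=get_answer, methods=["GET"])

if __name__ == "__main__":
    app.run(port=5000)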
    def test_forward(self):
        batch_dialogues = Batch(self.instances)

        res = self.model.forward(**batch_dialogues.as_tensor_dict(
            batch_dialogues.get_padding_lengths()))

        print(res)
Example #15
    def test_offsets_with_tokenized_text_base(self, transformer_name):
        token_indexer = TransformerIndexer(model_name=transformer_name,
                                           do_lowercase=False)
        sent0 = "the quickest quick brown fox jumped over the lazy dog"
        sent1 = "the quick brown fox jumped over the laziest lazy elmo"

        sent0 = sent0.split()
        sent1 = sent1.split()

        tokens0 = [Token(token) for token in sent0]
        tokens1 = [Token(token) for token in sent1]

        vocab = Vocabulary()

        instance1 = Instance(
            {"tokens": TextField(tokens0, {"transformer": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens1, {"transformer": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # Each token should be represented by exactly one sub-word piece as its feature
        assert len(tokens['transformer-offsets'][0]) == len(tokens0)
        assert len(tokens['transformer-offsets'][1]) == len(tokens1)
Example #16
    def test_encode_decode_with_raw_text_base(self, transformer_name):
        token_indexer = TransformerIndexer(model_name=transformer_name,
                                           do_lowercase=False)
        sent0 = "the quickest quick brown fox jumped over the lazy dog"
        sent1 = "the quick brown fox jumped over the laziest lazy elmo"

        vocab = Vocabulary()

        instance1 = Instance({
            "tokens":
            TextField([Token(sent0)], {"transformer": token_indexer})
        })
        instance2 = Instance({
            "tokens":
            TextField([Token(sent1)], {"transformer": token_indexer})
        })

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        input_ids = tokens['transformer']
        input_ids_0 = [id.item() for id in input_ids[0]]
        input_ids_1 = [id.item() for id in input_ids[1]]
        # The original sentence should match the sentence decoded back from the indexed ids
        assert sent0 == token_indexer.tokenizer.decode(
            input_ids_0, skip_special_tokens=True)
        assert sent1 == token_indexer.tokenizer.decode(
            input_ids_1, skip_special_tokens=True)
Example #17
    def _regenerate_tokens(self, metadata, sample_z):
        sample_z_cpu = sample_z.cpu().data.numpy()
        tokens = [m["tokens"] for m in metadata]

        assert len(tokens) == len(sample_z_cpu)
        assert max([len(x) for x in tokens]) == sample_z_cpu.shape[1]

        instances = []
        new_tokens = []
        for words, mask, meta in zip(tokens, sample_z_cpu, metadata):
            mask = mask[:len(words)]
            new_words = [
                w for i, (w, m) in enumerate(zip(words, mask))
                if i == 0 or m == 1
            ]

            new_tokens.append(new_words)
            meta["new_tokens"] = new_tokens
            instance = metadata[0]["convert_tokens_to_instance"](new_words,
                                                                 None)
            instances += instance

        batch = Batch(instances)
        batch.index_instances(self._vocabulary)
        padding_lengths = batch.get_padding_lengths()

        batch = batch.as_tensor_dict(padding_lengths)
        return {k: v.to(sample_z.device) for k, v in batch["document"].items()}
Example #18
    def forward_on_instances(
            self, instances: List[Instance],
            cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of  :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.autograd.Variables`` or ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.
        """
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = dataset.as_tensor_dict(cuda_device=cuda_device,
                                             for_training=False)
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [
            {} for _ in dataset.instances
        ]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.autograd.Variable):
                output = output.data.cpu().numpy()
            outputs[name] = output
            for instance_output, batch_element in zip(
                    instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
    def test_forward_pass_runs_correctly(self):
        """
        Check to make sure a forward pass on an ensemble of two identical copies of a model yields the same
        results as the model itself.
        """
        bidaf_ensemble = BidafEnsemble([self.model, self.model])

        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        training_tensors = batch.as_tensor_dict()

        bidaf_output_dict = self.model(**training_tensors)
        ensemble_output_dict = bidaf_ensemble(**training_tensors)

        metrics = self.model.get_metrics(reset=True)

        # We've set up the data such that there's a fake answer that consists of the whole
        # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
        # zero, while if we somehow haven't been able to load the evaluation data, or there was an
        # error with using the evaluation script, this will fail.  This makes sure that we've
        # loaded the evaluation data correctly and have hooked things up to the official evaluation
        # script.
        assert metrics['f1'] > 0
        assert torch.equal(ensemble_output_dict['best_span'], bidaf_output_dict['best_span'])
        assert ensemble_output_dict['best_span_str'] == bidaf_output_dict['best_span_str']
Example #20
    def collate_fn(data):
        if isinstance(data[0], dict):
            for index, i in enumerate(data):
                if "image_feat_variable" in i:
                    i["image_feat_variable"] = ArrayTensorField(
                        i["image_feat_variable"])
                    i["image_dim_variable"] = IntArrayTensorField(
                        i["image_dim_variable"])
                    i["visual_embeddings_type"] = IntArrayTensorField(
                        i["visual_embeddings_type"])

                i["bert_input_ids"] = IntArrayTensorField(i["bert_input_ids"])
                i["bert_input_mask"] = IntArrayTensorField(
                    i["bert_input_mask"])
                i["bert_input_type_ids"] = IntArrayTensorField(
                    i["bert_input_type_ids"])

                if "masked_lm_labels" in i:
                    i["masked_lm_labels"] = IntArrayTensorField(
                        i["masked_lm_labels"], padding_value=-1)
                if "is_random_next" in i:
                    i["is_random_next"] = IntArrayTensorField(
                        i["is_random_next"])
                i['label'] = IntArrayTensorField(i['label'])

                data[index] = Instance(i)
        batch = Batch(data)
        td = batch.as_tensor_dict()
        td["label"] = td["label"].squeeze(-1)
        return td
Example #21
def elmo(ll):
    # Assumes `w` (key -> list of sentences), `re` (output dict), `options_filw`
    # and `weight_file` exist in an enclosing scope; `count` would additionally
    # need a `global`/`nonlocal` declaration for the increment below to work.
    for k in ll:
        sen_list = w[k]
        count += 1
        sen_s = []
        for s in sen_list:
            sen_s.append(s.split())
        elmo = Elmo(options_filw, weight_file, 1)
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        for sen in sen_s:
            tokens = [Token(token) for token in sen]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)
        dataset = Batch(instances)
        voca = Vocabulary()
        dataset.index_instances(voca)

        # Pad every sentence to 15 tokens when converting to tensors.
        dic = {'elmo': {'num_tokens': 15}}
        character_ids = dataset.as_tensor_dict(dic)['elmo']['character_ids']
        sth = elmo(character_ids)['elmo_representations'][0]
        # Split the batch back into one tensor per sentence.
        re[k] = list(torch.chunk(sth, sth.shape[0], 0))
Example #22
def predict(instances: List[Instance]) -> List[float]:
    """Output BERT NSP next sentence probability for a list of instances.

    Parameters
    ----------
    instances : List[Instance]

    Returns
    -------
    List[float]
        BERT NSP scores in range [0, 1].
    """
    scores = []
    for batch_instance in tqdm(batch(instances, batch_size=args.batch_size),
                               total=math.ceil(
                                   len(instances) / args.batch_size),
                               desc='Predicting'):
        batch_ins = Batch(batch_instance)
        batch_ins.index_instances(VOCAB)
        tensor_dict = batch_ins.as_tensor_dict(batch_ins.get_padding_lengths())
        tokens = tensor_dict["tokens"]
        input_ids = tokens['bert'].to(torch.device(f'cuda:{GPU_ID}'))
        token_type_ids = tokens['bert-type-ids'].to(
            torch.device(f'cuda:{GPU_ID}'))
        input_mask = (input_ids != 0).long()
        cls_out = BERT_NEXT_SENTENCE.forward(input_ids=input_ids,
                                             token_type_ids=token_type_ids,
                                             attention_mask=input_mask)
        probs = F.softmax(cls_out, dim=-1)
        next_sentence_score = probs[:, 0].detach().cpu().numpy().tolist()
        scores += next_sentence_score

    return scores
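The `batch` helper used above is not shown in this example; a minimal chunking implementation might look like the following (an assumption, not the original code).

# Hedged sketch of a simple chunking helper compatible with the call above.
from typing import Iterable, List, TypeVar

T = TypeVar("T")

def batch(items: List[T], batch_size: int) -> Iterable[List[T]]:
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]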
Example #23
def instances_to_batch(instances, model, for_training, cuda_device=0):
    batch = Batch(instances)
    batch.index_instances(model.vocab)
    padding_lengths = batch.get_padding_lengths()
    return batch.as_tensor_dict(padding_lengths,
                                cuda_device=cuda_device,
                                for_training=for_training)
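A short hedged usage sketch for instances_to_batch; `model` and `instances` are assumed to exist already.

# Sketch: build a padded tensor batch on the CPU and run a forward pass.
tensor_dict = instances_to_batch(instances, model, for_training=False, cuda_device=-1)
output_dict = model(**tensor_dict)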
    def test_forward_pass_runs_correctly(self):
        u"""
        Check to make sure a forward pass on an ensemble of two identical copies of a model yields the same
        results as the model itself.
        """
        bidaf_ensemble = BidafEnsemble([self.model, self.model])

        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        training_tensors = batch.as_tensor_dict()

        bidaf_output_dict = self.model(**training_tensors)
        ensemble_output_dict = bidaf_ensemble(**training_tensors)

        metrics = self.model.get_metrics(reset=True)

        # We've set up the data such that there's a fake answer that consists of the whole
        # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
        # zero, while if we somehow haven't been able to load the evaluation data, or there was an
        # error with using the evaluation script, this will fail.  This makes sure that we've
        # loaded the evaluation data correctly and have hooked things up to the official evaluation
        # script.
        assert metrics[u'f1'] > 0
        assert torch.equal(ensemble_output_dict[u'best_span'],
                           bidaf_output_dict[u'best_span'])
        assert ensemble_output_dict[u'best_span_str'] == bidaf_output_dict[
            u'best_span_str']
    def test_forward_pass_runs_correctly(self):
        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        training_tensors = batch.as_tensor_dict()
        output_dict = self.model(**training_tensors)

        metrics = self.model.get_metrics(reset=True)
        # We've set up the data such that there's a fake answer that consists of the whole
        # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
        # zero, while if we somehow haven't been able to load the evaluation data, or there was an
        # error with using the evaluation script, this will fail.  This makes sure that we've
        # loaded the evaluation data correctly and have hooked things up to the official evaluation
        # script.
        assert metrics["per_instance_f1"] > 0

        span_start_probs = output_dict["span_start_probs"][0].data.numpy()
        span_end_probs = output_dict["span_end_probs"][0].data.numpy()
        assert_almost_equal(numpy.sum(span_start_probs, -1), 1, decimal=6)
        assert_almost_equal(numpy.sum(span_end_probs, -1), 1, decimal=6)
        span_start, span_end = tuple(output_dict["best_span"][0].data.numpy())
        assert span_start >= 0
        assert span_start <= span_end
        assert span_end < self.instances[0].fields[
            "question_with_context"].sequence_length()
        assert isinstance(output_dict["best_span_str"][0], str)
    def test_padding_for_equal_length_indices(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3     5     6   8      9    2   14   12
        sentence = "the quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [1, 2, 3, 4, 5, 6, 7, 8, 9]
        ]
Example #27
 def batch_to_ids(self, stories_tokenized: List[List[str]]):
     """
     Simple wrapper around _elmo_batch_to_ids
     :param batch: A list of tokenized sentences.
     :return: A tensor of padded character ids.
     """
     batch = Batch([
         Instance({
             'story':
             TextField([Token('@@bos@@')] + [Token(x) for x in story] +
                       [Token('@@eos@@')],
                       token_indexers={
                           'tokens':
                           SingleIdTokenIndexer(namespace='tokens',
                                                lowercase_tokens=True)
                       })
         }) for story in stories_tokenized
     ])
     batch.index_instances(self.vocab)
     # `async` is a reserved keyword in Python 3.7+; use non_blocking instead.
     tensor_dict = batch.as_tensor_dict(for_training=self.training)
     words = tensor_dict['story']['tokens'].cuda(non_blocking=True)
     return words
Example #28
def my_collate(batch, vocab):
    questions = Batch([x[0] for x in batch])
    questions.index_instances(vocab)
    rest = [x[1:] for x in batch]
    question_batch = questions.as_tensor_dict()["question"]["tokens"]
    image_batch, answer_batch = default_collate(rest)
    return [(question_batch, image_batch), answer_batch]
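A hedged sketch of wiring my_collate into a PyTorch DataLoader; `vqa_dataset` and `vocab` are assumptions, and each dataset item is expected to be (question Instance, image tensor, answer).

# Sketch only; the dataset and vocab come from elsewhere.
from functools import partial
from torch.utils.data import DataLoader

loader = DataLoader(vqa_dataset, batch_size=32, shuffle=True,
                    collate_fn=partial(my_collate, vocab=vocab))
for (question_batch, image_batch), answer_batch in loader:
    pass  # feed the batch to a model here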
Example #29
    def test_read(self, lazy):
        reader = GLUESST2DatasetReader(
            tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
            token_indexers={'bert': PretrainedBertIndexer(
                pretrained_model=self.BERT_VOCAB_PATH)},
            skip_label_indexing=False
        )
        instances = reader.read(
            str(self.FIXTURES_ROOT / 'dev.tsv'))
        instances = ensure_list(instances)
        example = instances[0]
        tokens = [t.text for t in example.fields['tokens']]
        label = example.fields['label'].label
        print(label)
        print(tokens)
        batch = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        batch.index_instances(vocab)
        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        print(tokens['mask'].tolist()[0])
        print(tokens["bert"].tolist()[0])
        print([vocab.get_token_from_index(i, "bert")
               for i in tokens["bert"].tolist()[0]])
        print(len(tokens['bert'][0]))
        print(tokens["bert-offsets"].tolist()[0])
        print(tokens['bert-type-ids'].tolist()[0])
Example #30
    def _strings_to_batch(self, source_tokens: List[List[str]],
                          target_tokens: Dict[str, torch.Tensor],
                          target_golden: Dict[str, torch.Tensor],
                          lang_pair: str):
        """
        Converts list of sentences which are itself lists of strings into Batch
        suitable for passing into model's forward function.

        TODO: Make sure the right device (CPU/GPU) is used. Predicted tokens might get copied on
        CPU in `self.decode` method...
        """
        # convert source tokens into source tensor_dict
        instances = []
        lang_pairs = []
        for sentence in source_tokens:
            sentence = " ".join(sentence)
            instances.append(self._reader.string_to_instance(sentence))
            lang_pairs.append(lang_pair)

        source_batch = Batch(instances)
        source_batch.index_instances(self.vocab)
        source_batch = source_batch.as_tensor_dict()
        model_input = {
            "source_tokens": source_batch["tokens"],
            "target_golden": target_golden,
            "target_tokens": target_tokens,
            "lang_pair": lang_pairs
        }

        return model_input
Example #31
    def forward_on_instances(self,
                             instances: List[Instance],
                             cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of  :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.autograd.Variables`` or ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.
        """
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.autograd.Variable):
                output = output.data.cpu().numpy()
            outputs[name] = output
            for instance_output, batch_element in zip(instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
Example #32
    def test_sliding_window_with_batch(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path), truncate_long_sequences=False, max_pieces=8)

        config_path = self.FIXTURES_ROOT / 'bert' / 'config.json'
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})})

        batch = Batch([instance, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert bert_vectors is not None
    def test_padding_for_equal_length_indices(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3     5     6   8      9    2   14   12
        sentence = "the quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [2, 3, 5, 6, 8, 9, 2, 14, 12]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [0, 1, 2, 3, 4, 5, 6, 7, 8]
        ]
Example #34
    def forward_on_instances(
            self, instances: List[Instance],
            cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
        model_input = {}
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
        #input
        #model_input.update({'instances':instances})
        model_input.update({'predict': True})
        # del model_input["source_tokens_raw"]
        # del model_input["source_tokens"]
        # del model_input["instances"]
        outputs = self.decode(self(**model_input))
        #print(outputs)

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [
            {} for _ in dataset.instances
        ]
        for name, output in list(outputs.items()):
            outputs[name] = output
            for instance_output, batch_element in zip(
                    instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
Example #35
def batch_to_ids(batch):
    u"""
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens,
                          {u'character_ids': indexer})
        instance = Instance({u"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()[u'elmo'][u'character_ids']
Example #36
    def ensure_batch_predictions_are_consistent(
            self,
            keys_to_ignore: Iterable[str] = ()):
        """
        Ensures that the model performs the same on a batch of instances as on individual instances.
        Ignores metrics matching the regexp .*loss.* and those specified explicitly.

        Parameters
        ----------
        keys_to_ignore : ``Iterable[str]``, optional (default=())
            Names of metrics that should not be taken into account, e.g. "batch_weight".
        """
        self.model.eval()
        single_predictions = []
        for i, instance in enumerate(self.instances):
            dataset = Batch([instance])
            tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
            result = self.model(**tensors)
            single_predictions.append(result)
        full_dataset = Batch(self.instances)
        batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
        batch_predictions = self.model(**batch_tensors)
        for i, instance_predictions in enumerate(single_predictions):
            for key, single_predicted in instance_predictions.items():
                tolerance = 1e-6
                if 'loss' in key:
                    # Loss is particularly unstable; we'll just be satisfied if everything else is
                    # close.
                    continue
                if key in keys_to_ignore:
                    continue
                single_predicted = single_predicted[0]
                batch_predicted = batch_predictions[key][i]
                if isinstance(single_predicted, torch.Tensor):
                    if single_predicted.size() != batch_predicted.size():
                        slices = tuple(slice(0, size) for size in single_predicted.size())
                        batch_predicted = batch_predicted[slices]
                    assert_allclose(single_predicted.data.numpy(),
                                    batch_predicted.data.numpy(),
                                    atol=tolerance,
                                    err_msg=key)
                else:
                    assert single_predicted == batch_predicted, key
Example #37
    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))
 def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
     dataset = Batch(self.instances)
     dataset.index_instances(self.vocab)
     training_tensors = dataset.as_tensor_dict()
     output_dict = self.model(**training_tensors)
     tags = output_dict['tags']
     assert len(tags) == 2
     assert len(tags[0]) == 7
     assert len(tags[1]) == 7
     for example_tags in tags:
         for tag_id in example_tags:
             tag = self.model.vocab.get_token_from_index(tag_id, namespace="labels")
             assert tag in {'O', 'I-ORG', 'I-PER', 'I-LOC'}
    def test_squad_with_unwordpieceable_passage(self):
        # pylint: disable=line-too-long
        tokenizer = WordTokenizer()

        token_indexer = PretrainedBertIndexer("bert-base-uncased")

        passage1 = ("There were four major HDTV systems tested by SMPTE in the late 1970s, "
                    "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:")
        question1 = "Who released A Study of High Definition Television Systems?"

        passage2 = ("Broca, being what today would be called a neurosurgeon, "
                    "had taken an interest in the pathology of speech. He wanted "
                    "to localize the difference between man and the other animals, "
                    "which appeared to reside in speech. He discovered the speech "
                    "center of the human brain, today called Broca's area after him. "
                    "His interest was mainly in Biological anthropology, but a German "
                    "philosopher specializing in psychology, Theodor Waitz, took up the "
                    "theme of general and social anthropology in his six-volume work, "
                    "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
                    """soon translated as "The Anthropology of Primitive Peoples". """
                    "The last two volumes were published posthumously.")
        question2 = "What did Broca discover in the human brain?"

        from allennlp.data.dataset_readers.reading_comprehension.util import make_reading_comprehension_instance

        instance1 = make_reading_comprehension_instance(tokenizer.tokenize(question1),
                                                        tokenizer.tokenize(passage1),
                                                        {"bert": token_indexer},
                                                        passage1)

        instance2 = make_reading_comprehension_instance(tokenizer.tokenize(question2),
                                                        tokenizer.tokenize(passage2),
                                                        {"bert": token_indexer},
                                                        passage2)

        vocab = Vocabulary()

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        qtokens = tensor_dict["question"]
        ptokens = tensor_dict["passage"]

        config = BertConfig(len(token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
        _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
Example #40
    def forward_on_instances(self,
                             instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of  :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.

        Parameters
        ----------
        instances : List[Instance], required
            The instances to run the model on.

        Returns
        -------
        A list of the model's outputs, one per instance.
        """
        batch_size = len(instances)
        with torch.no_grad():
            cuda_device = self._get_prediction_device()
            dataset = Batch(instances)
            dataset.index_instances(self.vocab)
            model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
            outputs = self.decode(self(**model_input))

            instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
            for name, output in list(outputs.items()):
                if isinstance(output, torch.Tensor):
                    # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
                    # This occurs with batch size 1, because we still want to include the loss in that case.
                    if output.dim() == 0:
                        output = output.unsqueeze(0)

                    if output.size(0) != batch_size:
                        self._maybe_warn_for_unseparable_batches(name)
                        continue
                    output = output.detach().cpu().numpy()
                elif len(output) != batch_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue
                outputs[name] = output
                for instance_output, batch_element in zip(instance_separated_output, output):
                    instance_output[name] = batch_element
            return instance_separated_output
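A short hedged usage sketch for the method above; `model` and `reader` stand in for a trained Model and a compatible DatasetReader.

# Sketch only; the text_to_instance signature depends on the concrete reader.
instances = [reader.text_to_instance(text)
             for text in ["first example sentence", "second example sentence"]]
outputs = model.forward_on_instances(instances)
# One dict of numpy arrays per instance.
print(sorted(outputs[0].keys()))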
Example #41
    def _sentences_to_ids(self, sentences):
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        return dataset.as_tensor_dict()['elmo']['character_ids']
    def test_end_to_end(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        instance1 = Instance({"tokens": TextField(tokens1, {"bert": self.token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens2, {"bert": self.token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 0],
                [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [0, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                [0, 1, 2, 3, 4, 5, 6, 9, 10, 11]
        ]

        # No offsets, should get 12 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 12, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]

        ## Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 12, 12]

        bert_vectors = tlo_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]
Example #43
    def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens,
                              {'character_ids': indexer,
                               'tokens': indexer2})
            instance = Instance({"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()["elmo"]
    def test_max_length(self):
        config = BertConfig(len(self.token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        sentence = "the " * 1000
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        embedder(tokens["bert"], tokens["bert-offsets"])
Example #45
    def test_forward_pass_runs_correctly(self):
        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        training_tensors = batch.as_tensor_dict()
        output_dict = self.model(**training_tensors)

        metrics = self.model.get_metrics(reset=True)
        # We've set up the data such that there's a fake answer that consists of the whole
        # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
        # zero, while if we somehow haven't been able to load the evaluation data, or there was an
        # error with using the evaluation script, this will fail.  This makes sure that we've
        # loaded the evaluation data correctly and have hooked things up to the official evaluation
        # script.
        assert metrics['f1'] > 0

        span_start_probs = output_dict['span_start_probs'][0].data.numpy()
        span_end_probs = output_dict['span_end_probs'][0].data.numpy()
        assert_almost_equal(numpy.sum(span_start_probs, -1), 1, decimal=6)
        assert_almost_equal(numpy.sum(span_end_probs, -1), 1, decimal=6)
        span_start, span_end = tuple(output_dict['best_span'][0].data.numpy())
        assert span_start >= 0
        assert span_start <= span_end
        assert span_end < self.instances[0].fields['passage'].sequence_length()
        assert isinstance(output_dict['best_span_str'][0], str)
                                               char_spans = char_spans)
    
    print ("Keys instance: ", instance.fields.keys())
    
    # Batch instances and convert them to indices using the vocabulary.
    instances = [instance]
else:

    instances = [train_dataset[0], train_dataset[1]]

## Create the batch ready to be used
dataset = Batch(instances)
dataset.index_instances(vocab)

print ("-------------- DATASET EXAMPLE ---------------")
tensor_dict = dataset.as_tensor_dict()
character_ids_passage = tensor_dict['passage']['character_ids']
character_ids_question = tensor_dict['question']['character_ids']

question = tensor_dict['question']
passage = tensor_dict['passage']
span_start = tensor_dict['span_start']
span_end = tensor_dict['span_end']
metadata = tensor_dict['metadata']

print ("Shape of characters ids passage: ", character_ids_passage.shape)
print ("Shape of characters ids question: ", character_ids_question.shape)

print ("Batch size: ", character_ids_passage.shape[0])
print ("Maximum num words in batch: ", character_ids_passage.shape[1])
print ("Maximum word length in dictionary: ", character_ids_passage.shape[2])
 def _get_training_tensors(self):
     dataset = Batch(self.instances)
     dataset.index_instances(self.vocab)
     return dataset.as_tensor_dict()
    if (create_video_training):
        pf.create_image_weights_epoch(model, video_fotograms_folder2, i)
        pf.create_Bayesian_analysis_charts_simplified(model, train_dataset, validation_dataset,
                                            tr_data_loss, val_data_loss, KL_loss,
                                            video_fotograms_folder4, i+1)

#            output = model(tensor_dict["text_field"], tensor_dict["tags_field"])
#            loss = output["loss"]  # We can get the loss because we gave the labels as input,
#                                   # gradient and everything.
"""
############## Use the trained model ######################
We use an already-implemented predictor that wraps the model and knows how to preprocess the data.
"""

name_exmaple = "Eat my motherfucking jeans"
name_exmaple = "Carlos Sanchez"
tokens_list = [name_exmaple[i] for i in range(len(name_exmaple))]
Instance_test = reader.generate_instance(tokens_list, None)
batch = Batch([Instance_test])
batch.index_instances(vocab)

padding_lengths = batch.get_padding_lengths()
tensor_dict = batch.as_tensor_dict(padding_lengths)

model.eval()
tag_logits = model(tensor_dict["text_field"])['tag_logits'].detach().cpu().numpy()
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'tags_country') for i in tag_ids])
    ############  Propagate an instance text #############
    """
    instance = dataset_reader.text_to_instance("What kind of test succeeded on its first attempt?", 
                                               "One time I was writing a unit test, and it succeeded on the first attempt.", 
                                               char_spans=[(6, 10)])
    
    print ("Keys instance: ", instance.fields.keys())
    
    # Batch instances and convert them to indices using the vocabulary.
    instances = [instance]
    dataset = Batch(instances)
    dataset.index_instances(model.vocab)
    
    # Create the index tensor from the vocabulary.
    cuda_device = model._get_prediction_device()
    model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
    
    # Propagate the sample and obtain the loss (since we passed labels)
    outputs = model(**model_input)
    outputs["loss"].requires_grad










Example #50
## Create an empty vocabulary! We do not need to build one from a dataset;
# the ELMo character indexer handles indexing on its own.
vocab = Vocabulary()

## Index the instances in the batch; this will be used later by ELMo.
dataset.index_instances(vocab)

"""
IMPORTANT: ELMo only takes character ids at its interface.
It computes everything else internally!

ELMo words are padded to a character length of 50!
"""

character_ids = dataset.as_tensor_dict()['elmo']['character_ids']
print ("Shape of characters ids: ", character_ids.shape)
print ("Batch size: ", character_ids.shape[0])
print ("Maximum num words in batch: ", character_ids.shape[1])
print ("Maximum word length in dictionary: ", character_ids.shape[2])
#character_ids = batch_to_ids(sentences)

"""
Compute the ELMo embeddings from the character ids.
"""
embeddings = elmo(character_ids)

layer_1_values = embeddings["elmo_representations"][0]
layer_2_values = embeddings["elmo_representations"][1]
print ("Layer 1 representations: ", layer_1_values.shape)
print ("Layer 2 representations: ", layer_2_values.shape)