Code example #1
 def _create_batches(self, dataset: Dataset, shuffle: bool) -> List[List[Instance]]:
     instances = dataset.instances
     if shuffle:
         random.shuffle(instances)
     grouped_instances = group_by_count(instances, self._batch_size, None)
     # The last group might not have been full, so we check whether any of the
     # instances are None, which is how group_by_count pads incomplete batches.
     grouped_instances[-1] = [instance for instance in grouped_instances[-1] if instance is not None]
     return grouped_instances
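
To make the two steps above concrete, here is a small sketch with stand-in values, assuming group_by_count pads the last group with its third argument (as the comment says):

    # Sketch: 7 stand-in "instances" with a batch size of 3.
    grouped = group_by_count(list(range(7)), 3, None)
    # grouped == [[0, 1, 2], [3, 4, 5], [6, None, None]]
    grouped[-1] = [x for x in grouped[-1] if x is not None]
    # grouped == [[0, 1, 2], [3, 4, 5], [6]]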
Code example #2
 def _create_batches(self, instances: Iterable[Instance],
                     shuffle: bool) -> Iterable[Batch]:
     instances = ensure_list(instances)
     if shuffle:
         random.shuffle(instances)
     grouped_instances = group_by_count(instances, self._batch_size, None)
     # The last group might not have been full, so we check whether any of the
     # instances are None, which is how group_by_count pads incomplete batches.
     grouped_instances[-1] = [
         instance for instance in grouped_instances[-1]
         if instance is not None
     ]
     return (Batch(batch) for batch in grouped_instances)
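
Note that this version returns a generator expression, so each Batch is built lazily as the caller iterates. A tiny usage sketch (`iterator` and `instances` are hypothetical stand-ins):

    # Hypothetical usage; nothing is batched until the generator is consumed.
    batches = iterator._create_batches(instances, shuffle=False)  # no Batch built yet
    first_batch = next(iter(batches))                             # first Batch built here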
Code example #3
File: util_test.py Project: matasuke/allennlp
 def test_group_by_count(self):
     assert util.group_by_count([1, 2, 3, 4, 5, 6, 7], 3, 20) == [
         [1, 2, 3],
         [4, 5, 6],
         [7, 20, 20],
     ]
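
This test pins down the semantics of group_by_count: pad the input with the given default value, then slice it into fixed-size groups. A minimal sketch consistent with the test above (the actual allennlp.common.util implementation may be written differently):

    from typing import Any, List

    def group_by_count(iterable: List[Any], count: int, default_value: Any) -> List[List[Any]]:
        # Pad with default_value until the length is a multiple of count,
        # then slice into consecutive groups of count elements.
        padded = list(iterable) + [default_value] * (-len(iterable) % count)
        return [padded[i:i + count] for i in range(0, len(padded), count)]

    assert group_by_count([1, 2, 3, 4, 5, 6, 7], 3, 20) == [[1, 2, 3], [4, 5, 6], [7, 20, 20]]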
Code example #4
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        """
        Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
        and returns JSON that looks like

        .. code-block:: js

            [
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]},
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]}
            ]
        """
        # For SRL we have more instances than sentences, but the user specified a
        # batch size with respect to the number of sentences passed, so we respect
        # that here by using the number of sentences we were given as the batch size.
        batch_size = len(inputs)
        instances_per_sentence = [self._sentence_to_srl_instances(json) for json in inputs]

        flattened_instances = [instance for sentence_instances in instances_per_sentence
                               for instance in sentence_instances]

        if not flattened_instances:
            return sanitize([{"verbs": [], "words": self._tokenizer.split_words(x["sentence"])}
                             for x in inputs])

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size, None)
        batched_instances[-1] = [instance for instance in batched_instances[-1]
                                 if instance is not None]
        # Run the model on the batches.
        outputs = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(batch))

        verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
        return_dicts: List[JsonDict] = [{"verbs": []} for _ in inputs]

        output_index = 0
        for sentence_index, verb_count in enumerate(verbs_per_sentence):
            if verb_count == 0:
                # We didn't run any predictions for sentences with no verbs,
                # so we don't have a way to extract the original sentence.
                # Here we just tokenize the input again.
                original_text = self._tokenizer.split_words(inputs[sentence_index]["sentence"])
                return_dicts[sentence_index]["words"] = original_text
                continue

            for _ in range(verb_count):
                output = outputs[output_index]
                words = output["words"]
                tags = output["tags"]
                description = self.make_srl_string(words, tags)
                return_dicts[sentence_index]["words"] = words
                return_dicts[sentence_index]["verbs"].append({
                        "verb": output["verb"],
                        "description": description,
                        "tags": tags,
                })
                output_index += 1

        return sanitize(return_dicts)
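
The re-grouping step above (flatten the per-sentence instances, run the model over flat batches, then walk the flat outputs back into per-sentence groups using the verb counts) can be shown without a model. A minimal sketch, with integers standing in for instances and outputs:

    per_sentence = [[1, 2], [], [3]]        # instances per sentence (verb counts: 2, 0, 1)
    flat = [inst for group in per_sentence for inst in group]
    outputs = [inst * 10 for inst in flat]  # stand-in for forward_on_instances results

    regrouped, output_index = [], 0
    for group in per_sentence:
        regrouped.append(outputs[output_index:output_index + len(group)])
        output_index += len(group)
    assert regrouped == [[10, 20], [], [30]]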
Code example #5
File: test_util.py Project: pyknife/allennlp
 def test_group_by_count(self):
     assert util.group_by_count([1, 2, 3, 4, 5, 6, 7], 3, 20) == [[1, 2, 3], [4, 5, 6], [7, 20, 20]]
Code example #6
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        """
        Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
        and returns JSON that looks like

        .. code-block:: js

            [
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]},
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]}
            ]
        """
        # For SRL we have more instances than sentences, but the user specified a
        # batch size with respect to the number of sentences passed, so we respect
        # that here by using the number of sentences we were given as the batch size.
        batch_size = len(inputs)
        instances_per_sentence, return_dicts = zip(
            *[self._sentence_to_srl_instances(json) for json in inputs])

        flattened_instances = [
            instance for sentence_instances in instances_per_sentence
            for instance in sentence_instances
        ]

        if not flattened_instances:
            return sanitize(return_dicts)

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size, None)
        batched_instances[-1] = [
            instance for instance in batched_instances[-1]
            if instance is not None
        ]
        # Run the model on the batches.
        outputs = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(batch))

        # This counter walks the flat list of model outputs, one entry per verb.
        output_index = 0
        for results in return_dicts:
            # We just added the verbs to the list in _sentence_to_srl_instances
            # but we actually want to replace them with their frames, so we
            # reset them here.
            verbs_for_sentence: List[str] = results["verbs"]
            results["verbs"] = []
            # The verbs are in order, but nested as we have multiple sentences.
            # The outputs are already flattened from running through the model,
            # so we just index into this flat list for each verb, updating as we go.
            for verb in verbs_for_sentence:
                output = outputs[output_index]
                tags = output["tags"]
                description = self.make_srl_string(results["words"], tags)
                results["verbs"].append({
                    "verb": verb,
                    "description": description,
                    "tags": tags,
                })
                output_index += 1

        return sanitize(return_dicts)
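
Unlike examples #4 and #7, this variant assumes `_sentence_to_srl_instances` returns an (instances, return_dict) pair per sentence, and splits the pairs apart with the standard zip(*...) unzip idiom. A minimal sketch of that idiom with invented values:

    pairs = [([1, 2], {"words": ["I", "ate"], "verbs": ["ate"]}),
             ([],     {"words": ["Hi"],       "verbs": []})]
    instances_per_sentence, return_dicts = zip(*pairs)
    # instances_per_sentence == ([1, 2], [])
    # return_dicts           == ({"words": ["I", "ate"], ...}, {"words": ["Hi"], ...})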
Code example #7
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        """
        Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
        and returns JSON that looks like

        .. code-block:: js

            [
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]},
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]}
            ]
        """
        # For SRL we have more instances than sentences, but the user specified a
        # batch size with respect to the number of sentences passed, so we respect
        # that here by using the number of sentences we were given as the batch size.
        batch_size = len(inputs)
        instances_per_sentence = [
            self._sentence_to_srl_instances(json) for json in inputs
        ]

        flattened_instances = [
            instance for sentence_instances in instances_per_sentence
            for instance in sentence_instances
        ]

        if not flattened_instances:
            return sanitize([{
                "verbs": [],
                "words": self._tokenizer.split_words(x["sentence"]),
            } for x in inputs])

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size, None)
        batched_instances[-1] = [
            instance for instance in batched_instances[-1]
            if instance is not None
        ]
        # Run the model on the batches.
        outputs = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(batch))

        verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
        return_dicts: List[JsonDict] = [{"verbs": []} for _ in inputs]

        output_index = 0
        for sentence_index, verb_count in enumerate(verbs_per_sentence):
            if verb_count == 0:
                # We didn't run any predictions for sentences with no verbs,
                # so we don't have a way to extract the original sentence.
                # Here we just tokenize the input again.
                original_text = self._tokenizer.split_words(
                    inputs[sentence_index]["sentence"])
                return_dicts[sentence_index]["words"] = original_text
                continue

            for _ in range(verb_count):
                output = outputs[output_index]
                words = output["words"]
                tags = output["tags"]
                description = self.make_srl_string(words, tags)
                return_dicts[sentence_index]["words"] = words
                return_dicts[sentence_index]["verbs"].append({
                    "verb": output["verb"],
                    "description": description,
                    "tags": tags,
                })
                output_index += 1

        return sanitize(return_dicts)