Example #1
0
    def _nodes_to_batch(self, nodes):
        """
        Builds a single Batch out of the passed nodes.

        Each node contributes its context examples, which are numericalized
        field-by-field and then merged into one Batch.

        Parameters
        ----------
        nodes : list(Node)
            Nodes that should be contained in the batch

        Returns
        -------
        (Batch)
            a Batch instance containing numericalized Field data.
        """

        collected = defaultdict(list)

        for current_node in nodes:
            # every example that makes up this node's context
            context_examples = self._get_node_context(current_node)
            context_dataset = Dataset(context_examples, self._dataset.fields)
            node_batch = super()._create_batch(context_dataset)

            # accumulate each field's numericalized data per node
            for field_name in node_batch.keys():
                collected[field_name].append(getattr(node_batch, field_name))

        return Batch(collected)
Example #2
0
    def as_flat_dataset(self):
        """
        Converts this structure into a standard Dataset.

        The examples appear in the order produced by 'flatten'.

        Returns
        -------
        Dataset
            a standard Dataset
        """
        flat_examples = list(self.flatten())
        return Dataset(flat_examples, self.field_dict)
Example #3
0
def test_from_pandas_index(data):
    """from_pandas should expose the DataFrame index through index_field."""
    index_field = Field("numbers", tokenizer=None, keep_raw=True)
    text_fields = [Field("text", keep_raw=True, tokenizer="split")]
    df = pd.DataFrame([[row[0]] for row in data],
                      index=[row[1] for row in data])

    ds = Dataset.from_pandas(df, text_fields, index_field=index_field)

    for row, (raw, _) in zip(data, ds.numbers):
        assert row[1] == raw
Example #4
0
def test_from_pandas_field_dict(data):
    """Columns mapped through a field dict should be renamed accordingly."""
    df = pd.DataFrame(data, columns=["text", "number"])
    column_fields = {
        "text": Field("text_field", keep_raw=True, tokenizer="split"),
        "number": Field("number_field", tokenizer=None),
    }

    ds = Dataset.from_pandas(df, column_fields)
    assert set(ds.field_dict) == {"text_field", "number_field"}

    for row, (raw, _) in zip(data, ds.text_field):
        assert row[0] == raw
Example #5
0
def test_from_pandas_field_list(data):
    """Fields given as a list should map positionally onto the columns."""
    df = pd.DataFrame(data)
    column_fields = [
        Field("text", keep_raw=True, tokenizer="split"),
        Field("number", tokenizer=None),
    ]

    ds = Dataset.from_pandas(df, column_fields)

    for row, (raw, _) in zip(data, ds.text):
        assert row[0] == raw
Example #6
0
    def predict(self,
                dataset: Dataset,
                batch_size: int = 128,
                **kwargs) -> np.ndarray:
        """
        Computes the prediction of the model for every example in the provided
        dataset.

        Parameters
        ----------
        dataset : Dataset
            Dataset to compute predictions for.

        batch_size : int
            If None, predictions for the whole dataset will be done in a single batch.
            Else, predictions will be calculated in batches of batch_size size.
            This argument is useful in case the whole dataset can't be processed in a
            single batch.

        kwargs
            Keyword arguments passed to the model's `predict` method

        Returns
        -------
        ndarray
            Tensor containing predictions for examples in the passed Dataset.
        """
        # TODO: new method of providing examples must be defined.
        # examples is taken in dataset form as proof-of-concept.
        self._check_if_model_exists()

        prediction_key = AbstractSupervisedModel.PREDICTION_KEY

        if batch_size is None:
            # single pass over the whole dataset at once
            features = self.feature_transformer.transform(dataset.batch())
            whole_prediction = self.model.predict(features, **kwargs)
            return whole_prediction[prediction_key]

        # otherwise predict batch-by-batch and stitch the pieces together
        batch_iter = Iterator(batch_size=batch_size, shuffle=False)
        partial_predictions = []

        for batch in batch_iter(dataset):
            features = self.feature_transformer.transform(batch)
            batch_result = self.model.predict(features, **kwargs)
            partial_predictions.append(batch_result[prediction_key])

        return np.concatenate(partial_predictions)
Example #7
0
    def create_dataset():
        """Builds a Dataset from the TABULAR_TEXT / TABULAR_SOURCES pairs."""
        fields = (
            Field("text", numericalizer=Vocab()),
            Field("source", numericalizer=Vocab(), tokenizer=list),
        )
        factory = ExampleFactory(fields)

        # each (text, source) pair becomes one example
        examples = [
            factory.from_list(row)
            for row in zip(TABULAR_TEXT, TABULAR_SOURCES)
        ]

        return Dataset(examples, fields)
Example #8
0
def create_dataset(data, field_list):
    """Wraps each raw data item in a MockExample and returns a Dataset."""
    mock_examples = [MockExample(field_list, item) for item in data]
    return Dataset(mock_examples, field_list)