def _nodes_to_batch(self, nodes):
    """
    Create a Batch from the passed nodes.

    Parameters
    ----------
    nodes : list(Node)
        Nodes that should be contained in the batch

    Returns
    -------
    Batch
        a Batch instance containing numericalized Field data.
    """
    collected = defaultdict(list)
    for current_node in nodes:
        # Gather every example forming this node's context and wrap them
        # in a temporary Dataset so the parent class can numericalize them.
        context_examples = self._get_node_context(current_node)
        context_dataset = Dataset(context_examples, self._dataset.fields)
        sub_batch = super()._create_batch(context_dataset)
        # Accumulate each field's tensor from the sub-batch per field name.
        for field_name in sub_batch.keys():
            collected[field_name].append(getattr(sub_batch, field_name))
    return Batch(collected)
def as_flat_dataset(self):
    """
    Return a standard Dataset containing the examples in the order
    defined by 'flatten'.

    Returns
    -------
    Dataset
        a standard Dataset
    """
    flattened_examples = list(self.flatten())
    return Dataset(flattened_examples, self.field_dict)
def test_from_pandas_index(data):
    """The DataFrame index should be captured through `index_field`."""
    df = pd.DataFrame(
        [[row[0]] for row in data],
        index=[row[1] for row in data],
    )
    fields = [Field("text", keep_raw=True, tokenizer="split")]
    index_field = Field("numbers", tokenizer=None, keep_raw=True)

    ds = Dataset.from_pandas(df, fields, index_field=index_field)

    for row, (raw, _) in zip(data, ds.numbers):
        assert row[1] == raw
def test_from_pandas_field_dict(data):
    """A column-name -> Field mapping should rename columns to field names."""
    df = pd.DataFrame(data, columns=["text", "number"])
    fields = {
        "text": Field("text_field", keep_raw=True, tokenizer="split"),
        "number": Field("number_field", tokenizer=None),
    }

    ds = Dataset.from_pandas(df, fields)

    # Set literal instead of set([...]) — same semantics, idiomatic (ruff C405).
    assert set(ds.field_dict) == {"text_field", "number_field"}

    for original, (raw, _) in zip(data, ds.text_field):
        assert original[0] == raw
def test_from_pandas_field_list(data):
    """A positional Field list should map onto DataFrame columns in order."""
    df = pd.DataFrame(data)
    fields = [
        Field("text", keep_raw=True, tokenizer="split"),
        Field("number", tokenizer=None),
    ]

    ds = Dataset.from_pandas(df, fields)

    for row, (raw, _) in zip(data, ds.text):
        assert row[0] == raw
def predict(self, dataset: Dataset, batch_size: int = 128, **kwargs) -> np.ndarray:
    """
    Computes the prediction of the model for every example in the provided
    dataset.

    Parameters
    ----------
    dataset : Dataset
        Dataset to compute predictions for.
    batch_size : int
        If None, predictions for the whole dataset will be done in a single
        batch. Else, predictions will be calculated in batches of batch_size
        size. This argument is useful in case the whole dataset can't be
        processed in a single batch.
    kwargs
        Keyword arguments passed to the model's `predict` method

    Returns
    -------
    ndarray
        Tensor containing predictions for examples in the passed Dataset.
    """
    # TODO: new method of providing examples must be defined.
    # examples is taken in dataset form as proof-of-concept.
    self._check_if_model_exists()
    prediction_key = AbstractSupervisedModel.PREDICTION_KEY

    if batch_size is None:
        # Single-batch path: transform and predict on the whole dataset.
        features = self.feature_transformer.transform(dataset.batch())
        return self.model.predict(features, **kwargs)[prediction_key]

    # Batched path: iterate the dataset in fixed-size, unshuffled batches
    # and concatenate the per-batch prediction tensors.
    batch_iterator = Iterator(batch_size=batch_size, shuffle=False)
    partial_predictions = []
    for batch in batch_iterator(dataset):
        features = self.feature_transformer.transform(batch)
        batch_result = self.model.predict(features, **kwargs)
        partial_predictions.append(batch_result[prediction_key])
    return np.concatenate(partial_predictions)
def create_dataset():
    """Build a small Dataset from the TABULAR_TEXT / TABULAR_SOURCES pairs."""
    fields = (
        Field("text", numericalizer=Vocab()),
        Field("source", numericalizer=Vocab(), tokenizer=list),
    )
    factory = ExampleFactory(fields)
    examples = [
        factory.from_list(row)
        for row in zip(TABULAR_TEXT, TABULAR_SOURCES)
    ]
    return Dataset(examples, fields)
def create_dataset(data, field_list):
    """Wrap each raw data item in a MockExample and return a Dataset."""
    mock_examples = [MockExample(field_list, item) for item in data]
    return Dataset(mock_examples, field_list)