Exemple #1
0
    def convert(self, data):
        """Build a DataSet from (premise, hypothesis, label) triples.

        :param data: An iterable of examples; each example unpacks into
            exactly three items: premise, hypothesis and label.
            Example::
                [
                    [ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ],
                    [ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ],
                    ...
                ]

        :return: A DataSet with the fields "premise", "hypothesis" and
            "truth", plus the derived "premise_len" and "hypothesis_len".
            The premise/hypothesis fields and their lengths are set as
            input; "truth" is set as target.
        """

        def premise_length(ins):
            return len(ins["premise"])

        def hypothesis_length(ins):
            return len(ins["hypothesis"])

        dataset = DataSet()

        # One Instance per example, fields added in a fixed order.
        for premise, hypothesis, label in data:
            sample = Instance()
            sample.add_field("premise", premise)
            sample.add_field("hypothesis", hypothesis)
            sample.add_field("truth", label)
            dataset.append(sample)

        # Derive the per-example sequence lengths as extra fields.
        dataset.apply(premise_length, new_field_name="premise_len")
        dataset.apply(hypothesis_length, new_field_name="hypothesis_len")

        dataset.set_input("premise", "hypothesis", "premise_len", "hypothesis_len")
        dataset.set_target("truth")
        return dataset
Exemple #2
0
def generate_fake_dataset(num_samples=1000):
    """
    Build a DataSet filled with random variable-length integer arrays.

    The DataSet has the fields '0', '1', '2' and '3'. Every field holds
    ``num_samples`` 1-D arrays whose values are drawn from [0, 100) and
    whose lengths are drawn from [10, 50). Each field is then randomly
    marked either as input or as target.

    :param num_samples: number of samples per field
    :return: the generated DataSet
    """

    length_low, length_high = 10, 50
    field_names = [str(idx) for idx in range(4)]

    fields = {}
    for name in field_names:
        # Draw all sample lengths for this field first, then the samples.
        sample_lengths = np.random.randint(length_low, length_high, size=num_samples)
        fields[name] = [np.random.randint(100, size=n) for n in sample_lengths]

    dataset = DataSet(fields)

    # Coin flip per field: 0 -> input, otherwise -> target.
    for name in field_names:
        mark = dataset.set_input if np.random.randint(2) == 0 else dataset.set_target
        mark(name)
    return dataset
Exemple #3
0
 def test_list_of_numpy_to_tensor(self):
     # Smoke test: batches Instances holding numpy arrays of two different
     # lengths with as_numpy=False and prints each batch; nothing is asserted.
     short_samples = [Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)]
     long_samples = [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)]
     ds = DataSet(short_samples + long_samples)
     ds.set_input("x")
     ds.set_target("y")
     batches = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for batch_x, batch_y in batches:
         print(batch_x, batch_y)
Exemple #4
0
 def test_numpy_padding(self):
     # Ragged rows (lengths 1..4) must come out of a batch of four as
     # (4, 4) numpy arrays.
     # dtype=object is required: building an array from ragged nested lists
     # without it raises ValueError on NumPy >= 1.24 (and only warned before).
     # With it, every NumPy version yields the same 1-D array of lists.
     ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10, dtype=object),
                   "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10, dtype=object)})
     ds.set_input("x")
     ds.set_target("y")
     batches = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
     for x, y in batches:
         self.assertEqual(x["x"].shape, (4, 4))
         self.assertEqual(y["y"].shape, (4, 4))
Exemple #5
0
 def convert(self, data):
     """Convert (words, tags) pairs into a DataSet.

     :param data: An iterable of items where ``item[0]`` is the word
         sequence of a sentence and ``item[1]`` is its tag sequence.
     :return: A DataSet with the fields "words", "tags" and "seq_len";
         "words" and "seq_len" are set as input, "tags" as target.
     """
     data_set = DataSet()
     for item in data:
         sent_words, sent_pos_tag = item[0], item[1]
         data_set.append(Instance(words=sent_words, tags=sent_pos_tag))
     # The sequence length is the number of words in the sentence, so
     # measure the "words" field rather than the Instance itself.
     data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
     data_set.set_target("tags")
     # The field is stored under "words"; "sent_words" is only a local
     # variable name, and set_input raises KeyError for unknown fields.
     data_set.set_input("words")
     data_set.set_input("seq_len")
     return data_set
Exemple #6
0
    def test_input_target(self):
        # set_input / set_target must flip the flags on the matching field
        # arrays, and set_input must reject unknown field names.
        columns = {"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}
        ds = DataSet(columns)
        ds.set_input("x")
        ds.set_target("y")
        self.assertTrue(ds.field_arrays["x"].is_input)
        self.assertTrue(ds.field_arrays["y"].is_target)

        # Neither name is a field of the DataSet.
        for unknown_name in ("xxx", "yyy"):
            with self.assertRaises(KeyError):
                ds.set_input(unknown_name)
Exemple #7
0
 def test_dataset_batching(self):
     # 40 identical samples, batched four at a time as numpy arrays.
     columns = {"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}
     ds = DataSet(columns)
     ds.set_input("x")
     ds.set_target("y")
     batches = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
     for batch_x, batch_y in batches:
         inputs, targets = batch_x["x"], batch_y["y"]
         self.assertTrue(isinstance(inputs, np.ndarray) and isinstance(targets, np.ndarray))
         # Every batch is full and its last row equals the original sample.
         self.assertEqual(len(inputs), 4)
         self.assertEqual(len(targets), 4)
         self.assertListEqual(list(inputs[-1]), [1, 2, 3, 4])
         self.assertListEqual(list(targets[-1]), [5, 6])
Exemple #8
0
 def test_numpy_to_tensor(self):
     # Ragged rows (lengths 1..4) must come out of a batch of four as
     # (4, 4) torch tensors when as_numpy=False.
     # dtype=object is required: building an array from ragged nested lists
     # without it raises ValueError on NumPy >= 1.24 (and only warned before).
     # With it, every NumPy version yields the same 1-D array of lists.
     ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10, dtype=object),
                   "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10, dtype=object)})
     ds.set_input("x")
     ds.set_target("y")
     batches = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for x, y in batches:
         self.assertTrue(isinstance(x["x"], torch.Tensor))
         self.assertEqual(tuple(x["x"].shape), (4, 4))
         self.assertTrue(isinstance(y["y"], torch.Tensor))
         self.assertEqual(tuple(y["y"].shape), (4, 4))
Exemple #9
0
 def test_list_of_list_to_tensor(self):
     # Plain Python lists of two different lengths must be batched into
     # (4, 4) torch tensors when as_numpy=False.
     short_samples = [Instance(x=[1, 2], y=[3, 4]) for _ in range(2)]
     long_samples = [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)]
     ds = DataSet(short_samples + long_samples)
     ds.set_input("x")
     ds.set_target("y")
     batches = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for batch_x, batch_y in batches:
         inputs = batch_x["x"]
         self.assertTrue(isinstance(inputs, torch.Tensor))
         self.assertEqual(tuple(inputs.shape), (4, 4))
         targets = batch_y["y"]
         self.assertTrue(isinstance(targets, torch.Tensor))
         self.assertEqual(tuple(targets.shape), (4, 4))