def test_ng_style(self):
        # Verifies the partition sizes produced by dataset_parties.ng_style
        # for a 100-sample and a 50010-sample TrivialDataset.
        cases = (
            (100, (60, 20, 20)),
            (50010, (30010, 10000, 10000)),
        )
        for total, expected_sizes in cases:
            source = TrivialDataset(set_size=total)
            train, dev, test = dataset_parties.ng_style(source)
            # Sizes are checked in train, dev, test order.
            for part, expected in zip((train, dev, test), expected_sizes):
                self.assertEqual(len(part), expected)
    def test_ng_style(self):
        # Verifies partition sizes (and, for the small case, a few element
        # positions) of dataset_parties.ng_style on LimitedNatureDataset.
        source = LimitedNatureDataset(set_size=10)
        train, dev, test = dataset_parties.ng_style(source)
        for part, expected in zip((train, dev, test), (6, 2, 2)):
            self.assertEqual(len(part), expected)
        # Each triple is (partition, index in partition, index in source):
        # the partition item must equal the source item at that index.
        for part, part_idx, source_idx in (
                (train, 5, 5),
                (dev, 0, 6),
                (test, 1, 9)):
            self.assertEqual(part[part_idx], source[source_idx])

        source = LimitedNatureDataset(set_size=50003)
        train, dev, test = dataset_parties.ng_style(source)
        for part, expected in zip((train, dev, test), (30003, 10000, 10000)):
            self.assertEqual(len(part), expected)
 def test_with_a_dataset_containing_over_50000_samples(self):
     # Passes two ranges (284315 and 492 items) as a tuple to ng_style and
     # checks the sizes of the three resulting partitions.
     first_len, second_len = 284315, 492
     first_source = range(first_len)
     second_source = range(first_len, first_len + second_len)
     sources = (first_source, second_source)
     train, dev, test = dataset_parties.ng_style(sources)
     for part, expected in zip((train, dev, test), (264807, 10000, 10000)):
         self.assertEqual(len(part), expected)
    def test_resulting_sets_distinct(self):
        # Splits three disjoint ranges covering 0..19999 and checks that the
        # partitions account for all 20000 items and share no element.
        sources = (
            range(0, 5000),
            range(5000, 15000),
            range(15000, 20000),
        )

        partitions = dataset_parties.ng_style(sources)
        train_items, dev_items, test_items = tuple(
            set(part) for part in partitions)

        total = sum(len(part) for part in partitions)
        self.assertEqual(total, 20000)

        # Pairwise overlap must be empty for every pair of partitions.
        for left, right in (
                (train_items, dev_items),
                (dev_items, test_items),
                (train_items, test_items)):
            self.assertEqual(len(left & right), 0)
    def test_with_several_datasets(self):
        # Feeds three labelled lists (5000 'A', 15000 'B', 80000 'C') to
        # ng_style and checks how many of each label land in each partition.
        labels = ('A', 'B', 'C')
        sources = (['A'] * 5000, ['B'] * 15000, ['C'] * 80000)

        train, dev, test = dataset_parties.ng_style(sources)

        # Expected per-label counts, listed in the same order as `labels`.
        expectations = (
            (train, (4000, 12000, 64000)),
            (dev, (500, 1500, 8000)),
            (test, (500, 1500, 8000)),
        )
        for part, counts in expectations:
            for label, expected in zip(labels, counts):
                self.assertEqual(part.count(label), expected)
 def test_with_a_small_dataset(self):
     # A 10-sample LimitedNatureDataset is expected to split 6 / 2 / 2.
     source = LimitedNatureDataset(set_size=10)
     train, dev, test = dataset_parties.ng_style(source)
     for part, expected in zip((train, dev, test), (6, 2, 2)):
         self.assertEqual(len(part), expected)
 def test_with_a_dataset_containing_over_50000_samples(self):
     # A 50013-sample LimitedNatureDataset is expected to yield dev and test
     # partitions of 10000 each, with the remaining 30013 in train.
     source = LimitedNatureDataset(set_size=50013)
     train, dev, test = dataset_parties.ng_style(source)
     for part, expected in zip((train, dev, test), (30013, 10000, 10000)):
         self.assertEqual(len(part), expected)
Example #8
0
import dataset_parties
from datasets.sequences_dataset import SequencesDataset

# Instantiate the sequences dataset with its default constructor arguments.
dataset = SequencesDataset()

# ng_style's result is unpacked into exactly three module-level names,
# bound here as the train, dev and test sets.
train_set, dev_set, test_set = dataset_parties.ng_style(dataset)