def test_load_groups(self):
    """``Corpus.load_groups`` returns two groups named 'g1' and 'g2', in that order."""
    corpus = data.Corpus.from_data(fake_data.FakeData())
    loaded = corpus.load_groups()

    self.assertEqual(2, len(loaded))
    names = [group.name for group in loaded]
    self.assertEqual(['g1', 'g2'], names)
def setUp(self):
    """Build the per-test fixtures and seed torch's RNG.

    Sets ``self.raw_data``, ``self.corpus``, ``self.dataset`` (the 'dev'
    ``bern_struc.Dataset``) and ``self.collate`` (a ``bern_struc.Collate``
    built from the corpus vocabulary probabilities).
    """
    raw = fake_data.FakeData()
    self.raw_data = raw

    corpus = data.Corpus.from_data(raw)
    self.corpus = corpus
    corpus.create_folders(raw)

    self.dataset = bern_struc.Dataset(corpus, 'dev', 2)
    vocab_probs = corpus.vocab.probs()
    self.collate = bern_struc.Collate(2, vocab_probs, 2)

    # Seed last, as the original does, so every test starts from the same
    # torch RNG state.
    torch.random.manual_seed(42)
def test_load(self):
    """A saved corpus reloaded via ``Corpus.load('test')`` has the expected attributes."""
    data.Corpus.from_data(fake_data.FakeData()).save()
    reloaded = data.Corpus.load('test')

    # (attribute name, expected value), checked in this order.
    expectations = (
        ('name', 'test'),
        ('min_tok_count', 1),
        ('n_vocab', 24),
        ('subsample_threshold', 0.5),
    )
    for attribute, wanted in expectations:
        self.assertEqual(wanted, getattr(reloaded, attribute))
def test_save(self):
    """After ``Corpus.save``, the JSON file at the corpus' file path holds its settings."""
    corpus = data.Corpus.from_data(fake_data.FakeData())
    corpus.save()

    saved_path = corpus.file_path(corpus.name)
    with open(saved_path) as handle:
        saved = json.load(handle)

    self.assertEqual('test', saved['name'])
    self.assertEqual(1, saved['min_tok_count'])
    self.assertEqual(24, saved['n_vocab'])
    self.assertEqual(0.5, saved['subsample_threshold'])
def test_parse_groups_and_docs(self):
    """``Corpus.parse_groups_and_docs`` leaves loadable groups with the expected counts.

    NumPy's RNG is seeded first; the expected per-split token counts
    presumably depend on that seed (assumption, not shown in this test).
    """
    np.random.seed(42)
    raw = fake_data.FakeData()
    data.Corpus.parse_groups_and_docs(raw, data.Corpus.create_vocab(raw))

    # TODO: this really just tests it ran without error, details skipped
    first, second = [data.Group.load('test', name) for name in ('g1', 'g2')]

    self.assertEqual(3, first.n_docs)
    self.assertEqual(3, second.n_docs)
    self.assertEqual({'train': 9, 'dev': 9, 'test': 8}, first.n_tokens)
    self.assertEqual({'train': 9, 'dev': 7, 'test': 5}, second.n_tokens)
def test_create_vocab(self):
    """``Corpus.create_vocab`` on the fake data yields the expected token counts."""
    vocab = data.Corpus.create_vocab(fake_data.FakeData())

    # Grouped by count for readability.
    expected_counts = {
        'c': 1, 'd': 1,
        'e': 2, 'f': 2,
        'g': 3, 'h': 3, 'i': 3, 'j': 3, 'k': 3,
    }
    self.assertEqual(expected_counts, vocab.counts)
def setUp(self):
    """Create fake raw data and let ``Corpus.create_folders`` prepare for it."""
    raw = fake_data.FakeData()
    self.raw_data = raw
    data.Corpus.create_folders(raw)
def test_create_doc_dict(self):
    """``Corpus.create_doc_dict`` produces a loadable ``DocDict`` listing all six docs."""
    raw = fake_data.FakeData()
    data.Corpus.create_doc_dict(raw)

    loaded = data.DocDict.load(raw.corpus_name)
    self.assertEqual(
        ['g1.1', 'g1.2', 'g1.3', 'g2.1', 'g2.2', 'g2.3'],
        loaded.entities,
    )
def test_create_group_dict(self):
    """``Corpus.create_group_dict`` produces a loadable ``GroupDict`` listing both groups."""
    raw = fake_data.FakeData()
    data.Corpus.create_group_dict(raw)

    loaded = data.GroupDict.load(raw.corpus_name)
    self.assertEqual(['g1', 'g2'], loaded.entities)
def test_from_data(self):
    """``Corpus.from_data`` runs on the fake data and yields a corpus with two groups."""
    corpus = data.Corpus.from_data(fake_data.FakeData())
    # TODO: this also mostly tests absence of exceptions
    group_count = len(corpus.groups)
    self.assertEqual(2, group_count)
def setUp(self):
    """Create fake data (``min_tok_count=1``, ``n_vocab=10``), its folders, and seed both RNGs."""
    raw = fake_data.FakeData(min_tok_count=1, n_vocab=10)
    self.raw_data = raw
    data.Corpus.create_folders(raw)

    # Same seed for NumPy and torch; NumPy is seeded first, as in the original.
    seed = 42
    np.random.seed(seed)
    torch.random.manual_seed(seed)
def setUp(self):
    """Create fake data and its folders, seed NumPy, then build ``self.corpus``."""
    raw = fake_data.FakeData()
    self.raw_data = raw
    data.Corpus.create_folders(raw)

    # Seed before building the corpus, matching the original statement order.
    np.random.seed(42)
    self.corpus = data.Corpus.from_data(raw)