def test_load_groups(self):
     # load_groups() on a corpus built from the fake data should return
     # exactly two groups, named 'g1' and 'g2' in that order.
     fake = fake_data.FakeData()
     built = data.Corpus.from_data(fake)
     loaded = built.load_groups()
     self.assertEqual(2, len(loaded))
     group_names = [group.name for group in loaded]
     self.assertEqual(['g1', 'g2'], group_names)
 def setUp(self):
     # Fixture: a corpus built from the fake data, plus a Dataset and a
     # Collate wired to that corpus; torch's RNG is seeded last.
     # NOTE(review): another fixture in this file calls create_folders
     # before from_data; confirm the order used here is intended.
     raw = fake_data.FakeData()
     self.raw_data = raw
     corpus = data.Corpus.from_data(raw)
     self.corpus = corpus
     corpus.create_folders(raw)
     # 'dev' presumably names the data split; the meaning of the integer
     # arguments is not visible here -- TODO confirm against bern_struc.
     self.dataset = bern_struc.Dataset(corpus, 'dev', 2)
     vocab_probs = corpus.vocab.probs()
     self.collate = bern_struc.Collate(2, vocab_probs, 2)
     torch.random.manual_seed(42)
 def test_load(self):
     # Round trip: save a corpus built from the fake data, load it back
     # under the name 'test', and check the restored settings.
     # The expected values are presumably the FakeData defaults -- confirm.
     fake = fake_data.FakeData()
     data.Corpus.from_data(fake).save()
     restored = data.Corpus.load('test')
     self.assertEqual('test', restored.name)
     self.assertEqual(1, restored.min_tok_count)
     self.assertEqual(24, restored.n_vocab)
     self.assertEqual(0.5, restored.subsample_threshold)
 def test_save(self):
     # save() should leave a JSON file at corpus.file_path(corpus.name)
     # whose fields carry the corpus settings checked below.
     fake = fake_data.FakeData()
     built = data.Corpus.from_data(fake)
     built.save()
     saved_path = built.file_path(built.name)
     with open(saved_path) as handle:
         saved = json.load(handle)
     self.assertEqual('test', saved['name'])
     self.assertEqual(1, saved['min_tok_count'])
     self.assertEqual(24, saved['n_vocab'])
     self.assertEqual(0.5, saved['subsample_threshold'])
 def test_parse_groups_and_docs(self):
     # Parse the fake data with a freshly created vocab, then load groups
     # 'g1' and 'g2' back and check their document and token counts.
     # numpy's RNG is seeded first; presumably parsing draws on it, which
     # would make the per-split token counts depend on the seed -- confirm.
     np.random.seed(42)
     fake = fake_data.FakeData()
     fake_vocab = data.Corpus.create_vocab(fake)
     data.Corpus.parse_groups_and_docs(fake, fake_vocab)
     # TODO: this really just tests it ran without error, details skipped
     first = data.Group.load('test', 'g1')
     second = data.Group.load('test', 'g2')
     self.assertEqual(3, first.n_docs)
     self.assertEqual(3, second.n_docs)
     first_tokens = {'train': 9, 'dev': 9, 'test': 8}
     second_tokens = {'train': 9, 'dev': 7, 'test': 5}
     self.assertEqual(first_tokens, first.n_tokens)
     self.assertEqual(second_tokens, second.n_tokens)
 def test_create_vocab(self):
     # create_vocab() on the fake data should produce exactly these
     # per-token counts (rows below are grouped by count).
     expected_counts = {
         'c': 1, 'd': 1,
         'e': 2, 'f': 2,
         'g': 3, 'h': 3, 'i': 3, 'j': 3, 'k': 3,
     }
     fake = fake_data.FakeData()
     built_vocab = data.Corpus.create_vocab(fake)
     self.assertEqual(expected_counts, built_vocab.counts)
 def setUp(self):
     # Fixture: fresh fake raw data, with the corpus folders created for it
     # before each test runs.
     raw = fake_data.FakeData()
     self.raw_data = raw
     data.Corpus.create_folders(raw)
 def test_create_doc_dict(self):
     # create_doc_dict() should persist a DocDict, loadable by the corpus
     # name, whose entities are the six document ids in this order.
     fake = fake_data.FakeData()
     data.Corpus.create_doc_dict(fake)
     loaded = data.DocDict.load(fake.corpus_name)
     doc_ids = ['g1.1', 'g1.2', 'g1.3', 'g2.1', 'g2.2', 'g2.3']
     self.assertEqual(doc_ids, loaded.entities)
 def test_create_group_dict(self):
     # create_group_dict() should persist a GroupDict, loadable by the
     # corpus name, whose entities are 'g1' and 'g2' in this order.
     fake = fake_data.FakeData()
     data.Corpus.create_group_dict(fake)
     loaded = data.GroupDict.load(fake.corpus_name)
     group_ids = ['g1', 'g2']
     self.assertEqual(group_ids, loaded.entities)
 def test_from_data(self):
     # Building a corpus from the fake data should give it two groups.
     # TODO: this also mostly tests absence of exceptions
     fake = fake_data.FakeData()
     built = data.Corpus.from_data(fake)
     n_groups = len(built.groups)
     self.assertEqual(2, n_groups)
 def setUp(self):
     # Fixture: fake raw data built with min_tok_count=1 and n_vocab=10,
     # its corpus folders created, and both numpy's and torch's RNGs seeded
     # with the same value so each test starts from the same random state.
     seed = 42
     raw = fake_data.FakeData(min_tok_count=1, n_vocab=10)
     self.raw_data = raw
     data.Corpus.create_folders(raw)
     np.random.seed(seed)
     torch.random.manual_seed(seed)
 def setUp(self):
     # Fixture: fake raw data with its corpus folders created, then a
     # corpus built from that data.
     raw = fake_data.FakeData()
     self.raw_data = raw
     data.Corpus.create_folders(raw)
     # Seeded before from_data; presumably corpus construction draws on
     # numpy's RNG -- confirm.
     np.random.seed(42)
     self.corpus = data.Corpus.from_data(raw)