def test_to_pandas(self):
    """Check the train data when it is split into target and text.

    With ``split_target=True`` the result is unpacked into two parts, and
    each part is expected to have length 1. The unsplit train data is then
    printed via ``head(1)`` for a visual check.
    """
    resource = Resource(DATA_ROOT, ["sentiment", "text"], "sentiment")
    target_part, text_part = resource.train_data(split_target=True)

    # Same checks, same order: target first, then text.
    for part in (target_part, text_part):
        self.assertEqual(len(part), 1)

    print(resource.train_data().head(1))
def test_to_indexed(self):
    """Check that an indexed resource writes a vocab file and indexes train data.

    The resource is converted with ``to_indexed()`` and a vocabulary is built
    with ``min_word_count=0``. The test then verifies that the vocab file
    exists, that the vocabulary has 4 entries, and that the train data has
    length 1.

    The generated vocab file is removed in a ``finally`` block so that a
    failing assertion does not leave the file behind for later test runs.
    """
    r = Resource(DATA_ROOT, ["sentiment", "text"], "sentiment")
    r_indexed = r.to_indexed().make_vocab(min_word_count=0)
    vocab_path = r_indexed.vocab_file_path

    try:
        self.assertTrue(os.path.exists(vocab_path))

        vocab = r_indexed.vocab_data()
        self.assertEqual(len(vocab), 4)  # good/bad/sentence/unk (train + test)

        train_idx = r_indexed.train_data()
        self.assertEqual(len(train_idx), 1)
    finally:
        # Guard the removal: if the file was never created, the existence
        # assertion above is the failure we want reported, not a
        # FileNotFoundError raised during cleanup.
        if os.path.exists(vocab_path):
            os.remove(vocab_path)
def make_resource(self, data_root):
    """Build the Resource for ``data_root`` according to ``self.kind``.

    For the kinds listed below the resource gets two columns, the target
    column followed by ``"review"``. Any other kind yields a Resource
    constructed from ``data_root`` alone.
    """
    # (accepted kinds, target column) pairs, checked in this order.
    targets_by_kind = (
        (("polarity", "polarity_v1"), "polarity"),
        (("rating",), "rating"),
        (("subjectivity",), "subjectivity"),
    )
    for kinds, target in targets_by_kind:
        if self.kind in kinds:
            return Resource(data_root, columns=[target, "review"], target=target)

    return Resource(data_root)
def test_to_batch(self):
    """Check batch shapes before and after turning text into word sequences.

    First the plain ``"train"`` batch is checked for shapes (4, 2) and
    (4, 1). After building the vocab and configuring the ``"text"`` column
    with ``as_word_seq(fixed_len=5)``, the batch restricted to the
    sentiment/text columns is expected to have shape
    (4, 5, len(vocab)).
    """
    resource = Resource(DATA_ROOT, ["sentiment", "text", "score"], "sentiment")

    features, labels = resource.to_batch("train")
    self.assertEqual(features.shape, (4, 2))
    self.assertEqual(labels.shape, (4, 1))

    fixed_len = 5
    resource.make_vocab()
    text_column = resource.column("text")
    text_column.as_word_seq(fixed_len=fixed_len)

    features, labels = resource.to_batch("train", columns=("sentiment", "text"))
    self.assertEqual(features.shape, (4, fixed_len, len(resource.vocab)))
def make_resource(self, data_root):
    """Build a single-column ("sentence") Resource with explicit file patterns.

    The ``pattern`` mapping passed to Resource associates each of the keys
    train/test/valid/samples with the string used for that kind of file.
    """
    file_patterns = {
        "train": ".train",
        "test": ".test",
        "valid": ".valid",
        "samples": "_samples",
    }
    return Resource(data_root, ["sentence"], pattern=file_patterns)
def chazutsu(self, path, columns=None, target="", separator="\t", pattern=()):
    """Create a chazutsu Resource from the given arguments.

    All arguments are forwarded positionally, in order, to
    ``chazutsu.datasets.framework.resource.Resource``. The import is done
    inside the method, so chazutsu is only loaded when this is called.
    """
    from chazutsu.datasets.framework.resource import Resource as ChazutsuResource

    return ChazutsuResource(path, columns, target, separator, pattern)
def test_to_batch_iter(self):
    """Check the batches produced by ``to_batch_iter`` on the train data.

    After building the vocab and configuring the ``"text"`` column as a
    fixed-length word sequence, the iterator is drawn from four times.
    Each batch must have labels of shape (batch_size, 1) and features of
    shape (batch_size, fixed_len, len(vocab)); the features are converted
    back through the text column and printed for a visual check.
    """
    resource = Resource(DATA_ROOT, ["sentiment", "text", "score"], "sentiment")
    resource.make_vocab()

    batch_size = 2
    fixed_len = 5
    resource.column("text").as_word_seq(fixed_len=fixed_len)

    batches, count = resource.to_batch_iter(
        "train",
        columns=("sentiment", "text"),
        batch_size=batch_size,
    )
    self.assertEqual(count, batch_size)

    for _ in range(4):
        features, labels = next(batches)
        self.assertEqual(labels.shape, (batch_size, 1))
        self.assertEqual(
            features.shape,
            (batch_size, fixed_len, len(resource.vocab)),
        )
        print(resource.column("text").back(features))
def test_read_resource(self):
    """Check that the Resource exposes the expected path for each test file.

    For every entry of ``self.TEST_FILES`` the expected path is
    ``DATA_ROOT`` joined with the file name. It is compared with the
    Resource attribute that corresponds to the entry's key; an
    unrecognised key is compared against the empty string.
    """
    resource = Resource(DATA_ROOT)

    # Key of TEST_FILES -> name of the Resource attribute holding its path.
    path_attr_by_kind = {
        "train": "train_file_path",
        "test": "test_file_path",
        "sample": "sample_file_path",
        "data": "data_file_path",
    }

    for kind in self.TEST_FILES:
        file_name = self.TEST_FILES[kind]
        expected = os.path.join(DATA_ROOT, file_name)

        # Look the attribute up lazily so only the relevant one is read.
        attr_name = path_attr_by_kind.get(kind)
        actual = "" if attr_name is None else getattr(resource, attr_name)

        self.assertEqual(actual, expected)
def make_resource(self, data_root):
    """Build a Resource with news/summary columns, using summary as target."""
    column_names = ["news", "summary"]
    return Resource(data_root, columns=column_names, target="summary")
def make_resource(self, data_root):
    """Build a Resource from ``data_root`` alone, with no extra arguments."""
    resource = Resource(data_root)
    return resource
def make_resource(self, data_root):
    """Build a Resource with the five columns below, using group as target."""
    return Resource(
        data_root,
        columns=["group", "group-category", "subject", "author", "text"],
        target="group",
    )
def make_resource(self, data_root):
    """Build a Resource with ``self.columns`` for the train and dev kinds.

    Returns None when ``self.kind`` is neither ``"train"`` nor ``"dev"``.
    """
    # Both kinds are built the same way, so one membership test covers them.
    if self.kind in ("train", "dev"):
        return Resource(data_root, columns=self.columns)
    return None
def make_resource(self, data_root):
    """Build a Resource with the four columns below, using polarity as target."""
    column_names = ["sentence-type", "polarity", "detail", "review"]
    return Resource(data_root, columns=column_names, target="polarity")