예제 #1
0
    def test_to_pandas(self):
        """Load the training split and check the sizes of its two parts.

        With ``split_target=True`` the result of ``train_data`` is unpacked
        into a ``(target, text)`` pair, and each part is expected to have
        length 1 for the fixture stored under ``DATA_ROOT``.
        """
        resource = Resource(DATA_ROOT, ["sentiment", "text"], "sentiment")
        target, text = resource.train_data(split_target=True)
        for part in (target, text):
            self.assertEqual(len(part), 1)

        # Print the first row of the unsplit data for manual inspection.
        first_row = resource.train_data().head(1)
        print(first_row)
예제 #2
0
    def test_to_indexed(self):
        """Build a vocabulary on the indexed resource and verify its contents.

        ``make_vocab`` is expected to write a vocabulary file at
        ``vocab_file_path``. The file is removed in a ``finally`` clause so
        that a failing assertion does not leave it behind for later tests.
        """
        r = Resource(DATA_ROOT, ["sentiment", "text"], "sentiment")
        r_indexed = r.to_indexed().make_vocab(min_word_count=0)
        vocab_path = r_indexed.vocab_file_path

        try:
            self.assertTrue(os.path.exists(vocab_path))
            vocab = r_indexed.vocab_data()
            self.assertEqual(len(vocab), 4)  # good/bad/sentence/unk (train + test)
            train_idx = r_indexed.train_data()
            self.assertEqual(len(train_idx), 1)
        finally:
            # Guard the removal: if the existence assertion above failed,
            # an unconditional os.remove would raise FileNotFoundError and
            # mask the real test failure.
            if os.path.exists(vocab_path):
                os.remove(vocab_path)
예제 #3
0
파일: movie_review.py 프로젝트: yk/chazutsu
 def make_resource(self, data_root):
     """Return the ``Resource`` that matches ``self.kind``.

     For the "polarity"/"polarity_v1", "rating" and "subjectivity" kinds
     the resource has two columns, ``[<label>, "review"]``, with the label
     column as target. Any other kind yields a ``Resource`` built from
     ``data_root`` alone.
     """
     kind = self.kind
     if kind in ("polarity", "polarity_v1"):
         label = "polarity"
     elif kind == "rating":
         label = "rating"
     elif kind == "subjectivity":
         label = "subjectivity"
     else:
         # Unknown kind: no column layout is declared.
         return Resource(data_root)

     return Resource(data_root, columns=[label, "review"], target=label)
예제 #4
0
 def test_to_batch(self):
     """Check the array shapes returned by ``to_batch`` on the train split.

     The first call uses the default columns; the second runs after the
     vocabulary is built and the "text" column is switched to a
     fixed-length word sequence.
     """
     resource = Resource(DATA_ROOT, ["sentiment", "text", "score"], "sentiment")

     # Default batch: presumably 4 rows with the 2 non-target columns.
     features, labels = resource.to_batch("train")
     self.assertEqual(features.shape, (4, 2))
     self.assertEqual(labels.shape, (4, 1))

     # Word-sequence batch: the last axis is expected to match the vocab size.
     resource.make_vocab()
     text_column = resource.column("text")
     text_column.as_word_seq(fixed_len=5)
     features, labels = resource.to_batch("train",
                                          columns=("sentiment", "text"))
     self.assertEqual(features.shape, (4, 5, len(resource.vocab)))
예제 #5
0
파일: wikitext2.py 프로젝트: yk/chazutsu
 def make_resource(self, data_root):
     """Return a single-column ("sentence") ``Resource`` for ``data_root``.

     The ``pattern`` mapping associates each split name with the text
     fragment passed to ``Resource`` for that split.
     """
     file_patterns = {
         "train": ".train",
         "test": ".test",
         "valid": ".valid",
         "samples": "_samples",
     }
     return Resource(data_root, ["sentence"], pattern=file_patterns)
예제 #6
0
    def chazutsu(self,
                 path,
                 columns=None,
                 target="",
                 separator="\t",
                 pattern=()):
        """Build and return a chazutsu ``Resource`` from the given arguments.

        All parameters are forwarded positionally, in order, to
        ``Resource(path, columns, target, separator, pattern)``.
        """
        # Imported at call time, so chazutsu is only needed when this
        # method is actually used.
        from chazutsu.datasets.framework.resource import Resource

        return Resource(path, columns, target, separator, pattern)
예제 #7
0
 def test_to_batch_iter(self):
     """Iterate batches from ``to_batch_iter`` and check each batch's shape."""
     resource = Resource(DATA_ROOT, ["sentiment", "text", "score"], "sentiment")
     resource.make_vocab()
     batch_size, fixed_len = 2, 5
     resource.column("text").as_word_seq(fixed_len=fixed_len)

     iterator, count = resource.to_batch_iter(
         "train",
         columns=("sentiment", "text"),
         batch_size=batch_size)
     self.assertEqual(count, batch_size)

     # Pull 4 batches; NOTE(review): this is more than ``count``, so the
     # iterator presumably cycles over the data. Confirm against Resource.
     for _ in range(4):
         X, y = next(iterator)
         self.assertEqual(y.shape, (batch_size, 1))
         self.assertEqual(X.shape, (batch_size, fixed_len, len(resource.vocab)))
         # Decode the batch back through the text column for manual inspection.
         print(resource.column("text").back(X))
예제 #8
0
 def test_read_resource(self):
     """Check that each ``*_file_path`` attribute points at its fixture file.

     For every entry of ``self.TEST_FILES`` the expected path is the file
     name joined onto ``DATA_ROOT``. Kinds without a matching attribute
     are compared against an empty string.
     """
     resource = Resource(DATA_ROOT)
     attr_by_kind = {
         "train": "train_file_path",
         "test": "test_file_path",
         "sample": "sample_file_path",
         "data": "data_file_path",
     }
     for kind, file_name in self.TEST_FILES.items():
         expected = os.path.join(DATA_ROOT, file_name)
         attr = attr_by_kind.get(kind)
         # Resolve the attribute lazily so only the relevant one is read.
         actual = getattr(resource, attr) if attr is not None else ""
         self.assertEqual(actual, expected)
예제 #9
0
 def make_resource(self, data_root):
     """Return a ``Resource`` with "news"/"summary" columns, targeting "summary"."""
     columns = ["news", "summary"]
     return Resource(data_root, columns=columns, target="summary")
예제 #10
0
 def make_resource(self, data_root):
     """Return a ``Resource`` built from ``data_root`` with no other arguments."""
     resource = Resource(data_root)
     return resource
예제 #11
0
파일: news_group20.py 프로젝트: yk/chazutsu
 def make_resource(self, data_root):
     """Return a five-column ``Resource`` for ``data_root``, targeting "group"."""
     return Resource(
         data_root,
         columns=["group", "group-category", "subject", "author", "text"],
         target="group")
예제 #12
0
 def make_resource(self, data_root):
     """Return a ``Resource`` using ``self.columns`` for the known kinds.

     Both the "train" and "dev" kinds build the same resource. For any
     other kind the method returns ``None``.
     """
     if self.kind in ("train", "dev"):
         return Resource(data_root, columns=self.columns)
     return None
예제 #13
0
 def make_resource(self, data_root):
     """Return a four-column ``Resource`` for ``data_root``, targeting "polarity"."""
     columns = ["sentence-type", "polarity", "detail", "review"]
     return Resource(data_root, columns=columns, target="polarity")