def main():
    """Train the baseline merge classifier on MultiNLI and print a test report.

    Builds the preprocessing pipeline, trains a `MergeClassifier` with
    GloVe.6B.100d word representations, then prints a scikit-learn style
    classification report on the held-out test split.
    """
    root = os.path.join(os.path.dirname(__file__), "../../")
    dataset = MultiNLIDataset(root)
    trainer = BaselineTrainer(root, log_dir="classifier_baseline")
    trainer.build()

    sequence_length = 25
    vocab_size = len(trainer.preprocessor.vocabulary.get())

    def preprocessor(x):
        # Convert raw text into fixed-length index sequences for the model.
        _x = trainer.preprocess(x, sequence_length)
        return _x["text"]

    model = MergeClassifier(vocab_size)
    model.build(trainer.num_classes, preprocessor)
    # Fix: the return value was previously bound to an unused `metrics`
    # local; train() is invoked purely for its side effects here.
    trainer.train(model.model, epochs=25,
                  sequence_length=sequence_length,
                  representation="GloVe.6B.100d")

    test_data = dataset.test_data()
    y_pred = model.predict(test_data["text"])
    print(classification_report(test_data["label"], y_pred,
                                target_names=dataset.labels()))
def main(graph_type="dependency", epochs=25):
    """Train the graph-based classifier on MultiNLI and print a test report.

    Args:
        graph_type: Which graph builder to use — "dependency", "similarity",
            or anything else for the static (fallback) graph.
        epochs: Number of training epochs.
    """
    root = os.path.join(os.path.dirname(__file__), "../../")
    dataset = MultiNLIDataset(root)

    # Select the graph construction strategy lazily; only the chosen
    # builder is instantiated.
    if graph_type == "dependency":
        graph_builder = DependencyGraph(lang="en")
    elif graph_type == "similarity":
        graph_builder = SimilarityGraph(lang="en")
    else:
        graph_builder = StaticGraph(lang="en")

    trainer = Trainer(graph_builder, root, log_dir="classifier")
    trainer.build()

    sequence_length = 25
    vocab_size = len(trainer.preprocessor.vocabulary.get())

    def preprocessor(x):
        # Produce both the token-index sequences and the adjacency graph
        # expected by GraphBasedClassifier.
        _x = trainer.preprocess(x, sequence_length)
        return (_x["text"], _x["graph"])

    model = GraphBasedClassifier(vocab_size, sequence_length, lstm=None)
    model.build(trainer.num_classes, preprocessor)
    # Fix: the return value was previously bound to an unused `metrics`
    # local; train() is invoked purely for its side effects here.
    trainer.train(model.model, epochs=epochs)

    test_data = dataset.test_data()
    y_pred = model.predict(test_data["text"])
    print(classification_report(test_data["label"], y_pred,
                                target_names=dataset.labels()))
def test_baseline(self):
    """The tf-idf baseline should yield at least one fit score on test data."""
    data_root = os.path.join(os.path.dirname(__file__), "../../")
    data = MultiNLIDataset(data_root).test_data()
    scores = TfidfClassifier().fit(data["text"], data["label"])
    self.assertTrue(len(scores) > 0)
def main():
    """Fit the tf-idf baseline on MultiNLI train data and print a test report."""
    root = os.path.join(os.path.dirname(__file__), "../../")
    dataset = MultiNLIDataset(root)
    classifier = TfidfClassifier()

    train = dataset.train_data()
    classifier.fit(train["text"], train["label"])

    test = dataset.test_data()
    predictions = classifier.predict(test["text"])
    report = classification_report(test["label"], predictions,
                                   target_names=dataset.labels())
    print(report)
def _test_train(self, graph_type):
    """End-to-end smoke test: train a GraphBasedClassifier briefly and
    verify that accuracy improves and attention has the expected shape."""
    root = os.path.join(os.path.dirname(__file__), "../../")
    sequence_length = 25
    heads = 3

    dataset = MultiNLIDataset(root)
    test_data = dataset.test_data()
    # Pick one random example to exercise preprocessing and attention.
    index = np.random.randint(len(test_data), size=1)[0]
    text = test_data["text"].iloc[index]

    # Instantiate only the builder matching the requested graph type.
    if graph_type == "dependency":
        builder = DependencyGraph(lang="en")
    elif graph_type == "similarity":
        builder = SimilarityGraph(lang="en")
    else:
        builder = StaticGraph(lang="en")

    trainer = Trainer(builder, root,
                      preprocessor_name="test_ct_preprocessor")
    trainer.build(data_kind="test")

    def preprocessor(x):
        # Return (token indices, graph) as the model expects.
        _x = trainer.preprocess(x, sequence_length)
        return (_x["text"], _x["graph"])

    # Exercise the preprocessing path on the sampled example.
    preprocessor([text])

    vocab_size = len(trainer.preprocessor.vocabulary.get())
    model = GraphBasedClassifier(vocab_size, sequence_length, heads=heads)
    model.build(trainer.num_classes, preprocessor)
    metrics = trainer.train(model.model, epochs=2)
    os.remove(trainer.preprocessor_path)

    # Training for two epochs should move accuracy upward.
    self.assertTrue(
        metrics.history["acc"][-1] - metrics.history["acc"][0] > 0)

    attention = model.show_attention([text])
    self.assertEqual(len(attention), 1)  # batch size
    attention = attention[0]
    self.assertEqual(len(attention), 2)  # layer count
    attention = attention[0]
    self.assertEqual(attention.shape,
                     (heads, sequence_length, sequence_length))
def test_download(self):
    """Download MultiNLI, check both splits are non-empty and label-balanced,
    then remove the interim/processed artifacts created on disk."""
    root = os.path.join(os.path.dirname(__file__), "../../")
    dataset = MultiNLIDataset(root, prefix="test")
    dataset.download()

    # Evaluate train first, then test, preserving access order.
    for split in (dataset.train_data(), dataset.test_data()):
        self.assertTrue(len(split) > 0)
        counts = split["label"].value_counts().values.tolist()
        first = counts[0]
        # Every label should appear the same number of times (balanced).
        for other in counts:
            self.assertEqual(first, other)

    # Clean up files produced by the download pipeline.
    for kind in ("train", "test"):
        self.assertTrue(os.path.exists(dataset.interim_file(kind)))
        os.remove(dataset.interim_file(kind))
        self.assertTrue(os.path.exists(dataset.processed_file(kind)))
        os.remove(dataset.processed_file(kind))
def num_classes(self):
    """Number of target classes, taken from the dataset's label set."""
    labels = MultiNLIDataset.labels()
    return len(labels)
def download(self):
    """Download the MultiNLI dataset into the storage root and return the result."""
    return MultiNLIDataset(self.storage.root).download()