Example no. 1
def main():
    """Train the baseline merge classifier on MultiNLI and print a test report."""
    project_root = os.path.join(os.path.dirname(__file__), "../../")
    dataset = MultiNLIDataset(project_root)
    trainer = BaselineTrainer(project_root, log_dir="classifier_baseline")
    trainer.build()

    seq_len = 25
    vocab_size = len(trainer.preprocessor.vocabulary.get())

    def to_model_input(batch):
        # The baseline model consumes only the token-id tensor.
        return trainer.preprocess(batch, seq_len)["text"]

    classifier = MergeClassifier(vocab_size)
    classifier.build(trainer.num_classes, to_model_input)

    trainer.train(classifier.model,
                  epochs=25,
                  sequence_length=seq_len,
                  representation="GloVe.6B.100d")

    test_data = dataset.test_data()
    predictions = classifier.predict(test_data["text"])

    report = classification_report(test_data["label"],
                                   predictions,
                                   target_names=dataset.labels())
    print(report)
Example no. 2
def main(graph_type="dependency", epochs=25):
    """Train the graph-based classifier on MultiNLI and print a test report.

    Args:
        graph_type: "dependency" or "similarity"; any other value selects
            the static graph builder.
        epochs: number of training epochs.
    """
    project_root = os.path.join(os.path.dirname(__file__), "../../")
    dataset = MultiNLIDataset(project_root)

    # Map graph-type name to its builder class; unknown names fall back
    # to the static graph, matching the original if/elif/else chain.
    builder_classes = {
        "dependency": DependencyGraph,
        "similarity": SimilarityGraph,
    }
    graph_builder = builder_classes.get(graph_type, StaticGraph)(lang="en")

    trainer = Trainer(graph_builder, project_root, log_dir="classifier")
    trainer.build()

    seq_len = 25
    vocab_size = len(trainer.preprocessor.vocabulary.get())

    def to_model_input(batch):
        processed = trainer.preprocess(batch, seq_len)
        return (processed["text"], processed["graph"])

    classifier = GraphBasedClassifier(vocab_size, seq_len, lstm=None)
    classifier.build(trainer.num_classes, to_model_input)

    trainer.train(classifier.model, epochs=epochs)

    test_data = dataset.test_data()
    predictions = classifier.predict(test_data["text"])

    report = classification_report(test_data["label"],
                                   predictions,
                                   target_names=dataset.labels())
    print(report)
    def test_baseline(self):
        """The TF-IDF baseline should yield non-empty scores on the test split."""
        root = os.path.join(os.path.dirname(__file__), "../../")
        data = MultiNLIDataset(root).test_data()

        scores = TfidfClassifier().fit(data["text"], data["label"])
        self.assertTrue(len(scores) > 0)
Example no. 4
def main():
    """Fit the TF-IDF classifier on the MultiNLI train split and report on test."""
    root = os.path.join(os.path.dirname(__file__), "../../")
    dataset = MultiNLIDataset(root)
    classifier = TfidfClassifier()

    train = dataset.train_data()
    classifier.fit(train["text"], train["label"])

    test = dataset.test_data()
    predicted = classifier.predict(test["text"])

    report = classification_report(test["label"], predicted,
                                   target_names=dataset.labels())
    print(report)
Example no. 5
    def _test_train(self, graph_type):
        """Smoke-test training for a graph type: accuracy must improve over
        two epochs and attention output must have the expected shape."""
        root = os.path.join(os.path.dirname(__file__), "../../")
        seq_len = 25
        num_heads = 3

        dataset = MultiNLIDataset(root)
        test_data = dataset.test_data()
        # Draw one random sample text for the attention check below.
        sample_idx = np.random.randint(len(test_data), size=1)[0]
        text = test_data["text"].iloc[sample_idx]

        if graph_type == "dependency":
            graph_builder = DependencyGraph(lang="en")
        elif graph_type == "similarity":
            graph_builder = SimilarityGraph(lang="en")
        else:
            graph_builder = StaticGraph(lang="en")

        trainer = Trainer(graph_builder,
                          root,
                          preprocessor_name="test_ct_preprocessor")
        trainer.build(data_kind="test")

        def preprocessor(x):
            processed = trainer.preprocess(x, seq_len)
            return (processed["text"], processed["graph"])

        _, g = preprocessor([text])
        vocab_size = len(trainer.preprocessor.vocabulary.get())
        model = GraphBasedClassifier(vocab_size, seq_len, heads=num_heads)
        model.build(trainer.num_classes, preprocessor)

        metrics = trainer.train(model.model, epochs=2)
        os.remove(trainer.preprocessor_path)
        # Training for two epochs must strictly improve accuracy.
        history = metrics.history["acc"]
        self.assertTrue(history[-1] - history[0] > 0)

        attention = model.show_attention([text])
        self.assertEqual(len(attention), 1)  # batch size
        layers = attention[0]
        self.assertEqual(len(layers), 2)  # layer count
        first_layer = layers[0]
        self.assertEqual(first_layer.shape,
                         (num_heads, seq_len, seq_len))
    def test_download(self):
        """Downloaded splits must be non-empty and class-balanced, and the
        interim/processed files must exist on disk."""
        root = os.path.join(os.path.dirname(__file__), "../../")
        dataset = MultiNLIDataset(root, prefix="test")
        dataset.download()

        for split in (dataset.train_data(), dataset.test_data()):
            self.assertTrue(len(split) > 0)
            counts = split["label"].value_counts().values.tolist()
            # Every label should occur equally often (balanced splits).
            first = counts[0]
            for count in counts:
                self.assertEqual(first, count)

        for kind in ("train", "test"):
            self.assertTrue(os.path.exists(dataset.interim_file(kind)))
            os.remove(dataset.interim_file(kind))

            self.assertTrue(os.path.exists(dataset.processed_file(kind)))
            os.remove(dataset.processed_file(kind))
Example no. 7
 def num_classes(self):
     """Return the number of distinct MultiNLI label classes."""
     labels = MultiNLIDataset.labels()
     return len(labels)
Example no. 8
 def download(self):
     """Download the MultiNLI dataset into this storage root and return the result."""
     dataset = MultiNLIDataset(self.storage.root)
     return dataset.download()