Ejemplo n.º 1
0
    def test_fit_predict_dump_load(self):
        """Fits a model, tests that it predicts correctly, dumps and loads it, then tests that it still predicts"""

        model = Model(self.pipeline)

        # Test attempting to predict before fitting
        with self.assertRaises(RuntimeError):
            model.predict('Lorem ipsum dolor sit amet.')

        model.fit(self.dataset,
                  groundtruth_directory=self.groundtruth_2_directory)
        # Test X and y data are set
        self.assertTrue(model.X_data)
        self.assertTrue(model.y_data)

        # Test that there is at least one prediction
        resulting_ann = model.predict(
            'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
        )
        self.assertIsInstance(resulting_ann, Annotations)
        self.assertTrue(resulting_ann)

        # Test prediction over directory
        resulting_dataset = model.predict(
            self.dataset.data_directory,
            prediction_directory=self.prediction_directory)
        self.assertIsInstance(resulting_dataset, Dataset)
        self.assertEqual(len(self.dataset), len(resulting_dataset))

        # Test that groundtruth is written
        groundtruth_dataset = Dataset(self.groundtruth_2_directory)
        expected = [d.file_name for d in self.dataset]
        actual = [d.file_name for d in groundtruth_dataset]
        self.assertListEqual(expected, actual)

        # Test that the groundtruth ann files have content
        for ann in groundtruth_dataset.generate_annotations():
            self.assertTrue(ann)

        # Test pickling a model
        pickle_path = os.path.join(self.prediction_directory, 'test.pkl')
        model.dump(pickle_path)
        new_model = Model(self.pipeline)
        new_model.load(pickle_path)

        # Test that there is at least one prediction
        resulting_ann = new_model.predict(
            'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
        )
        self.assertIsInstance(resulting_ann, Annotations)
        self.assertTrue(resulting_ann)
Ejemplo n.º 2
0
    def test_cross_validate_create_groundtruth_predictions(self):
        """
        Tests that during cross validation, the medaCy groundtruth (that is, the version of the training dataset
        used by medaCy) is written as well as the predictions that are created for each fold
        """
        model = Model(self.pipeline)
        model.cross_validate(self.dataset,
                             num_folds=2,
                             prediction_directory=self.prediction_directory_3,
                             groundtruth_directory=self.groundtruth_directory)

        prediction_dataset = Dataset(self.prediction_directory_3)
        groundtruth_dataset = Dataset(self.groundtruth_directory)

        for d in [prediction_dataset, groundtruth_dataset]:
            self.assertIsInstance(d, Dataset)

        original_file_names = {d.file_name for d in self.dataset}
        prediction_file_names = {d.file_name for d in prediction_dataset}
        groundtruth_file_names = {d.file_name for d in groundtruth_dataset}

        for n in [prediction_file_names, groundtruth_file_names]:
            self.assertSetEqual(n, original_file_names)

        # Container for all Annotations in all files in all folds
        all_anns_all_folds_actual = Annotations([])

        # Test that fold groundtruth is written to file
        for fold_name in ["fold_1", "fold_2"]:
            fold_dataset = Dataset(groundtruth_dataset.data_directory /
                                   fold_name)
            for d in fold_dataset:
                fold_ann = Annotations(d.ann_path)
                groundtruth_ann = groundtruth_dataset[d.file_name]
                # Test that the entities in the fold groundtruth are a subset of the whole for that file
                self.assertTrue(set(fold_ann) <= set(groundtruth_ann))
                all_anns_all_folds_actual |= fold_ann

        # Container for all annotations pulled directly from the groundtruth dataset
        all_groundtruth_tuples = Annotations([])
        for ann in groundtruth_dataset.generate_annotations():
            all_groundtruth_tuples |= ann

        expected = set(all_groundtruth_tuples)
        actual = set(all_anns_all_folds_actual)
        self.assertSetEqual(expected, actual)