Example #1
    def test_fit_predict_dump_load(self):
        """Fits a model, tests that it predicts correctly, dumps and loads it, then tests that it still predicts"""

        model = Model(self.pipeline)

        # Test attempting to predict before fitting
        with self.assertRaises(RuntimeError):
            model.predict('Lorem ipsum dolor sit amet.')

        model.fit(self.dataset,
                  groundtruth_directory=self.groundtruth_2_directory)
        # Test X and y data are set
        self.assertTrue(model.X_data)
        self.assertTrue(model.y_data)

        # Test that there is at least one prediction
        resulting_ann = model.predict(
            'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
        )
        self.assertIsInstance(resulting_ann, Annotations)
        self.assertTrue(resulting_ann)

        # Test prediction over directory
        resulting_dataset = model.predict(
            self.dataset.data_directory,
            prediction_directory=self.prediction_directory)
        self.assertIsInstance(resulting_dataset, Dataset)
        self.assertEqual(len(self.dataset), len(resulting_dataset))

        # Test that groundtruth is written
        groundtruth_dataset = Dataset(self.groundtruth_2_directory)
        expected = [d.file_name for d in self.dataset]
        actual = [d.file_name for d in groundtruth_dataset]
        self.assertListEqual(expected, actual)

        # Test that the groundtruth ann files have content
        for ann in groundtruth_dataset.generate_annotations():
            self.assertTrue(ann)

        # Test pickling a model
        pickle_path = os.path.join(self.prediction_directory, 'test.pkl')
        model.dump(pickle_path)
        new_model = Model(self.pipeline)
        new_model.load(pickle_path)

        # Test that there is at least one prediction
        resulting_ann = new_model.predict(
            'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
        )
        self.assertIsInstance(resulting_ann, Annotations)
        self.assertTrue(resulting_ann)
Example #2
    def test_predict(self):
        """
        predict() has different functionality depending on what is passed to it; therefore this test
        ensures that each type of input is handled correctly
        """

        # Init the Model
        pipe = TestingPipeline(entities=self.entities)
        sample_model_path = os.path.join(test_dir, 'sample_models',
                                         'sample_test_pipe.pkl')
        model = Model(pipe)
        model.load(sample_model_path)

        # Test passing a Dataset
        dataset_output = model.predict(self.dataset)
        self.assertIsInstance(dataset_output, Dataset)
        self.assertEqual(len(dataset_output), len(self.dataset))

        # Test passing a directory
        directory_output = model.predict(self.dataset.data_directory)
        self.assertIsInstance(directory_output, Dataset)
        self.assertEqual(len(directory_output), len(self.dataset))

        # Test passing a string
        string_output = model.predict('This is a sample string.')
        self.assertIsInstance(string_output, Annotations)

        # Test that the predictions are written to the expected location when no path is provided
        expected_dir = os.path.join(self.dataset.data_directory, 'predictions')
        self.assertTrue(os.path.isdir(expected_dir))

        # Delete that directory
        shutil.rmtree(expected_dir)

        # Test predicting to a specific directory
        model.predict(self.dataset.data_directory,
                      prediction_directory=self.prediction_directory_2)
        expected_files = os.listdir(self.prediction_directory_2)
        self.assertEqual(6, len(expected_files))
Example #3
    def test_prediction_with_testing_pipeline(self):
        """Tests that a model created with the BiLSTM+CRF can be fitted and used to predict"""
        pipeline = LstmSystematicReviewPipeline(
            entities=self.entities,
            word_embeddings=word_embeddings,
            cuda_device=cuda_device)

        model = Model(pipeline)
        model.fit(self.dataset)
        resulting_dataset = model.predict(
            self.dataset, prediction_directory=self.prediction_directory)
        self.assertIsInstance(resulting_dataset, Dataset)
        # Test that there is at least one prediction
        if not any(resulting_dataset.generate_annotations()):
            warn("The model did not generate any predictions")
Example #4
    def test_cross_validate_fit_predict(self):
        """Tests that a model created with BERT can be fitted and used to predict, with and without the CRF layer"""
        pipeline = BertPipeline(entities=self.entities,
                                pretrained_model='bert-base-cased',
                                batch_size=self.batch_size,
                                cuda_device=cuda_device)

        pipeline_crf = BertPipeline(entities=self.entities,
                                    pretrained_model='bert-base-cased',
                                    batch_size=self.batch_size,
                                    cuda_device=cuda_device,
                                    using_crf=True)

        for pipe in [pipeline, pipeline_crf]:
            model = Model(pipe)
            model.cross_validate(self.dataset, 2)
            model.fit(self.dataset)
            resulting_dataset = model.predict(
                self.dataset, prediction_directory=self.prediction_directory)
            self.assertIsInstance(resulting_dataset, Dataset)
            # Test that there is at least one prediction
            if not any(resulting_dataset.generate_annotations()):
                warn("The model did not generate any predictions")
Example #5
pipeline = SystematicReviewPipeline(entities=entities, use_metamap=True)
# n_jobs: the number of cores to utilize during feature extraction when training the model.
# Note: this is done by forking, not threading, and hence uses a large amount of memory.
model = Model(pipeline, n_jobs=1)

# Write information about the model before training
with open(model_directory + "/model_information.txt", 'w') as model_info:
    model_info.write("Entities: [%s]\n" % ", ".join(entities))
    model_info.write("Training Files: %i\n" %
                     len(train_dataset.get_data_files()))
    model_info.write(model_notes + "\n")
    model_info.write(str(model))

model.fit(train_dataset)

# dump fitted model
current_time = datetime.datetime.fromtimestamp(
    time.time()).strftime('%Y_%m_%d_%H.%M.%S')
model.dump(model_directory + "/tac_2018_%s_%s.pkl" %
           (model_name, current_time))

# Predict over the evaluation dataset using the model trained above, then store those
# predictions in the given output directory.
model.predict(evaluation_dataset,
              prediction_directory=os.path.join(model_directory,
                                                'predictions'))

# Perform sequence-stratified cross validation using the trained model.
# Note that all extracted features are stored in memory while this runs.
model.cross_validate(training_dataset=train_dataset)
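
A minimal follow-up sketch (reusing pipeline, model_directory, model_name, and current_time from the script above): reload the dumped pickle into a fresh Model and check that it still predicts, mirroring the round-trip shown in Example #1.

# Sketch: reload the fitted model from the pickle dumped above and confirm it still predicts.
# Assumes pipeline, model_directory, model_name, and current_time are still in scope.
reloaded_model = Model(pipeline)
reloaded_model.load(model_directory + "/tac_2018_%s_%s.pkl" %
                    (model_name, current_time))
sample_annotations = reloaded_model.predict('This is a sample string.')
print(sample_annotations)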