Example #1
    def test_cross_validate(self):
        """Ensures that changes made in the package do not prevent cross_validate from running to completion"""
        model = Model(self.pipeline)

        # Test that invalid fold counts raise ValueError
        for num in [-1, 0, 1]:
            with self.assertRaises(ValueError):
                model.cross_validate(self.dataset, num)

        try:
            resulting_data = model.cross_validate(self.dataset, 2)
            # Logging the results makes it easier to verify that cross validation produced the expected output
            logging.debug(resulting_data)
        except Exception:
            self.fail("cross_validate raised an exception when run with valid arguments")
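The ValueError assertions above imply that cross_validate checks its fold count before doing any work. A minimal sketch of such a guard; the function name and message are assumptions, not medaCy's actual implementation:

def _validate_num_folds(num_folds):
    # Hypothetical guard mirroring the behavior the test asserts:
    # k-fold cross validation needs at least two folds
    if num_folds <= 1:
        raise ValueError("num_folds must be greater than 1, got %d" % num_folds)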
Example #2
    def test_cross_validate_create_groundtruth_predictions(self):
        """
        Tests that during cross validation, the medaCy groundtruth (that is, the version of the training dataset
        used by medaCy) is written as well as the predictions that are created for each fold
        """
        model = Model(self.pipeline)
        model.cross_validate(self.dataset,
                             num_folds=2,
                             prediction_directory=self.prediction_directory_3,
                             groundtruth_directory=self.groundtruth_directory)

        prediction_dataset = Dataset(self.prediction_directory_3)
        groundtruth_dataset = Dataset(self.groundtruth_directory)

        for d in [prediction_dataset, groundtruth_dataset]:
            self.assertIsInstance(d, Dataset)

        original_file_names = {d.file_name for d in self.dataset}
        prediction_file_names = {d.file_name for d in prediction_dataset}
        groundtruth_file_names = {d.file_name for d in groundtruth_dataset}

        for n in [prediction_file_names, groundtruth_file_names]:
            self.assertSetEqual(n, original_file_names)

        # Container for all Annotations in all files in all folds
        all_anns_all_folds_actual = Annotations([])

        # Test that fold groundtruth is written to file
        for fold_name in ["fold_1", "fold_2"]:
            fold_dataset = Dataset(groundtruth_dataset.data_directory /
                                   fold_name)
            for d in fold_dataset:
                fold_ann = Annotations(d.ann_path)
                groundtruth_ann = groundtruth_dataset[d.file_name]
                # Test that the entities in the fold groundtruth are a subset of the whole for that file
                self.assertTrue(set(fold_ann) <= set(groundtruth_ann))
                all_anns_all_folds_actual |= fold_ann

        # Container for all annotations pulled directly from the groundtruth dataset
        all_groundtruth_tuples = Annotations([])
        for ann in groundtruth_dataset.generate_annotations():
            all_groundtruth_tuples |= ann

        expected = set(all_groundtruth_tuples)
        actual = set(all_anns_all_folds_actual)
        self.assertSetEqual(expected, actual)
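The subset and union checks above treat Annotations instances as set-like collections of entity tuples. The invariant being asserted can be illustrated with plain sets; the (label, start, end, text) tuples below are invented for illustration:

# Each fold's groundtruth is a subset of the whole, and the folds together
# reconstruct the whole groundtruth exactly
fold_1 = {("Drug", 0, 7, "aspirin")}
fold_2 = {("Dose", 12, 17, "81 mg")}
groundtruth = fold_1 | fold_2

assert fold_1 <= groundtruth and fold_2 <= groundtruth
combined = set()
for fold in (fold_1, fold_2):
    combined |= fold
assert combined == groundtruth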
Example #3
    def test_cross_validate_fit_predict(self):
        """Tests that a model created with BERT can be fitted and used to predict, with and without the CRF layer"""
        pipeline = BertPipeline(entities=self.entities,
                                pretrained_model='bert-base-cased',
                                batch_size=self.batch_size,
                                cuda_device=cuda_device)

        pipeline_crf = BertPipeline(entities=self.entities,
                                    pretrained_model='bert-base-cased',
                                    batch_size=self.batch_size,
                                    cuda_device=cuda_device,
                                    using_crf=True)

        for pipe in [pipeline, pipeline_crf]:
            model = Model(pipe)
            model.cross_validate(self.dataset, 2)
            model.fit(self.dataset)
            resulting_dataset = model.predict(
                self.dataset, prediction_directory=self.prediction_directory)
            self.assertIsInstance(resulting_dataset, Dataset)
            # Test that there is at least one prediction
            if not any(resulting_dataset.generate_annotations()):
                warn("The model did not generate any predictions")
Example #4
pipeline = SystematicReviewPipeline(entities=entities, use_metamap=True)
# n_jobs sets the number of cores to utilize during feature extraction when training the model.
# Note: parallelization is done by forking, not threading, and hence uses a large amount of memory.
model = Model(pipeline, n_jobs=1)

# Write information about the model before training
with open(os.path.join(model_directory, "model_information.txt"), 'w') as model_info:
    model_info.write("Entities: [%s]\n" % ", ".join(entities))
    model_info.write("Training Files: %i\n" %
                     len(train_dataset.get_data_files()))
    model_info.write(model_notes + "\n")
    model_info.write(str(model))

model.fit(train_dataset)

# Dump the fitted model to disk with a timestamp
current_time = datetime.datetime.now().strftime('%Y_%m_%d_%H.%M.%S')
model.dump(os.path.join(model_directory,
                        "tac_2018_%s_%s.pkl" % (model_name, current_time)))

# Predict over the evaluation dataset using the model trained above, then store
# the predictions in the given output directory
model.predict(evaluation_dataset,
              prediction_directory=os.path.join(model_directory,
                                                'predictions'))

# Perform sequence-stratified cross validation on the training dataset.
# Note that all extracted features are held in memory while this runs.
model.cross_validate(training_dataset=train_dataset)
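In a later session, the dumped model can be restored and reused. A sketch, assuming Model.load is the counterpart to the Model.dump call above:

# Rebuild the pipeline/model, then restore the pickled weights dumped earlier
model = Model(pipeline, n_jobs=1)
model.load(os.path.join(model_directory,
                        "tac_2018_%s_%s.pkl" % (model_name, current_time)))
model.predict(evaluation_dataset,
              prediction_directory=os.path.join(model_directory, 'predictions'))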