def test_fit_predict_dump_load(self):
    """Fit a model, verify it predicts, round-trip it through dump/load,
    and verify the reloaded model still predicts."""
    model = Model(self.pipeline)

    # Predicting before fit() must fail loudly.
    with self.assertRaises(RuntimeError):
        model.predict('Lorem ipsum dolor sit amet.')

    model.fit(self.dataset, groundtruth_directory=self.groundtruth_2_directory)

    # Fitting should populate the training-data attributes.
    self.assertTrue(model.X_data)
    self.assertTrue(model.y_data)

    sample_text = 'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'

    # A fitted model should yield at least one annotation for the sample text.
    annotations = model.predict(sample_text)
    self.assertIsInstance(annotations, Annotations)
    self.assertTrue(annotations)

    # Predicting over a directory should produce a Dataset of equal size.
    predicted = model.predict(
        self.dataset.data_directory,
        prediction_directory=self.prediction_directory)
    self.assertIsInstance(predicted, Dataset)
    self.assertEqual(len(self.dataset), len(predicted))

    # The groundtruth directory should mirror the source dataset's files.
    groundtruth = Dataset(self.groundtruth_2_directory)
    self.assertListEqual(
        [f.file_name for f in self.dataset],
        [f.file_name for f in groundtruth])

    # Each groundtruth ann file should be non-empty.
    for ann in groundtruth.generate_annotations():
        self.assertTrue(ann)

    # Round-trip the model through pickle and reload it into a fresh Model.
    pickle_path = os.path.join(self.prediction_directory, 'test.pkl')
    model.dump(pickle_path)
    reloaded = Model(self.pipeline)
    reloaded.load(pickle_path)

    # The reloaded model should still produce at least one annotation.
    annotations = reloaded.predict(sample_text)
    self.assertIsInstance(annotations, Annotations)
    self.assertTrue(annotations)
# Build the pipeline and model.
# n_jobs: number of cores to utilize during feature extraction when training
# the model. NOTE: parallelism is fork-based, not thread-based, hence it
# utilizes a large amount of memory.
pipeline = SystematicReviewPipeline(entities=entities, use_metamap=True)
model = Model(pipeline, n_jobs=1)

# Write information about the model before training so the run is traceable.
# Paths are assembled with os.path.join for consistency with the prediction
# directory below (rather than string concatenation).
with open(os.path.join(model_directory, "model_information.txt"), 'w') as model_info:
    model_info.write("Entities: [%s]\n" % ", ".join(entities))
    model_info.write("Training Files: %i\n" % len(train_dataset.get_data_files()))
    model_info.write(model_notes + "\n")
    model_info.write(str(model))

model.fit(train_dataset)

# Dump the fitted model under a timestamped filename.
# datetime.now() is equivalent to fromtimestamp(time.time()) but direct.
current_time = datetime.datetime.now().strftime('%Y_%m_%d_%H.%M.%S')
model.dump(os.path.join(model_directory,
                        "tac_2018_%s_%s.pkl" % (model_name, current_time)))

# Predict over the datasets in evaluation_dataset utilizing the model trained
# above, then store those predictions in a given output directory.
model.predict(evaluation_dataset,
              prediction_directory=os.path.join(model_directory, 'predictions'))

# Perform sequence-stratified cross validation over the trained model.
# Note that all extracted features are stored in memory while this runs.
model.cross_validate(training_dataset=train_dataset)