def setUp(self):
    """Build the baseline segmentation fixtures used by the tests.

    Extracts features from ``train.json`` and ``dev.json`` under
    ``SENTENCE_SPLIT_DIR`` with a ``BaselineSegmentationFeatureExtractor``,
    stores the dev features on ``self.segmentation_dev_instances`` and a
    trained ``NaiveBayesClassifier`` on ``self.segmentation_classifier``.
    """
    extractor = BaselineSegmentationFeatureExtractor()
    train_path = os.path.join(SENTENCE_SPLIT_DIR, "train.json")
    dev_path = os.path.join(SENTENCE_SPLIT_DIR, "dev.json")

    # Training features stay a lazy generator: they are only consumed once,
    # by ``train`` below.
    train_features = (
        extractor.extract_features(raw)
        for raw in load_segmentation_instances(train_path)
    )

    # Dev features are materialised into a list and kept on the test case.
    dev_features = []
    for raw in load_segmentation_instances(dev_path):
        dev_features.append(extractor.extract_features(raw))
    self.segmentation_dev_instances = dev_features

    # 2.0 is the classifier's only constructor argument
    # (presumably the smoothing constant k -- TODO confirm).
    classifier = self.segmentation_classifier = NaiveBayesClassifier(2.0)
    classifier.train(train_features)
def setUp(self) -> None:
    """Create ``self.inst_counter`` and feed it the dev-set instances.

    Features are produced by a ``SegmentationTestFeatureExtractor`` from
    ``dev.json`` under ``SENTENCE_SPLIT_DIR`` and handed to
    ``InstanceCounter.count_instances`` as a lazy generator.
    """
    counter = InstanceCounter()
    self.inst_counter = counter

    extractor = SegmentationTestFeatureExtractor()
    dev_path = os.path.join(SENTENCE_SPLIT_DIR, "dev.json")
    dev_features = (
        extractor.extract_features(raw)
        for raw in load_segmentation_instances(dev_path)
    )
    counter.count_instances(dev_features)
def test_tuned_segmentation(self):
    """Train and evaluate a classifier using the tuned feature extractor.

    Asserts only that the extractor exposes a non-``None`` ``k``; the
    classification report for label ``"y"`` on the dev set is printed for
    inspection rather than asserted against.
    """
    segmentation_extractor = TunedSegmentationFeatureExtractor()
    self.assertIsNotNone(segmentation_extractor.k)

    train_path = os.path.join(SENTENCE_SPLIT_DIR, "train.json")
    dev_path = os.path.join(SENTENCE_SPLIT_DIR, "dev.json")

    # Lazy on purpose: consumed exactly once by ``train`` below.
    train_features = (
        segmentation_extractor.extract_features(raw)
        for raw in load_segmentation_instances(train_path)
    )

    dev_features = []
    for raw in load_segmentation_instances(dev_path):
        dev_features.append(segmentation_extractor.extract_features(raw))
    self.segmentation_dev_instances = dev_features

    classifier = NaiveBayesClassifier(segmentation_extractor.k)
    self.segmentation_classifier = classifier
    classifier.train(train_features)

    predicted, expected = classifier.test(self.segmentation_dev_instances)

    # Only the last element of the 5-tuple is used; the other four values
    # are unpacked but unused.
    _acc, _prec, _rec, _f1, report = classification_report(
        predicted, expected, "y")

    print(
        f"Tuned segmentation performance for k of {segmentation_extractor.k}:"
    )
    print(report)
def test_sentence_split_feature_extractor(self):
    """Check the baseline extractor's output on the dev segmentation data.

    The first instance of ``dev.json`` is inspected in detail (types,
    label and exact feature set); every remaining instance is only checked
    for having a label of ``"y"`` or ``"n"``.
    """
    valid_labels = frozenset(("y", "n"))
    extractor = BaselineSegmentationFeatureExtractor()
    dev_path = os.path.join(SENTENCE_SPLIT_DIR, "dev.json")
    dev_stream = load_segmentation_instances(dev_path)

    # ``next`` consumes the first instance, so the loop at the bottom only
    # sees the instances after it.
    first = extractor.extract_features(next(dev_stream))
    first_features = first.features

    self.assertEqual(ClassificationInstance, type(first))
    self.assertEqual(list, type(first_features))
    # NOTE(review): this asserts 3 >= len(features), i.e. an upper bound of
    # three features -- confirm that an upper bound is what is intended.
    self.assertGreaterEqual(3, len(first_features))
    self.assertEqual(str, type(first_features[0]))
    self.assertEqual("n", first.label)

    expected_features = {"split_tok=.", "left_tok=D", "right_tok=Forrester"}
    self.assertSetEqual(expected_features, set(first_features))

    for remaining in dev_stream:
        self.assertIn(extractor.extract_features(remaining).label,
                      valid_labels)