def test_default_confidences(self):
        """Verify char-level confidence scores are returned and shaped
        as (len(text), num_labels) for every input sample."""
        samples = [
            "Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234.\t"
            "His\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n",
            "Hi my name is joe, \t SSN: 123456789 r@nd0m numb3rz!\n"
        ]

        # default unstructured labeler under test
        labeler = dp.UnstructuredDataLabeler()

        # request per-character confidences alongside predictions
        output = labeler.predict(
            samples, predict_options=dict(show_confidences=True))
        confidences = output["conf"]

        # sanity: confidences exist, and each matrix has one row per
        # character and one column per label
        num_labels = max(labeler.label_mapping.values()) + 1
        self.assertIsNotNone(confidences)
        for idx, text in enumerate(samples):
            self.assertEqual((len(text), num_labels),
                             confidences[idx].shape)
    def test_check_pipeline_overlap_mismatch(self):
        """check_pipeline must raise when components disagree on
        `default_label` and error_on_mismatch is requested."""
        labeler = dp.UnstructuredDataLabeler()

        # introduce a preprocessor/model disagreement
        labeler.set_params({
            'preprocessor': {'default_label': 'a'},
            'model': {'default_label': 'b'},
        })
        expected_msg = ('Model and preprocessor value for '
                        '`default_label` do not match. b != a')
        with self.assertRaisesRegex(RuntimeError, expected_msg):
            labeler.check_pipeline(skip_postprocessor=True,
                                   error_on_mismatch=True)

        # align model with preprocessor; make postprocessor the odd one out
        labeler.set_params({
            'model': {'default_label': 'a'},
            'postprocessor': {'default_label': 'b'},
        })
        expected_msg = ('Model and postprocessor value for '
                        '`default_label` do not match. '
                        'a != b')
        with self.assertRaisesRegex(RuntimeError, expected_msg):
            labeler.check_pipeline(skip_postprocessor=False,
                                   error_on_mismatch=True)
    def test_default_edge_cases(self):
        """Default model should handle digits-only, symbols-only, and
        whitespace-only inputs without failing."""
        edge_samples = [
            "1234567890", "!@#$%&^*$)*#%)#*%-=+~.,/?{}[]|`", "\n \n \n \t \t"
        ]

        # default unstructured labeler under test
        labeler = dp.UnstructuredDataLabeler()

        # predictions should come back non-None even for odd inputs
        predictions = labeler.predict(edge_samples)["pred"]
        self.assertIsNotNone(predictions)
    def test_default_special_cases(self):
        """
        Empty strings should yield empty prediction/confidence arrays,
        and a mix of empty and non-empty samples should still predict
        correctly with the default labeler.
        """
        labeler = dp.UnstructuredDataLabeler()

        # case 1: every sample is the empty string
        empty_samples = ["", "", ""]
        results = labeler.predict(
            empty_samples, predict_options=dict(show_confidences=True))

        # both predictions and confidences should be empty arrays
        empty = np.array([])
        for key in ('pred', 'conf'):
            for actual in results[key]:
                self.assertTrue((empty == actual).all())

        # case 2: mix of empty and non-empty samples
        mixed_samples = ["", "abc", "\t", ""]
        expected_preds = [
            np.array([]),
            np.array([1.0, 1.0, 1.0]),
            np.array([1.0]),
            np.array([]),
        ]
        mixed_results = labeler.predict(mixed_samples)
        for want, got in zip(expected_preds, mixed_results['pred']):
            self.assertTrue((want == got).all())
    def test_default_tf_model(self):
        """Smoke-test predict() on the default TF model."""
        samples = [
            "Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234.\t"
            "His\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n",
            "Hi my name is joe, \t SSN: 123456789 r@nd0m numb3rz!\n"
        ]

        # default unstructured labeler under test
        labeler = dp.UnstructuredDataLabeler()

        # char-level predictions from the default model
        predictions = labeler.predict(samples)["pred"]
        # keep the debug print so output is visible when run verbosely
        print(predictions)

        # minimal check for now: predictions exist
        self.assertIsNotNone(predictions)
    def test_set_pipeline_params(self):
        """set_params should push values into each pipeline component."""
        def is_param_subset(subset_dict, full_dict):
            # subset semantics: merging the subset changes nothing
            return dict(full_dict, **subset_dict) == full_dict

        labeler = dp.UnstructuredDataLabeler()

        # preconditions: the values we are about to set are not preset
        self.assertNotEqual(
            'a', labeler.preprocessor._parameters['default_label'])
        self.assertNotEqual(
            'b', labeler.model._parameters['default_label'])
        self.assertNotEqual(
            'c', labeler.postprocessor._parameters['pad_label'])

        # push one parameter into each component
        labeler.set_params({
            'preprocessor': {'default_label': 'a'},
            'model': {'default_label': 'b'},
            'postprocessor': {'pad_label': 'c'},
        })

        # each component should now reflect its new parameter
        self.assertTrue(is_param_subset(
            {'default_label': 'a'}, labeler.preprocessor._parameters))
        self.assertTrue(is_param_subset(
            {'default_label': 'b'}, labeler.model._parameters))
        self.assertTrue(is_param_subset(
            {'pad_label': 'c'}, labeler.postprocessor._parameters))
    # NOTE(review): "Exemple #7" / "0" below were scraping artifacts
    # (example-site pagination text); converted to a comment so the
    # module stays syntactically valid.
    def test_set_pipeline_params(self):
        """set_params should update each pipeline component's parameters.

        NOTE(review): this duplicates an earlier test_set_pipeline_params
        in this class and shadows it at class-creation time — likely a
        copy/scrape artifact; confirm and deduplicate.
        """
        def is_param_subset(subset_dict, full_dict):
            # subset semantics: merging the subset changes nothing
            return dict(full_dict, **subset_dict) == full_dict

        labeler = dp.UnstructuredDataLabeler()

        # preconditions: the values we are about to set are not preset
        self.assertNotEqual(
            "a", labeler.preprocessor._parameters["default_label"])
        self.assertNotEqual(
            "b", labeler.model._parameters["default_label"])
        self.assertNotEqual(
            "c", labeler.postprocessor._parameters["pad_label"])

        # push one parameter into each component
        labeler.set_params({
            "preprocessor": {"default_label": "a"},
            "model": {"default_label": "b"},
            "postprocessor": {"pad_label": "c"},
        })

        # each component should now reflect its new parameter
        self.assertTrue(is_param_subset(
            {"default_label": "a"}, labeler.preprocessor._parameters))
        self.assertTrue(is_param_subset(
            {"default_label": "b"}, labeler.model._parameters))
        self.assertTrue(is_param_subset(
            {"pad_label": "c"}, labeler.postprocessor._parameters))
    # NOTE(review): "Exemple #8" / "0" below were scraping artifacts
    # (example-site pagination text); converted to a comment so the
    # module stays syntactically valid.
    def test_check_pipeline_overlap_mismatch(self):
        """check_pipeline must raise when components disagree on
        `default_label` and error_on_mismatch is requested.

        NOTE(review): this duplicates an earlier
        test_check_pipeline_overlap_mismatch in this class and shadows it
        at class-creation time — likely a copy/scrape artifact; confirm
        and deduplicate.
        """
        labeler = dp.UnstructuredDataLabeler()

        # introduce a preprocessor/model disagreement
        labeler.set_params({
            "preprocessor": {"default_label": "a"},
            "model": {"default_label": "b"},
        })
        expected_msg = ("Model and preprocessor value for "
                        "`default_label` do not match. b != a")
        with self.assertRaisesRegex(RuntimeError, expected_msg):
            labeler.check_pipeline(skip_postprocessor=True,
                                   error_on_mismatch=True)

        # align model with preprocessor; make postprocessor the odd one out
        labeler.set_params({
            "model": {"default_label": "a"},
            "postprocessor": {"default_label": "b"},
        })
        expected_msg = ("Model and postprocessor value for "
                        "`default_label` do not match. "
                        "a != b")
        with self.assertRaisesRegex(RuntimeError, expected_msg):
            labeler.check_pipeline(skip_postprocessor=False,
                                   error_on_mismatch=True)