Ejemplo n.º 1
0
    def test_data_labeler_change_labels(self):
        """Fit a trainable unstructured labeler with a reduced label set.

        Builds a 100-row frame from two annotated sentences, calls ``fit``
        with three labels, and checks the shape and element types of the
        value that ``fit`` returns.
        """
        sentence_sample = [
            'this is my test sentence.',
            {'entities': [(5, 7, 'ADDRESS'), (11, 20, 'INTEGER_BIG'),
                          (20, 22, 'ADDRESS'), (22, 24, 'INTEGER_BIG')]},
        ]
        greeting_sample = [
            'How nice.',
            {'entities': [(0, 2, 'ADDRESS'), (4, 5, 'INTEGER_BIG'),
                          (6, 8, 'INTEGER_BIG')]},
        ]
        # column 0 holds the text, column 1 the entity annotations;
        # the two samples are repeated 50 times each
        training_frame = pd.DataFrame([sentence_sample, greeting_sample] * 50)

        # trainable unstructured labeler constructed with default settings
        labeler = dp.DataLabeler(labeler_type='unstructured', trainable=True)

        # train the labeler while restricting it to three labels
        reduced_labels = ['BACKGROUND', 'INTEGER_BIG', 'ADDRESS']
        fit_output = labeler.fit(
            x=training_frame[0],
            y=training_frame[1],
            labels=reduced_labels)

        # a single result entry made of three parts is expected
        self.assertEqual(1, len(fit_output))
        only_run = fit_output[0]
        self.assertEqual(3, len(only_run))
        for position, expected_type in enumerate((dict, float, dict)):
            self.assertIsInstance(only_run[position], expected_type)

        # the last part excludes background and pad but includes the
        # micro, macro and weighted entries
        self.assertEqual(5, len(only_run[2].keys()))
Ejemplo n.º 2
0
    def test_help(self, mock_stdout, *mock):
        """Check that ``help()`` emits the expected section headings.

        The text is read back from ``mock_stdout`` (presumably a patched
        stdout supplied by a decorator outside this block).
        """
        labeler = dp.DataLabeler(labeler_type='structured')
        labeler.help()

        printed_text = mock_stdout.getvalue()
        expected_headings = (
            "Process Input Format",
            "Process Output Format:",
            "Parameters",
        )
        for heading in expected_headings:
            self.assertIn(heading, printed_text)
Ejemplo n.º 3
0
    def test_equality(self, mock_open, mock_load_model):
        """Exercise equality and inequality comparisons between labelers."""
        self._setup_mock_load_model(mock_load_model)

        structured_a = dp.DataLabeler(labeler_type='structured')
        structured_b = dp.DataLabeler(labeler_type='structured')
        unstructured_a = dp.DataLabeler(labeler_type='unstructured')
        unstructured_b = dp.DataLabeler(labeler_type='unstructured')

        # labelers of the same type compare equal
        for left, right in ((structured_a, structured_b),
                            (unstructured_a, unstructured_b)):
            self.assertEqual(left, right)

        # labelers of different types do not
        self.assertNotEqual(structured_a, unstructured_a)

        # changing the labels on one labeler breaks equality with its twin
        structured_a.set_labels(['BACKGROUND', 'b', 'c'])
        self.assertNotEqual(structured_a, structured_b)
Ejemplo n.º 4
0
    def test_multi_labelers(self, *mocks):
        """
        Run several labelers one after another.

        The test makes no assertions; it passes when none of the
        consecutive ``predict`` calls raises.
        :return:
        """
        numeric_data = dp.Data(
            data=pd.DataFrame([12, 2, 3, 4, 5]).astype(str),
            data_type='parquet')
        text_data = dp.Data(
            data=pd.DataFrame(['atest', 'b', 'c']),
            data_type='csv')

        # several labels intentionally share an index with another label
        shared_index_encoding = {
            'PAD': 0,
            'CITY': 1,  # SAME AS BACKGROUND
            'BACKGROUND': 1,
            'ADDRESS': 2,
            'BAN': 3,
            'CREDIT_CARD': 4,
            'EMAIL_ADDRESS': 5,
            'UUID': 6,
            'HASH_OR_KEY': 7,
            'IPV4': 8,
            'IPV6': 9,
            'MAC_ADDRESS': 10,
            'NAME': 11,  # SAME AS PERSON
            'PERSON': 11,
            'PHONE_NUMBER': 12,
            'SSN': 13,
            'URL': 14,
            'DATETIME': 15,
            'INTEGER_BIG': 16,  # SAME AS INTEGER
            'INTEGER': 16,
            'FLOAT': 17,
            'QUANTITY': 18,
            'ORDINAL': 19
        }

        # first structured labeler
        first_structured = dp.DataLabeler(labeler_type='structured')
        first_structured.predict(numeric_data)

        # unstructured labeler with its encoding overridden before predicting
        unstructured = dp.DataLabeler(labeler_type='unstructured')
        unstructured._label_encoding = shared_index_encoding
        unstructured.predict(numeric_data)

        # second structured labeler, created after the other two have run
        second_structured = dp.DataLabeler(labeler_type='structured')
        second_structured.predict(text_data)
Ejemplo n.º 5
0
    def test_load_data_labeler(self, *mocks):
        """Check ``dp.DataLabeler`` construction for valid and invalid types."""
        missing_type_pattern = (r'__new__\(\) missing 1 required positional'
                                r' argument: \'labeler_type\'')
        unknown_type_pattern = (r'No DataLabeler class types matched the '
                                r'input, `fake_labeler`. Allowed types '
                                r'\[\'structured\', \'unstructured\'\].')

        # omitting labeler_type altogether raises a TypeError
        with self.assertRaisesRegex(TypeError, missing_type_pattern):
            dp.DataLabeler()

        # an unrecognised labeler_type raises a ValueError
        with self.assertRaisesRegex(ValueError, unknown_type_pattern):
            dp.DataLabeler(labeler_type='fake_labeler')

        # both supported types yield a BaseDataLabeler instance
        structured = dp.DataLabeler(labeler_type='structured')
        self.assertIsInstance(structured, BaseDataLabeler)

        unstructured = dp.DataLabeler(labeler_type='unstructured')
        self.assertIsInstance(unstructured, BaseDataLabeler)
Ejemplo n.º 6
0
    def test_load_labeler(self, mock_open, mock_load_model):
        """Load the default structured labeler and check its processors."""

        # TODO: Mock exist functions

        self._setup_mock_load_model(mock_load_model)

        # load default
        labeler = dp.DataLabeler(labeler_type='structured')

        # NOTE(review): this compares two fixtures defined outside this
        # method and does not inspect the labeler loaded above -- confirm
        # whether the labeler's own label mapping was meant to be checked.
        expected_mapping = struct_data_labeler_parameters['label_mapping']
        self.assertDictEqual(label_encoding['encoder'], expected_mapping)

        # the loaded labeler exposes pre- and post-processors of the base types
        self.assertIsInstance(
            labeler.preprocessor, data_processing.BaseDataPreprocessor)
        self.assertIsInstance(
            labeler.postprocessor, data_processing.BaseDataPostprocessor)
Ejemplo n.º 7
0
    def test_set_labels(self, mock_open, mock_load_model):
        """Check the mapping ``set_labels`` forwards to the model.

        The labeler's model is replaced by a mock so the keyword argument
        passed to ``set_label_mapping`` can be inspected for both values of
        ``requires_zero_mapping``.
        """
        # setup
        self._setup_mock_load_model(mock_load_model)
        data_labeler = dp.DataLabeler(labeler_type='structured')
        model_stub = mock.Mock()
        data_labeler._model = model_stub
        model_stub.set_label_mapping.return_value = None

        # (requires_zero_mapping, mapping expected for labels 'a' and 'b'):
        # indices start at 1 when the flag is set and at 0 otherwise
        scenarios = (
            (True, {'a': 1, 'b': 2}),
            (False, {'a': 0, 'b': 1}),
        )
        for needs_zero_mapping, expected_mapping in scenarios:
            model_stub.requires_zero_mapping = needs_zero_mapping
            data_labeler.set_labels(['a', 'b'])
            model_stub.set_label_mapping.assert_called_with(
                label_mapping=expected_mapping)
Ejemplo n.º 8
0
    def test_fit_with_default_model(self):
        """Fit the default trainable unstructured labeler twice.

        The first ``fit`` call uses default arguments; the second passes
        ``validation_split=0``. The shape of each returned value is checked,
        and the model's ``_epoch_id`` is expected to be 2 afterwards.
        """
        sentence_sample = [
            'this is my test sentence.',
            {'entities': [(5, 7, 'ADDRESS'), (11, 20, 'INTEGER_BIG'),
                          (20, 22, 'ADDRESS'), (22, 24, 'INTEGER_BIG')]},
        ]
        greeting_sample = [
            'How nice.',
            {'entities': [(0, 2, 'ADDRESS'), (4, 5, 'INTEGER_BIG'),
                          (6, 8, 'INTEGER_BIG')]},
        ]
        # column 0 holds the text, column 1 the entity annotations;
        # the two samples are repeated 50 times each
        training_frame = pd.DataFrame([sentence_sample, greeting_sample] * 50)

        # trainable unstructured labeler constructed with default settings
        labeler = dp.DataLabeler(labeler_type='unstructured', trainable=True)

        # first run: fit with default arguments
        fit_output = labeler.fit(x=training_frame[0], y=training_frame[1])
        self.assertEqual(1, len(fit_output))
        validated_run = fit_output[0]
        self.assertEqual(3, len(validated_run))
        for position, expected_type in enumerate((dict, float, dict)):
            self.assertIsInstance(validated_run[position], expected_type)

        # the last part excludes background and pad but includes the
        # micro, macro and weighted entries
        self.assertEqual(21, len(validated_run[2].keys()))

        # second run: fit again with validation switched off
        fit_output = labeler.fit(x=training_frame[0],
                                 y=training_frame[1],
                                 validation_split=0)
        self.assertEqual(1, len(fit_output))
        unvalidated_run = fit_output[0]
        self.assertEqual(3, len(unvalidated_run))
        self.assertIsInstance(unvalidated_run[0], dict)
        self.assertIsNone(unvalidated_run[1])  # no f1 since no validation
        self.assertListEqual(unvalidated_run[2], [])

        # validate epoch id after the two fit calls
        self.assertEqual(2, labeler.model._epoch_id)