Exemple #1
0
    def test_decode_errors(self):
        """Checks that malformed inputs make `decode` raise DecodeError.

        Covers a non-string input and rows whose number of fields differs
        from the two configured column names (extra, missing, empty row).
        """
        feature_spec = {
            'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
            'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
        }
        schema = dataset_schema.from_feature_spec(feature_spec)
        coder = csv_coder.CsvCoder(column_names=['a', 'b'], schema=schema)

        # Bad csv: the input is an int rather than a string.
        bad_csv_msg = '\'int\' object has no attribute \'encode\': 123'
        with self.assertRaisesRegexp(csv_coder.DecodeError, bad_csv_msg):
            coder.decode(123)

        # An extra column, a missing column and an empty row are all
        # expected to report the same header mismatch.
        mismatch_msg = 'Columns do not match specified csv headers'
        for bad_row in ('1,2,', 'a_value', ''):
            with self.assertRaisesRegexp(csv_coder.DecodeError, mismatch_msg):
                coder.decode(bad_row)
Exemple #2
0
    def test_missing_data(self):
        """Expects a ValueError naming "text1" when the second field is empty."""
        row = '12,,female,1,89.0,12.0'
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        expected_msg = 'expected a value on column "text1"'
        with self.assertRaisesRegexp(ValueError, expected_msg):
            coder.decode(row)
Exemple #3
0
    def test_bad_boolean_data(self):
        """Expects a ValueError asking for "True"/"False" for this row."""
        row = '12,text value,categorical_value,1,89.0,12.0,0'
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        expected_msg = 'expected "True" or "False" as inputs'
        with self.assertRaisesRegexp(ValueError, expected_msg):
            coder.decode(row)
Exemple #4
0
    def test_csv_coder_with_unicode(self):
        """Checks a csv row containing non-ASCII text via `_assert_encode_decode`.

        The same row is verified twice: once against plain Python values and
        once against numpy values.
        """
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
        csv_row = u'12,"this is a ,text",Hello κόσμε,1,89.0,12.0,20'
        # The category is expected as UTF-8 encoded bytes.
        category_bytes = u'Hello κόσμε'.encode('utf-8')

        # Expectation expressed with Python types.
        python_expected = {
            'category1': [category_bytes],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': 'this is a ,text',
            'y': ([1], [12.0])
        }
        self._assert_encode_decode(coder, csv_row, python_expected)

        # Expectation expressed with numpy types.
        numpy_expected = {
            'category1': np.array([category_bytes]),
            'numeric1': np.array(12),
            'numeric2': np.array([89.0]),
            'numeric3': np.array([20]),
            'text1': np.array(['this is a ,text']),
            'y': (np.array(1), np.array([12.0]))
        }
        self._assert_encode_decode(coder, csv_row, numpy_expected)
Exemple #5
0
    def test_csv_coder(self):
        """Checks a regular csv row via `_assert_encode_decode`.

        The same row is verified twice: once against plain Python values and
        once against numpy values.
        """
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
        csv_row = '12,"this is a ,text",categorical_value,1,89.0,12.0,20'

        # Expectation expressed with Python types.
        python_expected = {
            'category1': ['categorical_value'],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': 'this is a ,text',
            'y': ([1], [12.0])
        }
        self._assert_encode_decode(coder, csv_row, python_expected)

        # Expectation expressed with numpy types.
        numpy_expected = {
            'category1': np.array(['categorical_value']),
            'numeric1': np.array(12),
            'numeric2': np.array([89.0]),
            'numeric3': np.array([20]),
            'text1': np.array(['this is a ,text']),
            'y': (np.array(1), np.array([12.0]))
        }
        self._assert_encode_decode(coder, csv_row, numpy_expected)
Exemple #6
0
    def test_missing_data(self):
        """Expects a ValueError naming 'text1' when the second field is empty."""
        row = '12,,categorical_value,1,89.0,12.0,20'
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        expected_msg = 'expected a value on column \'text1\''
        with self.assertRaisesRegexp(ValueError, expected_msg):
            coder.decode(row)
Exemple #7
0
    def test_missing_numeric_data(self):
        """Expects a ValueError naming "numeric1" when the first field is empty."""
        # The leading numeric field is left empty.
        row = ',"this is a ,text",female,1,89.0,12.0'
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        expected_msg = 'expected a value on column "numeric1"'
        with self.assertRaisesRegexp(ValueError, expected_msg):
            coder.decode(row)
Exemple #8
0
 def test_constructor_error(
     self, columns, feature_spec, error_msg, error_type=ValueError,
     **kwargs):
   """Checks that constructing a CsvCoder raises the expected error.

   Args:
     columns: Passed as the first argument to `csv_coder.CsvCoder`.
     feature_spec: Converted to a schema with
       `dataset_schema.from_feature_spec`.
     error_msg: Regular expression the raised error must match.
     error_type: Exception type expected from the constructor.
     **kwargs: Forwarded unchanged to `csv_coder.CsvCoder`.
   """
   coder_schema = dataset_schema.from_feature_spec(feature_spec)
   with self.assertRaisesRegexp(error_type, error_msg):
     csv_coder.CsvCoder(columns, coder_schema, **kwargs)
Exemple #9
0
    def testDecode(self):
        """Decodes every decode case and compares the result with its value.

        Iterates `_ENCODE_DECODE_CASES` followed by `_DECODE_ONLY_CASES`; each
        case describes a single feature registered under the key 'x'.
        """
        all_cases = self._ENCODE_DECODE_CASES + self._DECODE_ONLY_CASES
        for csv_line, value, multivalent, feature_spec in all_cases:
            schema = dataset_schema.from_feature_spec({'x': feature_spec})
            # A sparse feature is read from its index and value columns.
            columns = (
                [feature_spec.index_key, feature_spec.value_key]
                if isinstance(feature_spec, tf.SparseFeature) else 'x')

            # Multivalent cases additionally configure a secondary delimiter.
            extra_args = {}
            if multivalent:
                extra_args = {
                    'secondary_delimiter': '|',
                    'multivalent_columns': columns,
                }
            coder = csv_coder.CsvCoder(columns, schema, **extra_args)

            decoded = coder.decode(csv_line)
            failure_msg = self._msg_for_decode_case(csv_line, feature_spec)
            np.testing.assert_equal(decoded, {'x': value}, failure_msg)
Exemple #10
0
    def testEncode(self):
        """Encodes every case in `_ENCODE_DECODE_CASES` and checks the csv line.

        Each case describes a single feature registered under the key 'x'.
        """
        for case in self._ENCODE_DECODE_CASES:
            csv_line, value, multivalent, feature_spec = case
            schema = dataset_schema.from_feature_spec({'x': feature_spec})
            # A sparse feature is written to its index and value columns.
            columns = (
                [feature_spec.index_key, feature_spec.value_key]
                if isinstance(feature_spec, tf.SparseFeature) else 'x')

            # Multivalent cases additionally configure a secondary delimiter.
            extra_args = {}
            if multivalent:
                extra_args = {
                    'secondary_delimiter': '|',
                    'multivalent_columns': columns,
                }
            coder = csv_coder.CsvCoder(columns, schema, **extra_args)

            encoded = coder.encode({'x': value})
            failure_msg = self._msg_for_encode_case(value, feature_spec)
            self.assertEqual(encoded, csv_line, msg=failure_msg)
Exemple #11
0
 def test_decode_error(
     self, columns, feature_spec, csv_line, error_msg,
     error_type=ValueError, **kwargs):
     """Checks that decoding `csv_line` raises the expected error.

     Args:
       columns: Passed as the first argument to `csv_coder.CsvCoder`.
       feature_spec: Converted to a schema with
         `schema_utils.schema_from_feature_spec`.
       csv_line: Input handed to `decode`.
       error_msg: Regular expression the raised error must match.
       error_type: Exception type expected from `decode`.
       **kwargs: Forwarded unchanged to `csv_coder.CsvCoder`.
     """
     coder_schema = schema_utils.schema_from_feature_spec(feature_spec)
     # The coder is built outside the `with`, so only `decode` may raise.
     decoder = csv_coder.CsvCoder(columns, coder_schema, **kwargs)
     with self.assertRaisesRegexp(error_type, error_msg):
         decoder.decode(csv_line)
Exemple #12
0
 def test_encode_error(
     self, columns, feature_spec, instance, error_msg,
     error_type=ValueError, **kwargs):
   """Checks that encoding `instance` raises the expected error.

   Args:
     columns: Passed as the first argument to `csv_coder.CsvCoder`.
     feature_spec: Converted to a schema with
       `dataset_schema.from_feature_spec`.
     instance: Input handed to `encode`.
     error_msg: Regular expression the raised error must match.
     error_type: Exception type expected from `encode`.
     **kwargs: Forwarded unchanged to `csv_coder.CsvCoder`.
   """
   coder_schema = dataset_schema.from_feature_spec(feature_spec)
   # The coder is built outside the `with`, so only `encode` may raise.
   encoder = csv_coder.CsvCoder(columns, coder_schema, **kwargs)
   with self.assertRaisesRegexp(error_type, error_msg):
     encoder.encode(instance)
Exemple #13
0
 def test_fixed_length_missing_values(self):
     """Checks that empty fixed-length fields decode to their defaults."""
     feature_spec = {
         'b': tf.FixedLenFeature(shape=[], dtype=tf.float32,
                                 default_value=-1),
         'a': tf.FixedLenFeature(shape=[], dtype=tf.string,
                                 default_value=''),
     }
     schema = dataset_schema.from_feature_spec(feature_spec)
     coder = csv_coder.CsvCoder(column_names=['a', 'b'], schema=schema)

     # Each pair is (csv row, expected decoded instance).
     cases = (
         ('a_value,', {'a': 'a_value', 'b': -1.0}),
         (',1.0', {'a': '', 'b': 1.0}),
         (',', {'a': '', 'b': -1.0}),
     )
     for row, expected in cases:
         self.assertEqual(coder.decode(row), expected)
Exemple #14
0
    def testEncodeErrors(self):
        """Checks that every case in `_ENCODE_ERROR_CASES` raises its error.

        Each case describes a single feature registered under the key 'x'.
        """
        for case in self._ENCODE_ERROR_CASES:
            value, error_type, error_msg, multivalent, feature_spec = case
            schema = dataset_schema.from_feature_spec({'x': feature_spec})
            columns = (
                [feature_spec.index_key, feature_spec.value_key]
                if isinstance(feature_spec, tf.SparseFeature) else 'x')

            failure_msg = self._msg_for_encode_case(value, feature_spec)
            with self.assertRaisesRegexp(error_type, error_msg,
                                         msg=failure_msg):
                # The coder is built inside the `with`, so an error raised
                # by the constructor is accepted as well.
                extra_args = {}
                if multivalent:
                    extra_args = {
                        'secondary_delimiter': '|',
                        'multivalent_columns': columns,
                    }
                coder = csv_coder.CsvCoder(columns, schema, **extra_args)

                coder.encode({'x': value})
Exemple #15
0
    def test_data_types(self):
        """Checks a row whose numeric fields are quoted.

        The comparison is delegated to `_assert_encode_not_equal_decode`.
        """
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        # Every number in this row is wrapped in quotes.
        quoted_row = '"12","this is a ,text",female,"1","89.0","12.0"'
        expected = {
            'category1': ['female'],
            'numeric1': 12,
            'numeric2': [89.0],
            'text1': 'this is a ,text',
            'y': ([12.0], [1])
        }
        self._assert_encode_not_equal_decode(coder, quoted_row, expected)
Exemple #16
0
    def test_fixed_length_missing_values_no_default(self):
        """Expects a ValueError for an empty fixed-length field with no default."""
        feature_spec = {
            'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
            'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
        }
        schema = dataset_schema.from_feature_spec(feature_spec)
        coder = csv_coder.CsvCoder(column_names=['a', 'b'], schema=schema)

        # Column "b" is empty in this row and declares no default value.
        expected_msg = 'expected a value on column "b"'
        with self.assertRaisesRegexp(ValueError, expected_msg):
            coder.decode('a_value,')
Exemple #17
0
    def test_bad_row(self):
        """Expects rows with too many or too few columns to be rejected."""
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
        mismatch_msg = 'Columns do not match specified csv headers'

        bad_rows = (
            # The data has more columns than expected.
            '12,"this is a ,text",female,1,89.0,12.0,"oh no, I\'m an error"',
            # The data has fewer columns than expected.
            '12,"this is a ,text",female"',
        )
        for row in bad_rows:
            with self.assertRaisesRegexp(Exception, mismatch_msg):
                coder.decode(row)
Exemple #18
0
    def test_var_length_missing_values(self):
        """Checks that empty variable-length fields decode to empty lists."""
        feature_spec = {
            'b': tf.VarLenFeature(dtype=tf.float32),
            'a': tf.VarLenFeature(dtype=tf.string),
        }
        schema = dataset_schema.from_feature_spec(feature_spec)
        coder = csv_coder.CsvCoder(column_names=['a', 'b'], schema=schema)

        # Each pair is (csv row, expected decoded instance).
        cases = (
            ('a_value,', {'a': ['a_value'], 'b': []}),
            (',0', {'a': [], 'b': [0.0]}),
            (',1.0', {'a': [], 'b': [1.0]}),
            (',', {'a': [], 'b': []}),
        )
        for row, expected in cases:
            self.assertEqual(coder.decode(row), expected)
Exemple #19
0
    def test_tsv_coder(self):
        """Checks a tab-delimited row via `_assert_encode_decode`."""
        coder = csv_coder.CsvCoder(
            self._COLUMNS, self._INPUT_SCHEMA, delimiter='\t')

        # The quoted text field itself contains a tab character.
        tsv_row = '12\t"this is a \ttext"\tfemale\t1\t89.0\t12.0'
        expected = {
            'category1': ['female'],
            'numeric1': 12,
            'numeric2': [89.0],
            'text1': 'this is a \ttext',
            'y': ([12.0], [1])
        }
        self._assert_encode_decode(coder, tsv_row, expected)
Exemple #20
0
    def testDecodeErrors(self):
        """Checks that every case in `_DECODE_ERROR_CASES` raises its error.

        Each case describes a single feature registered under the key 'x'.
        """
        for case in self._DECODE_ERROR_CASES:
            csv_line, error_type, error_msg, multivalent, feature_spec = case
            schema = dataset_schema.from_feature_spec({'x': feature_spec})
            columns = (
                [feature_spec.index_key, feature_spec.value_key]
                if isinstance(feature_spec, tf.SparseFeature) else 'x')

            failure_msg = self._msg_for_decode_case(csv_line, feature_spec)
            with self.assertRaisesRegexp(error_type, error_msg,
                                         msg=failure_msg):
                # Errors from the coder constructor and from the decode
                # method are deliberately not distinguished, so both calls
                # stay inside the `with`.
                extra_args = {}
                if multivalent:
                    extra_args = {
                        'secondary_delimiter': '|',
                        'multivalent_columns': columns,
                    }
                coder = csv_coder.CsvCoder(columns, schema, **extra_args)
                coder.decode(csv_line)
Exemple #21
0
 def test_picklable(self):
   """Checks that a coder still decodes and encodes after pickling."""
   schema = dataset_schema.from_feature_spec(_FEATURE_SPEC)
   coder = csv_coder.CsvCoder(_COLUMNS, schema)

   csv_line = '12,"this is a ,text",categorical_value,1,89.0,12.0,20'
   expected_encoded = csv_line.encode('utf-8')
   instance = {
       'category1': [b'categorical_value'],
       'numeric1': 12,
       'numeric2': [89.0],
       'numeric3': [20],
       'text1': b'this is a ,text',
       'y': ([1], [12.0])
   }

   # Two rounds: the second one pickles a coder that has already been used,
   # to ensure encoding/decoding doesn't break pickling.
   for _ in range(2):
     coder = pickle.loads(pickle.dumps(coder))
     self.assertEqual(coder.decode(csv_line), instance)
     self.assertEqual(coder.encode(instance), expected_encoded)
Exemple #22
0
    def test_sparse_feature_incorrect_values(self):
        """Expects a ValueError for sparse indices outside the feature size."""
        feature_spec = {
            'a': tf.SparseFeature('idx', 'value', tf.float32, 10),
        }
        schema = dataset_schema.from_feature_spec(feature_spec)
        coder = csv_coder.CsvCoder(column_names=['idx', 'value'],
                                   schema=schema)

        # Each pair is (csv row, expected error message).
        cases = (
            # Index negative.
            ('-1,12.0', 'has index -1 out of range'),
            # Index equal to size.
            ('10,12.0', 'has index 10 out of range'),
            # Index greater than size.
            ('11,12.0', 'has index 11 out of range'),
        )
        for row, expected_msg in cases:
            with self.assertRaisesRegexp(ValueError, expected_msg):
                coder.decode(row)
Exemple #23
0
 def test_valency(self):
     """Checks a row with '|'-separated values in the multivalent columns."""
     # Start from the shared feature spec and make 'numeric1' hold two values.
     spec = self._INPUT_SCHEMA.as_feature_spec().copy()
     spec['numeric1'] = tf.FixedLenFeature(shape=[2], dtype=tf.int32)
     schema = dataset_schema.from_feature_spec(spec)

     coder = csv_coder.CsvCoder(
         self._COLUMNS,
         schema,
         delimiter=',',
         secondary_delimiter='|',
         multivalent_columns=['numeric1', 'numeric2', 'y'])

     row = '11|12,"this is a ,text",female|male,1|3,89.0|91.0,12.0|15.0'
     # 'category1' is not listed as multivalent, so its value is expected to
     # stay a single string.
     expected = {
         'category1': ['female|male'],
         'numeric1': [11, 12],
         'numeric2': [89.0, 91.0],
         'text1': 'this is a ,text',
         'y': ([12.0, 15.0], [1, 3])
     }
     self._assert_encode_decode(coder, row, expected)
Exemple #24
0
    def test_picklable(self):
        """Checks that a coder works after pickling, before and after use."""
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        row = '12,"this is a ,text",female,1,89.0,12.0'
        expected = {
            'category1': ['female'],
            'numeric1': 12,
            'numeric2': [89.0],
            'text1': 'this is a ,text',
            'y': ([12.0], [1])
        }

        # First round pickles the fresh coder; the second round pickles it
        # again after it has been used.
        for _ in range(2):
            coder = pickle.loads(pickle.dumps(coder))
            self._assert_encode_decode(coder, row, expected)
Exemple #25
0
    def test_sparse_feature_missing_values(self):
        """Checks empty index/value fields of a sparse feature."""
        feature_spec = {
            'a': tf.SparseFeature('idx', 'value', tf.float32, 10),
        }
        schema = dataset_schema.from_feature_spec(feature_spec)
        coder = csv_coder.CsvCoder(column_names=['idx', 'value'],
                                   schema=schema)

        # Missing both value and index (which is allowed).
        self.assertEqual(coder.decode(','), {'a': ([], [])})

        # Each pair is (csv row, expected error message).
        rejected = (
            # Missing index only (not allowed).
            (',12.0', 'expected an index in column "idx"'),
            # Missing value only (not allowed).
            ('1,', 'expected a value in column "value"'),
        )
        for row, expected_msg in rejected:
            with self.assertRaisesRegexp(ValueError, expected_msg):
                coder.decode(row)
Exemple #26
0
 def test_all_values_present(self):
     """Decodes a row in which every column carries a value."""
     feature_spec = {
         'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
         'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
         'c': tf.VarLenFeature(dtype=tf.string),
         'y': tf.SparseFeature('d', 'e', tf.float32, 10),
     }
     schema = dataset_schema.from_feature_spec(feature_spec)
     coder = csv_coder.CsvCoder(column_names=['a', 'b', 'c', 'd', 'e'],
                                schema=schema)

     # Column 'c' is specified as a string so the value is not cast.
     expected = {
         'a': 'a_value',
         'b': 1.0,
         'c': ['0'],
         'y': ([12.0], [1])
     }
     self.assertEqual(coder.decode('a_value,1.0,0,1,12.0'), expected)
Exemple #27
0
    def test_decode_errors(self):
        """Checks the errors raised by `decode` for malformed inputs.

        Covers a non-numerical value in the float column, a non-string input,
        and rows whose number of fields differs from the configured columns.
        """
        feature_spec = {
            'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
            'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
        }
        schema = dataset_schema.from_feature_spec(feature_spec)
        coder = csv_coder.CsvCoder(column_names=['a', 'b'], schema=schema)

        # Non-numerical value in the float column.
        not_a_float_msg = 'could not convert string to float: b_value'
        with self.assertRaisesRegexp(ValueError, not_a_float_msg):
            coder.decode('a_value, b_value')

        # Bad csv: the input is an int rather than a string.
        bad_csv_msg = 'string or Unicode object, int found'
        with self.assertRaisesRegexp(csv_coder.DecodeError, bad_csv_msg):
            coder.decode(123)

        # An extra column, a missing column and an empty row are all
        # expected to report the same header mismatch.
        mismatch_msg = 'Columns do not match specified csv headers'
        for bad_row in ('1,2,', 'a_value', ''):
            with self.assertRaisesRegexp(csv_coder.DecodeError, mismatch_msg):
                coder.decode(bad_row)
Exemple #28
0
 def test_column_not_found(self):
     """Expects a ValueError when the coder is built with no column names."""
     no_columns = []
     expected_msg = 'Column not found: '
     with self.assertRaisesRegexp(ValueError, expected_msg):
         csv_coder.CsvCoder(no_columns, self._INPUT_SCHEMA)
Exemple #29
0
 def test_encode(self, columns, feature_spec, csv_line, instance, **kwargs):
   """Checks that `instance` encodes to the UTF-8 bytes of `csv_line`.

   Args:
     columns: Passed as the first argument to `csv_coder.CsvCoder`.
     feature_spec: Converted to a schema with
       `dataset_schema.from_feature_spec`.
     csv_line: Text whose UTF-8 encoding is the expected output.
     instance: Input handed to `encode`.
     **kwargs: Forwarded unchanged to `csv_coder.CsvCoder`.
   """
   coder_schema = dataset_schema.from_feature_spec(feature_spec)
   encoder = csv_coder.CsvCoder(columns, coder_schema, **kwargs)
   encoded = encoder.encode(instance)
   self.assertEqual(encoded, csv_line.encode('utf-8'))
Exemple #30
0
 def test_decode(self, columns, feature_spec, csv_line, instance, **kwargs):
   """Checks that `csv_line` decodes to `instance`.

   Args:
     columns: Passed as the first argument to `csv_coder.CsvCoder`.
     feature_spec: Converted to a schema with
       `dataset_schema.from_feature_spec`.
     csv_line: Input handed to `decode`.
     instance: Expected decoded result, compared with
       `np.testing.assert_equal`.
     **kwargs: Forwarded unchanged to `csv_coder.CsvCoder`.
   """
   coder_schema = dataset_schema.from_feature_spec(feature_spec)
   decoder = csv_coder.CsvCoder(columns, coder_schema, **kwargs)
   decoded = decoder.decode(csv_line)
   np.testing.assert_equal(decoded, instance)