Exemple #1
0
    def test_csv_decoder_with_schema(self):
        """Checks DecodeCSV output when a schema is supplied.

        Two four-column rows are decoded with `infer_type_from_schema=True`.
        The first column contains integer text but is declared FLOAT in the
        schema, and the expected table holds it as float32.
        """
        csv_rows = ['1,1,2.0,hello', '5,5,12.34,world']
        column_names = [
            'int_feature_parsed_as_float',
            'int_feature',
            'float_feature',
            'str_feature',
        ]
        schema = schema_pb2.Schema()
        text_format.Parse(
            """
        feature { name: "int_feature_parsed_as_float" type: FLOAT }
        feature { name: "int_feature" type: INT }
        feature { name: "float_feature" type: FLOAT }
        feature { name: "str_feature" type: BYTES }
        """, schema)

        # One expected column per entry of column_names, in the same order.
        expected_columns = [
            pa.array([[1], [5]], type=pa.list_(pa.float32())),
            pa.array([[1], [5]], type=pa.list_(pa.int64())),
            pa.array([[2.0], [12.34]], type=pa.list_(pa.float32())),
            pa.array([[b'hello'], [b'world']], type=pa.list_(pa.binary())),
        ]
        expected_tables = [
            pa.Table.from_arrays(expected_columns, list(column_names))
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names,
                schema=schema,
                infer_type_from_schema=True)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #2
0
    def test_batch_examples(self):
        """Batches five example dicts with a desired batch size of 2.

        Three Arrow tables are expected: one for the first two examples, one
        for the next two, and one for the last example on its own. Where a
        feature is missing from one example of a pair, the expected column
        holds a null for that row.
        """
        examples = [
            {
                'a': np.array([1.0, 2.0], dtype=np.float32),
                'b': np.array(['a', 'b', 'c', 'e']),
            },
            {
                'a': np.array([3.0, 4.0, 5.0], dtype=np.float32),
            },
            {
                'b': np.array(['d', 'e', 'f']),
                'd': np.array([10, 20, 30], dtype=np.int64),
            },
            {
                'b': np.array(['a', 'b', 'c']),
            },
            {
                'c': np.array(['d', 'e', 'f']),
            },
        ]

        float_list = pa.list_(pa.float32())
        int_list = pa.list_(pa.int64())

        # Examples 0 and 1: only the first one carries feature 'b'.
        first_batch = pa.Table.from_arrays(
            [
                pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]], type=float_list),
                pa.array([['a', 'b', 'c', 'e'], None]),
            ],
            ['a', 'b'])
        # Examples 2 and 3: only the first one carries feature 'd'.
        second_batch = pa.Table.from_arrays(
            [
                pa.array([['d', 'e', 'f'], ['a', 'b', 'c']]),
                pa.array([[10, 20, 30], None], type=int_list),
            ],
            ['b', 'd'])
        # Example 4 on its own.
        last_batch = pa.Table.from_arrays(
            [pa.array([['d', 'e', 'f']])], ['c'])
        expected_tables = [first_batch, second_batch, last_batch]

        with beam.Pipeline() as pipeline:
            example_pcoll = pipeline | beam.Create(examples)
            batched = example_pcoll | batch_util.BatchExamplesToArrowTables(
                desired_batch_size=2)
            util.assert_that(
                batched,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #3
0
    def test_csv_decoder_empty_csv(self):
        """Runs DecodeCSV over no input lines and expects no tables."""
        no_lines = []
        no_tables = []

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(no_lines)
            decoded = lines | csv_decoder.DecodeCSV(column_names=[])
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, no_tables))
Exemple #4
0
    def test_csv_decoder_invalid_row(self):
        """Expects a ValueError when a row does not match the column names.

        The second input line has two fields while three column names are
        supplied. Running the pipeline must raise a ValueError whose message
        matches 'Columns do not match specified csv headers'.
        """
        input_lines = ['1,2.0,hello', '5,12.34']
        column_names = ['int_feature', 'float_feature', 'str_feature']

        # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
        # which was removed from unittest in Python 3.12.
        with self.assertRaisesRegex(
                ValueError, '.*Columns do not match specified csv headers.*'):
            with beam.Pipeline() as p:
                result = (p | beam.Create(input_lines)
                          | csv_decoder.DecodeCSV(column_names=column_names))
                # The expected value is None because the pipeline is expected
                # to fail before any comparison takes place.
                util.assert_that(
                    result, test_util.make_arrow_tables_equal_fn(self, None))
 def test_decode_example_with_beam_pipeline(self, example_proto_text,
                                            decoded_table):
     """Decodes one serialized tf.train.Example and compares the table.

     Args:
       example_proto_text: Text-format tf.train.Example to parse and
         serialize as the pipeline input.
       decoded_table: Arrow table the decoder output is compared against.
     """
     serialized_example = text_format.Merge(
         example_proto_text, tf.train.Example()).SerializeToString()
     with beam.Pipeline() as pipeline:
         serialized = pipeline | beam.Create([serialized_example])
         decoded = serialized | tf_example_decoder.DecodeTFExample()
         util.assert_that(
             decoded,
             test_util.make_arrow_tables_equal_fn(self, [decoded_table]))
Exemple #6
0
    def test_csv_decoder_int64_max(self):
        """Decodes a column containing `sys.maxsize` and expects int64."""
        largest = sys.maxsize
        csv_rows = ['34', str(largest)]
        column_names = ['feature']

        feature_column = pa.array([[34], [largest]],
                                  type=pa.list_(pa.int64()))
        expected_tables = [
            pa.Table.from_arrays([feature_column], ['feature'])
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #7
0
    def test_csv_decoder_consider_blank_line_single_column(self):
        """Decodes a blank line with `skip_blank_lines=False`.

        With a single column, the blank first line is expected to appear in
        the output as a null row ahead of the row holding 1.
        """
        csv_rows = ['', '1']
        column_names = ['int_feature']

        int_column = pa.array([None, [1]], type=pa.list_(pa.int64()))
        expected_tables = [
            pa.Table.from_arrays([int_column], ['int_feature'])
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names,
                skip_blank_lines=False)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #8
0
    def test_csv_decoder_negative_values(self):
        """Decodes a column mixing a negative and a positive integer.

        The expected output is a single int64 column.
        """
        csv_rows = ['-34', '45']
        column_names = ['feature']

        feature_column = pa.array([[-34], [45]], type=pa.list_(pa.int64()))
        expected_tables = [
            pa.Table.from_arrays([feature_column], ['feature'])
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #9
0
    def test_csv_decoder_skip_blank_line(self):
        """Decodes a blank line followed by a data row.

        Only the data row is expected in the output table; the blank line is
        expected to contribute no row.
        """
        csv_rows = ['', '1,2']
        column_names = ['int_feature1', 'int_feature2']

        int_list = pa.list_(pa.int64())
        expected_columns = [
            pa.array([[1]], type=int_list),
            pa.array([[2]], type=int_list),
        ]
        expected_tables = [
            pa.Table.from_arrays(expected_columns,
                                 ['int_feature1', 'int_feature2'])
        ]

        with beam.Pipeline() as pipeline:
            # NOTE(review): reshuffle=False is presumably there to keep the
            # input order stable - confirm against beam.Create docs.
            lines = pipeline | beam.Create(csv_rows, reshuffle=False)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #10
0
    def test_csv_decoder_with_int_and_float_in_same_column(self):
        """Decodes columns that each mix an integer and a float value.

        Both columns are expected as float32, with the integer text '2'
        appearing as 2.0.
        """
        csv_rows = ['2,1.5', '1.5,2']
        column_names = ['float_feature1', 'float_feature2']

        float_list = pa.list_(pa.float32())
        expected_columns = [
            pa.array([[2.0], [1.5]], type=float_list),
            pa.array([[1.5], [2.0]], type=float_list),
        ]
        expected_tables = [
            pa.Table.from_arrays(expected_columns,
                                 ['float_feature1', 'float_feature2'])
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #11
0
    def test_csv_decoder_with_float_and_string_in_same_column(self):
        """Decodes columns that each mix a float and a non-numeric string.

        Both columns are expected as bytes, with the float text kept
        verbatim (b'2.3').
        """
        csv_rows = ['2.3,abc', 'abc,2.3']
        column_names = ['str_feature1', 'str_feature2']

        bytes_list = pa.list_(pa.binary())
        expected_columns = [
            pa.array([[b'2.3'], [b'abc']], type=bytes_list),
            pa.array([[b'abc'], [b'2.3']], type=bytes_list),
        ]
        expected_tables = [
            pa.Table.from_arrays(expected_columns,
                                 ['str_feature1', 'str_feature2'])
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #12
0
    def test_csv_decoder_large_int_categorical_neg(self):
        """Decodes a column containing the integer `-(sys.maxsize + 2)`.

        The whole column, including the small value '34', is expected as
        bytes rather than int64.
        """
        very_negative = str(-(sys.maxsize + 2))
        csv_rows = ['34', very_negative]
        column_names = ['feature']

        feature_column = pa.array(
            [[b'34'], [very_negative.encode('utf-8')]],
            type=pa.list_(pa.binary()))
        expected_tables = [
            pa.Table.from_arrays([feature_column], ['feature'])
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #13
0
    def test_csv_decoder_with_tab_delimiter(self):
        """Decodes tab-delimited lines, including a quoted field with a tab.

        The quoted field is expected to keep its embedded tab, and the empty
        trailing field of the second line is expected as a null.
        """
        csv_rows = ['1\t"this is a \ttext"', '5\t']
        column_names = ['int_feature', 'str_feature']

        expected_columns = [
            pa.array([[1], [5]], type=pa.list_(pa.int64())),
            pa.array([[b'this is a \ttext'], None],
                     type=pa.list_(pa.binary())),
        ]
        expected_tables = [
            pa.Table.from_arrays(expected_columns,
                                 ['int_feature', 'str_feature'])
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names,
                delimiter='\t')
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
    def test_csv_decoder_empty_row(self):
        """Decodes a row of empty fields followed by a populated row.

        Every column is expected to hold a null for the empty row. The first
        two columns are expected as float32 and the third as bytes.
        """
        csv_rows = [',,', '1,2.0,hello']
        column_names = ['int_feature', 'float_feature', 'str_feature']

        float_list = pa.list_(pa.float32())
        expected_columns = [
            pa.array([None, [1.0]], type=float_list),
            pa.array([None, [2.0]], type=float_list),
            pa.array([None, [b'hello']], type=pa.list_(pa.binary())),
        ]
        expected_tables = [
            pa.Table.from_arrays(
                expected_columns,
                ['int_feature', 'float_feature', 'str_feature'])
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #15
0
    def test_csv_decoder_missing_values(self):
        """Decodes rows in which some fields are empty.

        Each empty field is expected as a null in its column, while the
        column types (int64, float32, bytes) follow the values present.
        """
        csv_rows = ['1,,hello', ',12.34,']
        column_names = ['int_feature', 'float_feature', 'str_feature']

        expected_columns = [
            pa.array([[1], None], type=pa.list_(pa.int64())),
            pa.array([None, [12.34]], type=pa.list_(pa.float32())),
            pa.array([[b'hello'], None], type=pa.list_(pa.binary())),
        ]
        expected_tables = [
            pa.Table.from_arrays(
                expected_columns,
                ['int_feature', 'float_feature', 'str_feature'])
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #16
0
    def test_csv_decoder(self):
        """Decodes two fully populated rows with int, float and string fields.

        The expected table has an int64, a float32 and a bytes column.
        """
        csv_rows = ['1,2.0,hello', '5,12.34,world']
        column_names = ['int_feature', 'float_feature', 'str_feature']

        expected_columns = [
            pa.array([[1], [5]], type=pa.list_(pa.int64())),
            pa.array([[2.0], [12.34]], type=pa.list_(pa.float32())),
            pa.array([[b'hello'], [b'world']], type=pa.list_(pa.binary())),
        ]
        expected_tables = [
            pa.Table.from_arrays(
                expected_columns,
                ['int_feature', 'float_feature', 'str_feature'])
        ]

        with beam.Pipeline() as pipeline:
            # NOTE(review): reshuffle=False is presumably there to keep the
            # input order stable - confirm against beam.Create docs.
            lines = pipeline | beam.Create(csv_rows, reshuffle=False)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #17
0
    def test_csv_decoder_with_space_delimiter(self):
        """Decodes space-delimited lines whose quoted field contains commas.

        The quoted field is expected to come back whole, commas included.
        """
        csv_rows = ['1 "ab,cd,ef"', '5 "wx,xy,yz"']
        column_names = ['int_feature', 'str_feature']

        expected_columns = [
            pa.array([[1], [5]], type=pa.list_(pa.int64())),
            pa.array([[b'ab,cd,ef'], [b'wx,xy,yz']],
                     type=pa.list_(pa.binary())),
        ]
        expected_tables = [
            pa.Table.from_arrays(expected_columns,
                                 ['int_feature', 'str_feature'])
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names,
                delimiter=' ')
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #18
0
    def test_csv_decoder_with_unicode(self):
        """Decodes a row containing a non-ASCII (Hebrew) text field.

        The unicode field is expected as its UTF-8 encoded bytes.
        """
        csv_rows = [u'1,שקרכלשהו,22.34,text field']
        column_names = [
            'int_feature',
            'unicode_feature',
            'float_feature',
            'str_feature',
        ]

        # NOTE(review): the expected table lists float_feature before
        # unicode_feature, unlike column_names. This presumably relies on the
        # comparison helper matching columns by name - confirm.
        expected_names = [
            'int_feature',
            'float_feature',
            'unicode_feature',
            'str_feature',
        ]
        bytes_list = pa.list_(pa.binary())
        expected_columns = [
            pa.array([[1]], type=pa.list_(pa.int64())),
            pa.array([[22.34]], type=pa.list_(pa.float32())),
            pa.array([[u'שקרכלשהו'.encode('utf-8')]], type=bytes_list),
            pa.array([[b'text field']], type=bytes_list),
        ]
        expected_tables = [
            pa.Table.from_arrays(expected_columns, expected_names)
        ]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(csv_rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=column_names)
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
Exemple #19
0
    def test_batch_serialized_examples(self):
        """Batches five serialized tf.train.Examples with batch size 2.

        Each text-format example is parsed and serialized before entering the
        pipeline. Three Arrow tables are expected: one for the first two
        examples, one for the next two, and one for the last example alone.
        Where a feature is missing from one example of a pair, the expected
        column holds a null for that row.
        """
        example_pbtxts = [
            """
        features {
          feature {
            key: "a"
            value { float_list { value: [ 1.0, 2.0 ] } }
          }
          feature {
            key: "b"
            value { bytes_list { value: [ 'a', 'b', 'c', 'e' ] } }
          }
        }""",
            """
        features {
          feature {
            key: "a"
            value { float_list { value: [ 3.0, 4.0, 5.0 ] } }
          }
        }""",
            """
        features {
          feature {
            key: "b"
            value { bytes_list { value: [ 'd', 'e', 'f' ] } }
          }
          feature {
            key: "d"
            value { int64_list { value: [ 10, 20, 30 ] } }
          }
        }""",
            """
        features {
          feature {
            key: "b"
            value { bytes_list { value: [ 'a', 'b', 'c' ] } }
          }
        }""",
            """
        features {
          feature {
            key: "c"
            value { bytes_list { value: [ 'd', 'e', 'f' ] } }
          }
        }""",
        ]

        # Parse each text proto into a fresh Example and keep its wire bytes.
        serialized_examples = []
        for pbtxt in example_pbtxts:
            parsed = text_format.Merge(pbtxt, tf.train.Example())
            serialized_examples.append(parsed.SerializeToString())

        float_list = pa.list_(pa.float32())
        bytes_list = pa.list_(pa.binary())
        int_list = pa.list_(pa.int64())

        # Examples 0 and 1: only the first one carries feature 'b'.
        first_batch = pa.Table.from_arrays(
            [
                pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]], type=float_list),
                pa.array([['a', 'b', 'c', 'e'], None], type=bytes_list),
            ],
            ['a', 'b'])
        # Examples 2 and 3: only the first one carries feature 'd'.
        second_batch = pa.Table.from_arrays(
            [
                pa.array([['d', 'e', 'f'], ['a', 'b', 'c']],
                         type=bytes_list),
                pa.array([[10, 20, 30], None], type=int_list),
            ],
            ['b', 'd'])
        # Example 4 on its own.
        last_batch = pa.Table.from_arrays(
            [pa.array([['d', 'e', 'f']], type=bytes_list)], ['c'])
        expected_tables = [first_batch, second_batch, last_batch]

        with beam.Pipeline() as pipeline:
            serialized = pipeline | beam.Create(serialized_examples)
            batched = (
                serialized
                | batch_util.BatchSerializedExamplesToArrowTables(
                    desired_batch_size=2))
            util.assert_that(
                batched,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))