Example #1
    def test_csv_decoder_with_schema(self):
        input_lines = ['1,1,2.0,hello', '5,5,12.34,world']
        column_names = [
            'int_feature_parsed_as_float', 'int_feature', 'float_feature',
            'str_feature'
        ]
        schema = text_format.Parse(
            """
        feature { name: "int_feature_parsed_as_float" type: FLOAT }
        feature { name: "int_feature" type: INT }
        feature { name: "float_feature" type: FLOAT }
        feature { name: "str_feature" type: BYTES }
        """, schema_pb2.Schema())
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1], [5]], pa.list_(pa.float32())),
                pa.array([[1], [5]], pa.list_(pa.int64())),
                pa.array([[2.0], [12.34]], pa.list_(pa.float32())),
                pa.array([[b'hello'], [b'world']], pa.list_(pa.binary())),
            ], [
                'int_feature_parsed_as_float', 'int_feature', 'float_feature',
                'str_feature'
            ])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names,
                                              schema=schema,
                                              infer_type_from_schema=True))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #2
    def test_batch_examples(self):
        examples = [{
            'a': np.array([1.0, 2.0], dtype=np.float32),
            'b': np.array(['a', 'b', 'c', 'e'])
        }, {
            'a': np.array([3.0, 4.0, 5.0], dtype=np.float32),
        }, {
            'b': np.array(['d', 'e', 'f']),
            'd': np.array([10, 20, 30], dtype=np.int64),
        }, {
            'b': np.array(['a', 'b', 'c'])
        }, {
            'c': np.array(['d', 'e', 'f'])
        }]
        expected_tables = [
            pa.Table.from_arrays([
                pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]],
                         type=pa.list_(pa.float32())),
                pa.array([['a', 'b', 'c', 'e'], None])
            ], ['a', 'b']),
            pa.Table.from_arrays([
                pa.array([['d', 'e', 'f'], ['a', 'b', 'c']]),
                pa.array([[10, 20, 30], None], type=pa.list_(pa.int64()))
            ], ['b', 'd']),
            pa.Table.from_arrays([pa.array([['d', 'e', 'f']])], ['c']),
        ]

        with beam.Pipeline() as p:
            result = (
                p
                | beam.Create(examples)
                | batch_util.BatchExamplesToArrowTables(desired_batch_size=2))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
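A minimal sketch of the batching contract this test exercises (our own illustration, not library code): within a batch, a feature absent from an example surfaces as a null entry in that feature's column.

    import pyarrow as pa

    # Two examples; 'b' is missing from the second one.
    batch = [{'a': [1.0, 2.0], 'b': ['a']}, {'a': [3.0]}]
    columns = {name: [ex.get(name) for ex in batch] for name in ('a', 'b')}
    table = pa.Table.from_arrays(
        [pa.array(columns['a']), pa.array(columns['b'])], ['a', 'b'])
    # table.column('b') holds [['a'], None], mirroring the expectations above.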
Example #3
    def test_csv_decoder_with_int_and_float_in_same_column(self):
        input_lines = ['2,1.5', '1.5,2']
        column_names = ['float_feature1', 'float_feature2']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[2.0], [1.5]], pa.list_(pa.float32())),
                pa.array([[1.5], [2.0]], pa.list_(pa.float32())),
            ], ['float_feature1', 'float_feature2'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #4
    def test_csv_decoder_with_float_and_string_in_same_column(self):
        input_lines = ['2.3,abc', 'abc,2.3']
        column_names = ['str_feature1', 'str_feature2']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[b'2.3'], [b'abc']], pa.list_(pa.binary())),
                pa.array([[b'abc'], [b'2.3']], pa.list_(pa.binary())),
            ], ['str_feature1', 'str_feature2'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #5
    def test_csv_decoder_with_tab_delimiter(self):
        input_lines = ['1\t"this is a \ttext"', '5\t']
        column_names = ['int_feature', 'str_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1], [5]], pa.list_(pa.int64())),
                pa.array([[b'this is a \ttext'], None], pa.list_(pa.binary())),
            ], ['int_feature', 'str_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines) | csv_decoder.DecodeCSV(
                column_names=column_names, delimiter='\t'))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #6
    def test_csv_decoder_skip_blank_line(self):
        input_lines = ['', '1,2']
        column_names = ['int_feature1', 'int_feature2']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1]], pa.list_(pa.int64())),
                pa.array([[2]], pa.list_(pa.int64())),
            ], ['int_feature1', 'int_feature2'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #7
    def test_csv_decoder_consider_blank_line(self):
        input_lines = ['', '1,2.0']
        column_names = ['int_feature', 'float_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([None, [1]], pa.list_(pa.int64())),
                pa.array([None, [2.0]], pa.list_(pa.float32())),
            ], ['int_feature', 'float_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines) | csv_decoder.DecodeCSV(
                column_names=column_names, skip_blank_lines=False))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #8
    def test_csv_decoder_empty_row(self):
        input_lines = [',,', '1,2.0,hello']
        column_names = ['int_feature', 'float_feature', 'str_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([None, [1.0]], pa.list_(pa.float32())),
                pa.array([None, [2.0]], pa.list_(pa.float32())),
                pa.array([None, [b'hello']], pa.list_(pa.binary())),
            ], ['int_feature', 'float_feature', 'str_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #9
    def test_get_flattened_array_parent_indices(self):
        indices = arrow_util.GetFlattenedArrayParentIndices(
            pa.array([], type=pa.list_(pa.int32())))
        self.assertTrue(indices.equals(pa.array([], type=pa.int32())))

        indices = arrow_util.GetFlattenedArrayParentIndices(
            pa.array([[1.], [2.], [], [3.]]))
        self.assertTrue(indices.equals(pa.array([0, 1, 3], type=pa.int32())))
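What "parent indices" means here follows directly from the list array's offsets; the snippet below is our own illustration, not the library implementation (it assumes a pyarrow recent enough to expose ListArray.offsets):

    import numpy as np
    import pyarrow as pa

    arr = pa.array([[1.], [2.], [], [3.]])
    lengths = np.diff(arr.offsets.to_numpy())      # [1, 1, 0, 1]
    # Repeat each row index once per value it holds; empty rows contribute nothing.
    parent_indices = np.repeat(np.arange(len(arr)), lengths)
    print(parent_indices)                          # [0 1 3]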
Example #10
    def test_csv_decoder(self):
        input_lines = ['1,2.0,hello', '5,12.34,world']
        column_names = ['int_feature', 'float_feature', 'str_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1], [5]], pa.list_(pa.int64())),
                pa.array([[2.0], [12.34]], pa.list_(pa.float32())),
                pa.array([[b'hello'], [b'world']], pa.list_(pa.binary())),
            ], ['int_feature', 'float_feature', 'str_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines, reshuffle=False)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #11
    def test_csv_decoder_with_space_delimiter(self):
        input_lines = ['1 "ab,cd,ef"', '5 "wx,xy,yz"']
        column_names = ['int_feature', 'str_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1], [5]], pa.list_(pa.int64())),
                pa.array([[b'ab,cd,ef'], [b'wx,xy,yz']], pa.list_(
                    pa.binary())),
            ], ['int_feature', 'str_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines) | csv_decoder.DecodeCSV(
                column_names=column_names, delimiter=' '))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #12
  def test_flatten_list_array(self):
    flattened = arrow_util.FlattenListArray(
        pa.array([], type=pa.list_(pa.int64())))
    self.assertTrue(flattened.equals(pa.array([], type=pa.int64())))

    flattened = arrow_util.FlattenListArray(
        pa.array([[1.], [2.], [], [3.]]))
    self.assertTrue(flattened.equals(pa.array([1., 2., 3.])))
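For reference, pyarrow also exposes an equivalent operation directly on list arrays (assuming a version new enough to have ListArray.flatten), so the same check can be written without the helper:

    import pyarrow as pa

    flattened = pa.array([[1.], [2.], [], [3.]]).flatten()
    assert flattened.equals(pa.array([1., 2., 3.]))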
Example #13
    def test_csv_decoder_missing_values(self):
        input_lines = ['1,,hello', ',12.34,']
        column_names = ['int_feature', 'float_feature', 'str_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1], None], pa.list_(pa.int64())),
                pa.array([None, [12.34]], pa.list_(pa.float32())),
                pa.array([[b'hello'], None], pa.list_(pa.binary())),
            ], ['int_feature', 'float_feature', 'str_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #14
 def test_basic_stats_generator_invalid_value_numpy_dtype(self):
     batches = [
         pa.Table.from_arrays([pa.array([[]], type=pa.list_(pa.date32()))],
                              ['a'])
     ]
     generator = basic_stats_generator.BasicStatsGenerator()
     with self.assertRaisesRegexp(TypeError,
                                  'Feature a has unsupported arrow type'):
         self.assertCombinerOutputEqual(batches, generator, None)
Example #15
 def _process_column_infos(self, column_infos: List[csv_decoder.ColumnInfo]):
   column_handlers = []
   column_arrow_types = []
   for c in column_infos:
     if c.type == statistics_pb2.FeatureNameStatistics.INT:
       column_handlers.append(lambda v: (int(v),))
       column_arrow_types.append(pa.list_(pa.int64()))
     elif c.type == statistics_pb2.FeatureNameStatistics.FLOAT:
       column_handlers.append(lambda v: (float(v),))
       column_arrow_types.append(pa.list_(pa.float32()))
     elif c.type == statistics_pb2.FeatureNameStatistics.STRING:
       column_handlers.append(lambda v: (v,))
       column_arrow_types.append(pa.list_(pa.binary()))
     else:
       column_handlers.append(lambda _: None)
       column_arrow_types.append(pa.null())
   self._column_handlers = column_handlers
   self._column_arrow_types = column_arrow_types
   self._column_names = [c.name for c in column_infos]
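A quick sketch of how one handler/type pair plays out (the variable names here are ours): the INT handler wraps each CSV cell in a one-element tuple, which maps directly onto the list_(int64()) Arrow type recorded alongside it.

    import pyarrow as pa

    handler = lambda v: (int(v),)      # the INT branch above
    cells = ['1', '5']                 # one CSV column, two rows
    arr = pa.array([handler(v) for v in cells], type=pa.list_(pa.int64()))
    print(arr)                         # [[1], [5]]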
Example #16
  def test_list_lengths(self):
   list_lengths = arrow_util.ListLengthsFromListArray(
       pa.array([], type=pa.list_(pa.int64())))
   self.assertTrue(list_lengths.equals(pa.array([], type=pa.int32())))
   list_lengths = arrow_util.ListLengthsFromListArray(
       pa.array([[1., 2.], [], [3.]]))
   self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int32())))
   list_lengths = arrow_util.ListLengthsFromListArray(
       pa.array([[1., 2.], None, [3.]]))
   self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int32())))
Example #17
 def test_topk_uniques_combiner_zero_row(self):
     batches = [
         pa.Table.from_arrays([pa.array([], type=pa.list_(pa.binary()))],
                              ['f1'])
     ]
     expected_result = {}
     generator = (top_k_uniques_combiner_stats_generator.
                  TopKUniquesCombinerStatsGenerator(
                      num_top_values=4, num_rank_histogram_buckets=3))
     self.assertCombinerOutputEqual(batches, generator, expected_result)
Example #18
def _to_topk_tuples(sliced_table, categorical_features, weight_feature=None):
    """Generates tuples for computing top-k and uniques from input tables."""
    slice_key, table = sliced_table
    weight_column = table.column(weight_feature) if weight_feature else None
    weight_array = weight_column.data.chunk(0) if weight_column else []
    if weight_array:
        flattened_weights = arrow_util.FlattenListArray(
            weight_array).to_numpy()

    for feature_column in table.columns:
        feature_name = feature_column.name
        # Skip the weight feature.
        if feature_name == weight_feature:
            continue
        feature_path = types.FeaturePath([feature_name])
        # If it's neither a categorical feature nor a string feature, we don't
        # bother with top-k stats.
        if not (feature_path in categorical_features
                or feature_column.type.equals(pa.list_(pa.binary()))
                or feature_column.type.equals(pa.list_(pa.string()))):
            continue
        value_array = feature_column.data.chunk(0)
        flattened_values = arrow_util.FlattenListArray(value_array)

        if weight_array and flattened_values:
            if (pa.types.is_binary(flattened_values.type)
                    or pa.types.is_string(flattened_values.type)):
                # No zero-copy conversion exists for binary/string values.
                flattened_values_np = flattened_values.to_pandas()
            else:
                flattened_values_np = flattened_values.to_numpy()
            indices = arrow_util.GetFlattenedArrayParentIndices(value_array)
            weights_ndarray = flattened_weights[indices.to_numpy()]
            for value, count, weight in _weighted_unique(
                    flattened_values_np, weights_ndarray):
                yield (slice_key, feature_path.steps(), value), (count, weight)
        else:
            value_counts = arrow_util.ValueCounts(flattened_values)
            values = value_counts.field('values').to_pylist()
            counts = value_counts.field('counts').to_pylist()
            for value, count in six.moves.zip(values, counts):
                yield ((slice_key, feature_path.steps(), value), count)
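The _weighted_unique helper called above is not part of this snippet; a plausible sketch of its contract, inferred only from the call site (it must yield (value, count, total_weight) triples), might look like:

    import numpy as np

    def weighted_unique_sketch(values, weights):
        # Sort so equal values are adjacent, then reduce per group.
        order = np.argsort(values)
        values, weights = values[order], weights[order]
        uniques, starts, counts = np.unique(
            values, return_index=True, return_counts=True)
        weight_sums = np.add.reduceat(weights, starts)
        return zip(uniques, counts, weight_sums)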
Example #19
 def test_basic_stats_generator_no_value_in_batch(self):
     batches = [
         pa.Table.from_arrays(
             [pa.array([[], [], []], type=pa.list_(pa.int64()))], ['a'])
     ]
     expected_result = {
         types.FeaturePath(['a']):
         text_format.Parse(
             """
         path {
           step: 'a'
         }
         num_stats {
           common_stats {
             num_non_missing: 3
             num_values_histogram {
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               type: QUANTILES
             }
           }
         }""", statistics_pb2.FeatureNameStatistics())
     }
     generator = basic_stats_generator.BasicStatsGenerator()
     self.assertCombinerOutputEqual(batches, generator, expected_result)
Example #20
 def test_basic_stats_generator_only_nan(self):
     b1 = pa.Table.from_arrays(
         [pa.array([[np.NaN]], type=pa.list_(pa.float32()))], ['a'])
     batches = [b1]
     expected_result = {
         types.FeaturePath(['a']):
         text_format.Parse(
             """
         path {
           step: 'a'
         }
         type: FLOAT
         num_stats {
           common_stats {
             num_non_missing: 1
             min_num_values: 1
             max_num_values: 1
             avg_num_values: 1.0
             tot_num_values: 1
             num_values_histogram {
               buckets {
                 low_value: 1.0
                 high_value: 1.0
                 sample_count: 0.5
               }
               buckets {
                 low_value: 1.0
                 high_value: 1.0
                 sample_count: 0.5
               }
               type: QUANTILES
             }
           }
           histograms {
             num_nan: 1
             type: STANDARD
           }
           histograms {
             num_nan: 1
             type: QUANTILES
           }
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     generator = basic_stats_generator.BasicStatsGenerator(
         num_values_histogram_buckets=2,
         num_histogram_buckets=3,
         num_quantiles_histogram_buckets=4)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
Example #21
    def test_csv_decoder_negative_values(self):
        input_lines = ['-34', '45']
        column_names = ['feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[-34], [45]], pa.list_(pa.int64())),
            ], ['feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #22
    def test_csv_decoder_with_unicode(self):
        input_lines = [u'1,שקרכלשהו,22.34,text field']
        column_names = [
            'int_feature', 'unicode_feature', 'float_feature', 'str_feature'
        ]
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1]], pa.list_(pa.int64())),
                pa.array([[22.34]], pa.list_(pa.float32())),
                pa.array([[u'שקרכלשהו'.encode('utf-8')]], pa.list_(
                    pa.binary())),
                pa.array([[b'text field']], pa.list_(pa.binary())),
            ], [
                'int_feature', 'float_feature', 'unicode_feature',
                'str_feature'
            ])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #23
    def test_csv_decoder_int64_max(self):
        input_lines = ['34', str(sys.maxsize)]
        column_names = ['feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[34], [sys.maxsize]], pa.list_(pa.int64())),
            ], ['feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #24
    def test_csv_decoder_large_int_categorical_neg(self):
        input_lines = ['34', str(-(sys.maxsize + 2))]
        column_names = ['feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[b'34'], [str(-(sys.maxsize + 2)).encode('utf-8')]],
                         pa.list_(pa.binary())),
            ], ['feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #25
 def test_basic_stats_generator_empty_batch(self):
     batches = [
         pa.Table.from_arrays([pa.array([], type=pa.list_(pa.binary()))],
                              ['a'])
     ]
     expected_result = {
         types.FeaturePath(['a']):
         text_format.Parse(
             """
         path {
           step: 'a'
         }
         type: STRING
         string_stats {
           common_stats {
             num_non_missing: 0
             tot_num_values: 0
           }
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     generator = basic_stats_generator.BasicStatsGenerator()
     self.assertCombinerOutputEqual(batches, generator, expected_result)
        testcase_name="num_parents_too_small",
        num_parents=1,
        parent_indices=np.array([1], dtype=np.int64),
        values=pa.array([1]),
        expected_error=RuntimeError,
        expected_error_regexp="Found a parent index 1 while num_parents was 1")
]

_MAKE_LIST_ARRAY_TEST_CASES = [
    dict(testcase_name="parents_are_all_empty",
         num_parents=5,
         parent_indices=np.array([], dtype=np.int64),
         values=pa.array([], type=pa.int64()),
         expected=pa.array([None, None, None, None, None],
                           type=pa.list_(pa.int64()))),
    dict(testcase_name="long_num_parent",
         num_parents=(long(1) if six.PY2 else 1),
         parent_indices=np.array([0], dtype=np.int64),
         values=pa.array([1]),
         expected=pa.array([[1]])),
    dict(
        testcase_name="leading nones",
        num_parents=3,
        parent_indices=np.array([2], dtype=np.int64),
        values=pa.array([1]),
        expected=pa.array([None, None, [1]]),
    ),
    dict(testcase_name="same_parent_and_holes",
         num_parents=4,
         parent_indices=np.array([0, 0, 0, 3, 3], dtype=np.int64),
Example #27
              value { float_list { value: [ 4.0 ] } }
            }
            feature {
              key: "float_feature_2"
              value { float_list { value: [ 5.0, 6.0 ] } }
            }
            feature {
              key: "str_feature_1"
              value { bytes_list { value: [ 'female' ] } }
            }
            feature {
              key: "str_feature_2"
              value { bytes_list { value: [ 'string', 'list' ] } }
            }
          }
        ''',
        'decoded_table':
        pa.Table.from_arrays([
            pa.array([[0]], pa.list_(pa.int64())),
            pa.array([[1, 2, 3]], pa.list_(pa.int64())),
            pa.array([[4.0]], pa.list_(pa.float32())),
            pa.array([[5.0, 6.0]], pa.list_(pa.float32())),
            pa.array([[b'female']], pa.list_(pa.binary())),
            pa.array([[b'string', b'list']], pa.list_(pa.binary()))
        ], [
            'int_feature_1', 'int_feature_2', 'float_feature_1',
            'float_feature_2', 'str_feature_1', 'str_feature_2'
        ])
    },
]
Example #28
 def test_topk_uniques_combiner_with_numeric_feature(self):
     # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
     batches = [
         pa.Table.from_arrays([
             pa.array([['a', 'b', 'c', 'e'], None, ['a', 'c', 'd']]),
             pa.array([[1.0, 2.0, 3.0], [4.0, 5.0], None]),
         ], ['fa', 'fb']),
         pa.Table.from_arrays([
             pa.array([['a', 'a', 'b', 'c', 'd']]),
             pa.array([None], type=pa.list_(pa.float32())),
         ], ['fa', 'fb']),
     ]
     expected_result = {
         types.FeaturePath(['fa']):
         text_format.Parse(
             """
             path {
               step: 'fa'
             }
             type: STRING
             string_stats {
               unique: 5
               top_values {
                 value: 'a'
                 frequency: 4
               }
               top_values {
                 value: 'c'
                 frequency: 3
               }
               top_values {
                 value: 'd'
                 frequency: 2
               }
               top_values {
                 value: 'b'
                 frequency: 2
               }
               rank_histogram {
                 buckets {
                   low_rank: 0
                   high_rank: 0
                   label: "a"
                   sample_count: 4.0
                 }
                 buckets {
                   low_rank: 1
                   high_rank: 1
                   label: "c"
                   sample_count: 3.0
                 }
                 buckets {
                   low_rank: 2
                   high_rank: 2
                   label: "d"
                   sample_count: 2.0
                 }
               }
           }""", statistics_pb2.FeatureNameStatistics())
     }
     generator = (top_k_uniques_combiner_stats_generator.
                  TopKUniquesCombinerStatsGenerator(
                      num_top_values=4, num_rank_histogram_buckets=3))
     self.assertCombinerOutputEqual(batches, generator, expected_result)
Example #29
 def test_topk_uniques_combiner_with_single_bytes_feature(self):
     # 'fa': 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
     batches = [
         pa.Table.from_arrays([
             pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']],
                      type=pa.list_(pa.binary()))
         ], ['fa']),
         pa.Table.from_arrays(
             [pa.array([['a', 'b', 'c', 'd']], type=pa.list_(pa.binary()))],
             ['fa'])
     ]
     # Note that if two feature values have the same frequency, the one with the
     # lexicographically larger feature value will be higher in the order.
     expected_result = {
         types.FeaturePath(['fa']):
         text_format.Parse(
             """
     path {
       step: 'fa'
     }
     type: STRING
     string_stats {
       unique: 5
       top_values {
         value: 'a'
         frequency: 4
       }
       top_values {
         value: 'c'
         frequency: 3
       }
       top_values {
         value: 'd'
         frequency: 2
       }
       top_values {
         value: 'b'
         frequency: 2
       }
       rank_histogram {
         buckets {
           low_rank: 0
           high_rank: 0
           label: "a"
           sample_count: 4.0
         }
         buckets {
           low_rank: 1
           high_rank: 1
           label: "c"
           sample_count: 3.0
         }
         buckets {
           low_rank: 2
           high_rank: 2
           label: "d"
           sample_count: 2.0
         }
       }
   }""", statistics_pb2.FeatureNameStatistics())
     }
     generator = (top_k_uniques_combiner_stats_generator.
                  TopKUniquesCombinerStatsGenerator(
                      num_top_values=4, num_rank_histogram_buckets=3))
     self.assertCombinerOutputEqual(batches, generator, expected_result)
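The tie-breaking rule described in the comment above (equal frequencies rank the lexicographically larger value higher) amounts to a descending sort on (frequency, value); a self-contained sketch:

    counts = {'a': 4, 'b': 2, 'c': 3, 'd': 2}
    ranked = sorted(counts.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    print(ranked)  # [('a', 4), ('c', 3), ('d', 2), ('b', 2)]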
Example #30
    def test_batch_serialized_examples(self):
        examples = [
            """
        features {
          feature {
            key: "a"
            value { float_list { value: [ 1.0, 2.0 ] } }
          }
          feature {
            key: "b"
            value { bytes_list { value: [ 'a', 'b', 'c', 'e' ] } }
          }
        }""",
            """
        features {
          feature {
            key: "a"
            value { float_list { value: [ 3.0, 4.0, 5.0 ] } }
          }
        }""",
            """
        features {
          feature {
            key: "b"
            value { bytes_list { value: [ 'd', 'e', 'f' ] } }
          }
          feature {
            key: "d"
            value { int64_list { value: [ 10, 20, 30 ] } }
          }
        }""",
            """
        features {
          feature {
            key: "b"
            value { bytes_list { value: [ 'a', 'b', 'c' ] } }
          }
        }""",
            """
        features {
          feature {
            key: "c"
            value { bytes_list { value: [ 'd', 'e', 'f' ] } }
          }
        }""",
        ]
        serialized_examples = [
            text_format.Merge(example_pbtxt,
                              tf.train.Example()).SerializeToString()
            for example_pbtxt in examples
        ]
        expected_tables = [
            pa.Table.from_arrays([
                pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]],
                         type=pa.list_(pa.float32())),
                pa.array([['a', 'b', 'c', 'e'], None],
                         type=pa.list_(pa.binary()))
            ], ['a', 'b']),
            pa.Table.from_arrays([
                pa.array([['d', 'e', 'f'], ['a', 'b', 'c']],
                         type=pa.list_(pa.binary())),
                pa.array([[10, 20, 30], None], type=pa.list_(pa.int64()))
            ], ['b', 'd']),
            pa.Table.from_arrays(
                [pa.array([['d', 'e', 'f']], type=pa.list_(pa.binary()))],
                ['c']),
        ]

        with beam.Pipeline() as p:
            result = (p
                      | beam.Create(serialized_examples)
                      | batch_util.BatchSerializedExamplesToArrowTables(
                          desired_batch_size=2))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))