def test_get_flattened_array_parent_indices(self):
        """Checks GetFlattenedArrayParentIndices on an empty and a non-empty list array.

        The results are compared against int32 arrays.
        """
        # An empty list array is expected to give an empty index array.
        empty_lists = pa.array([], type=pa.list_(pa.int32()))
        actual = arrow_util.GetFlattenedArrayParentIndices(empty_lists)
        expected = pa.array([], type=pa.int32())
        self.assertTrue(actual.equals(expected))

        # The list at position 2 is empty, so 2 is absent from the expectation.
        lists_with_empty_row = pa.array([[1.], [2.], [], [3.]])
        actual = arrow_util.GetFlattenedArrayParentIndices(lists_with_empty_row)
        expected = pa.array([0, 1, 3], type=pa.int32())
        self.assertTrue(actual.equals(expected))
 def test_list_lengths(self):
   """Checks ListLengthsFromListArray for empty, populated and null-bearing inputs.

   Every expectation is an int32 array of per-list lengths.
   """
   # No lists at all -> no lengths.
   no_lists = pa.array([], type=pa.list_(pa.int64()))
   actual = arrow_util.ListLengthsFromListArray(no_lists)
   expected = pa.array([], type=pa.int32())
   self.assertTrue(actual.equals(expected))

   # An empty list is expected to report length 0.
   with_empty_list = pa.array([[1., 2.], [], [3.]])
   actual = arrow_util.ListLengthsFromListArray(with_empty_list)
   expected = pa.array([2, 0, 1], type=pa.int32())
   self.assertTrue(actual.equals(expected))

   # A null list is expected to report length 0 as well.
   with_null_list = pa.array([[1., 2.], None, [3.]])
   actual = arrow_util.ListLengthsFromListArray(with_null_list)
   expected = pa.array([2, 0, 1], type=pa.int32())
   self.assertTrue(actual.equals(expected))
  def test_get_array_null_bitmap_as_byte_array(self):
    """Checks GetArrayNullBitmapAsByteArray on several int arrays.

    Covers an empty array, an array mixing values and nulls, an array with no
    nulls and an all-null array. Each expectation is a uint8 array holding 1
    at null positions and 0 elsewhere.
    """
    # Empty input -> empty mask.
    array = pa.array([], type=pa.int32())
    null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(null_masks.equals(pa.array([], type=pa.uint8())))

    # Mixed values and nulls.
    array = pa.array([1, 2, None, 3, None], type=pa.int32())
    null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(
        null_masks.equals(pa.array([0, 0, 1, 0, 1], type=pa.uint8())))

    # No nulls at all.
    array = pa.array([1, 2, 3])
    null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(null_masks.equals(pa.array([0, 0, 0], type=pa.uint8())))

    # Only nulls.
    array = pa.array([None, None, None], type=pa.int32())
    null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(null_masks.equals(pa.array([1, 1, 1], type=pa.uint8())))
    # Demonstrate that the returned array can be converted to a numpy boolean
    # array w/o copying.
    # np.bool_ is used instead of the `np.bool` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; np.bool_ is available in every release.
    np.testing.assert_equal(
        np.array([True, True, True]), null_masks.to_numpy().view(np.bool_))
Beispiel #4
0
          {
              "int64_feature": np.array([4], dtype=np.int64),
              "int32_feature": np.array([4], dtype=np.int32),
              "float_feature": np.array([2., 3., 4.], dtype=np.float32),
              "double_feature": np.array([2., 3., 4.], dtype=np.float64),
              "bytes_feature": np.array([b"ghi"], dtype=np.object),
              "unicode_feature": np.array([u"ghi"], dtype=np.object),
          },
      ],
      expected_output={
          "int64_feature":
          pa.array([[1, 2, 3], [4]], type=pa.list_(pa.int64())),
          "uint64_feature":
          pa.array([[1, 2, 3], None], type=pa.list_(pa.uint64())),
          "int32_feature":
          pa.array([[1, 2, 3], [4]], type=pa.list_(pa.int32())),
          "uint32_feature":
          pa.array([[1, 2, 3], None], type=pa.list_(pa.uint32())),
          "float_feature":
          pa.array([[1.], [2., 3., 4.]], type=pa.list_(pa.float32())),
          "double_feature":
          pa.array([[1.], [2., 3., 4.]], type=pa.list_(pa.float64())),
          "bytes_feature":
          pa.array([[b"abc", b"def"], [b"ghi"]],
                   type=pa.list_(pa.binary())),
          "unicode_feature":
          pa.array([[b"abc", b"def"], [b"ghi"]],
                   type=pa.list_(pa.string())),
      }),
 dict(testcase_name="mixed_unicode_and_bytes",
      input_examples=[
Beispiel #5
0
    def test_stats_pipeline_with_examples_with_no_values(self):
        """Runs GenerateStatistics over tables whose feature lists are all empty.

        Three identical single-row tables are fed to the pipeline. Columns
        'a' (float32), 'b' (binary) and 'c' (int32) each hold one empty list,
        while 'w' holds [2] and is configured as the weight feature. The
        pipeline output is compared against the expected statistics proto.
        """

        def _single_row_table():
            # One row: empty lists for the three features, and [2] for 'w'.
            columns = [
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ]
            return pa.Table.from_arrays(columns, ['a', 'b', 'c', 'w'])

        tables = [_single_row_table() for _ in range(3)]

        expected_text = """
      datasets{
        num_examples: 3
        features {
          path {
            step: 'a'
          }
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'b'
          }
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'c'
          }
          type: INT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
      }
    """
        expected_result = text_format.Parse(
            expected_text, statistics_pb2.DatasetFeatureStatisticsList())

        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                weight_feature='w',
                num_top_values=1,
                num_rank_histogram_buckets=1,
                num_values_histogram_buckets=2,
                num_histogram_buckets=1,
                num_quantiles_histogram_buckets=1,
                epsilon=0.001)
            examples = p | beam.Create(tables)
            result = examples | stats_api.GenerateStatistics(options)
            matcher = test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result)
            util.assert_that(result, matcher)