def testGetArrayEmptyPath(self):
     """Checks that an empty query path is rejected with a KeyError."""
     expected_message = r"query_path must be non-empty.*"
     with self.assertRaisesRegex(KeyError, expected_message):
         # Inputs are built inside the context manager so that any KeyError
         # they raise is still observed by the assertion.
         single_column_table = pa.Table.from_arrays(
             [pa.array([[1], [2, 3]])], ["v"])
         empty_path = types.FeaturePath([])
         arrow_util.get_array(
             single_column_table,
             query_path=empty_path,
             return_example_indices=False)
 def testGetArrayStepMissing(self):
     """Checks that a path step absent from a struct raises a KeyError."""
     expected_message = r'query_path step "ssf3" not in struct.*'
     with self.assertRaisesRegex(KeyError, expected_message):
         # Built inside the context so the assertion scope is unchanged.
         path_with_unknown_leaf = types.FeaturePath(["f2", "sf2", "ssf3"])
         arrow_util.get_array(
             _INPUT_TABLE,
             query_path=path_with_unknown_leaf,
             broadcast_column_name=None)
 def testGetArrayColumnMissing(self):
     """Checks that a first step naming no table column raises a KeyError."""
     expected_message = r'query_path step 0 "x" not in table.*'
     with self.assertRaisesRegex(KeyError, expected_message):
         # The table only has a column "y", so looking up "x" must fail.
         table_without_x = pa.Table.from_arrays(
             [pa.array([[1], [2]])], ["y"])
         arrow_util.get_array(
             table_without_x,
             query_path=types.FeaturePath(["x"]),
             broadcast_column_name=None)
Esempio n. 4
0
 def testGetArrayStepMissing(self):
   """Checks that a path step absent from a struct raises a KeyError."""
   expected_message = r'query_path step "ssf3" not in struct.*'
   with self.assertRaisesRegex(KeyError, expected_message):
     # Built inside the context so the assertion scope is unchanged.
     path_with_unknown_leaf = types.FeaturePath(["f2", "sf2", "ssf3"])
     arrow_util.get_array(
         _INPUT_RECORD_BATCH,
         query_path=path_with_unknown_leaf,
         return_example_indices=False)
 def testGetArrayColumnMissing(self):
     """Checks that a first step naming no table column raises a KeyError."""
     expected_message = r'query_path step 0 "x" not in table.*'
     with self.assertRaisesRegex(KeyError, expected_message):
         # The table only has a column "y", so looking up "x" must fail.
         table_without_x = pa.Table.from_arrays(
             [pa.array([[1], [2]])], ["y"])
         arrow_util.get_array(
             table_without_x,
             query_path=types.FeaturePath(["x"]),
             return_example_indices=False)
 def testGetArrayEmptyPath(self):
     """Checks that an empty query path is rejected with a KeyError."""
     expected_message = r"query_path must be non-empty.*"
     with self.assertRaisesRegex(KeyError, expected_message):
         # Inputs are built inside the context manager so that any KeyError
         # they raise is still observed by the assertion.
         value_column = pa.array([[1], [2, 3]])
         broadcast_column = pa.array([[1], [2, 2]])
         two_column_table = pa.Table.from_arrays(
             [value_column, broadcast_column], ["v", "w"])
         arrow_util.get_array(
             two_column_table,
             query_path=types.FeaturePath([]),
             broadcast_column_name="w")
Esempio n. 7
0
 def testGetArraySubpathMissing(self):
   """Checks that stepping past a non-struct leaf raises a KeyError."""
   expected_message = (
       r'Cannot process .* "sssf" inside .* list<item: int64>.*')
   with self.assertRaisesRegex(KeyError, expected_message):
     # "sssf" is requested underneath "ssf1", which the expected message
     # reports as a list of int64 rather than a struct.
     path_below_leaf = types.FeaturePath(["f2", "sf2", "ssf1", "sssf"])
     arrow_util.get_array(
         _INPUT_RECORD_BATCH,
         query_path=path_below_leaf,
         return_example_indices=False)
 def testGetArraySubpathMissing(self):
     """Checks that stepping past a non-struct leaf raises a KeyError."""
     expected_message = (
         r'Cannot process .* "sssf" inside .* list<item: list<item: int64>>.*')
     with self.assertRaisesRegex(KeyError, expected_message):
         # "sssf" is requested underneath "ssf1", which the expected message
         # reports as a nested list of int64 rather than a struct.
         path_below_leaf = types.FeaturePath(["f2", "sf2", "ssf1", "sssf"])
         arrow_util.get_array(
             _INPUT_TABLE,
             query_path=path_below_leaf,
             broadcast_column_name=None)
Esempio n. 9
0
 def testGetArrayReturnExampleIndices(self):
   """Checks the leaf array and example indices for a nested struct path."""
   # Row 0 holds two "sf" structs, row 1 holds one.
   nested_column = pa.array([
       [{"sf": [{"ssf": [1]}, {"ssf": [2]}]}],
       [{"sf": [{"ssf": [3, 4]}]}],
   ])
   string_column = pa.array([["one"], ["two"]])
   record_batch = pa.RecordBatch.from_arrays(
       [nested_column, string_column], ["f", "w"])
   feature = types.FeaturePath(["f", "sf", "ssf"])

   got_values, got_indices = arrow_util.get_array(
       record_batch, feature, return_example_indices=True)

   want_values = pa.array([[1], [2], [3, 4]])
   # The first two leaf lists come from row 0, the last one from row 1.
   want_indices = np.array([0, 0, 1])
   values_match = got_values.equals(want_values)
   mismatch_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
       feature, want_values, got_values)
   self.assertTrue(values_match, mismatch_message)
   np.testing.assert_array_equal(want_indices, got_indices)
 def testGetArrayBroadcastString(self):
     """Checks that a string column is broadcast alongside the leaf array."""
     # Row 0 holds two "sf" structs, row 1 holds one.
     nested_column = pa.array([
         [{"sf": [{"ssf": [[1]]}, {"ssf": [[2]]}]}],
         [{"sf": [{"ssf": [[3], [4]]}]}],
     ])
     string_column = pa.array([["one"], ["two"]])
     table = pa.Table.from_arrays(
         [nested_column, string_column], ["f", "w"])
     feature = types.FeaturePath(["f", "sf", "ssf"])

     got_values, got_weights = arrow_util.get_array(
         table, feature, broadcast_column_name="w")

     want_values = pa.array([[[1]], [[2]], [[3], [4]]])
     # The "w" value of each row is repeated once per leaf list of that row.
     want_weights = np.array(["one", "one", "two"])
     values_match = got_values.equals(want_values)
     mismatch_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
         feature, want_values, got_values)
     self.assertTrue(values_match, mismatch_message)
     np.testing.assert_array_equal(want_weights, got_weights)
Esempio n. 11
0
  def list_lengths(self, path: types.FeaturePath) -> np.ndarray:
    """Computes, and memoizes, the list length of each row for a path.

    Results are stored in `self._cache`, so a repeated call with the same path
    returns the array computed the first time. When the path cannot be resolved
    in the wrapped table (the lookup raises KeyError), or when it resolves to a
    null-typed array, the result is all zeros with one entry per table row.

    Args:
      path: The path for which to return list lengths.

    Returns:
      An ndarray with the length of each list in the referenced array, or a
      zero-filled ndarray of length `self._table.num_rows` when the path is
      missing or null-typed.

    Raises:
      ValueError: When the referenced array is neither a ListArray nor null.
    """
    cache_key = ('list_lengths({})', path)
    if cache_key in self._cache:
      return self._cache[cache_key]
    try:
      values, _ = arrow_util.get_array(
          self._table, path, broadcast_column_name=None)
      value_type = values.type
      if pa.types.is_list(value_type):
        result = np.asarray(array_util.ListLengthsFromListArray(values))
      elif pa.types.is_null(value_type):
        result = np.full(self._table.num_rows, 0)
      else:
        raise ValueError('Can only compute list lengths on list arrays, found '
                         '{}'.format(value_type))
    except KeyError:
      # The path is not present in the table: report every row as empty.
      result = np.full(self._table.num_rows, 0)
    self._cache[cache_key] = result
    return result
 def testGetArrayNoBroadcast(self, feature, expected):
     """Checks the leaf array when no broadcast column is requested.

     Args:
       feature: The path to look up in the shared input table.
       expected: A pair whose first element is the expected leaf array.
     """
     got_values, got_weights = arrow_util.get_array(
         _INPUT_TABLE, feature, broadcast_column_name=None)
     want_values, _ = expected
     values_match = got_values.equals(want_values)
     mismatch_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
         feature, want_values, got_values)
     self.assertTrue(values_match, mismatch_message)
     # Without a broadcast column there is nothing to return as weights.
     self.assertIsNone(got_weights)
 def testGetArrayNoBroadcast(self, feature, expected):
     """Checks the leaf array when example indices are not requested.

     Args:
       feature: The path to look up in the shared input table.
       expected: A pair whose first element is the expected leaf array.
     """
     got_values, got_indices = arrow_util.get_array(
         _INPUT_TABLE, feature, return_example_indices=False)
     want_values, _ = expected
     values_match = got_values.equals(want_values)
     mismatch_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
         feature, want_values, got_values)
     self.assertTrue(values_match, mismatch_message)
     # Indices were not requested, so none must be returned.
     self.assertIsNone(got_indices)
 def testGetArray(self, feature, expected):
     """Checks the leaf array and example indices for a path.

     Args:
       feature: The path to look up in the shared input table.
       expected: A pair of (expected leaf array, expected example indices).
     """
     got_values, got_indices = arrow_util.get_array(
         _INPUT_TABLE, feature, return_example_indices=True)
     want_values, want_indices = expected
     values_match = got_values.equals(want_values)
     mismatch_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
         feature, want_values, got_values)
     self.assertTrue(values_match, mismatch_message)
     np.testing.assert_array_equal(want_indices, got_indices)
 def testGetArray(self, feature, expected):
     """Checks the leaf array and broadcast "w" values for a path.

     Args:
       feature: The path to look up in the shared input table.
       expected: A pair of (expected leaf array, expected broadcast weights).
     """
     got_values, got_weights = arrow_util.get_array(
         _INPUT_TABLE, feature, broadcast_column_name="w")
     want_values, want_weights = expected
     values_match = got_values.equals(want_values)
     mismatch_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
         feature, want_values, got_values)
     self.assertTrue(values_match, mismatch_message)
     np.testing.assert_array_equal(want_weights, got_weights)
Esempio n. 16
0
 def testGetArray(self, feature, expected):
   """Checks the leaf array and example indices without struct wrapping.

   Args:
     feature: The path to look up in the shared input record batch.
     expected: A pair of (expected leaf array, expected example indices).
   """
   got_values, got_indices = arrow_util.get_array(
       _INPUT_RECORD_BATCH,
       feature,
       return_example_indices=True,
       wrap_flat_struct_in_list=False)
   want_values, want_indices = expected
   values_match = got_values.equals(want_values)
   mismatch_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
       feature, want_values, got_values)
   self.assertTrue(values_match, mismatch_message)
   np.testing.assert_array_equal(want_indices, got_indices)
Esempio n. 17
0
 def testGetArrayWrapFlatStructArray(self, feature, expected):
   """Checks get_array output when flat structs are wrapped in lists.

   Args:
     feature: The path to look up in the shared input record batch.
     expected: A pair of (expected leaf array, expected example indices). A
       struct-typed expected array is converted to a singleton list array
       before the comparison.
   """
   got_values, got_indices = arrow_util.get_array(
       _INPUT_RECORD_BATCH,
       feature,
       return_example_indices=True,
       wrap_flat_struct_in_list=True)
   want_values, want_indices = expected
   # The shared expectations hold unwrapped structs; wrap them here so they
   # are comparable with the wrap_flat_struct_in_list=True output.
   if pa.types.is_struct(want_values.type):
     want_values = array_util.ToSingletonListArray(want_values)
   values_match = got_values.equals(want_values)
   mismatch_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
       feature, want_values, got_values)
   self.assertTrue(values_match, mismatch_message)
   np.testing.assert_array_equal(want_indices, got_indices)
 def testGetArrayNoBroadcast(self, feature, expected):
     """Checks the leaf array when example indices are not requested.

     Args:
       feature: The path to look up in the shared input record batch.
       expected: A 3-tuple whose first element is the expected leaf array;
         the other two elements are ignored here.
     """
     got_values, got_indices = arrow_util.get_array(
         _INPUT_RECORD_BATCH,
         feature,
         return_example_indices=False,
         wrap_flat_struct_in_list=False)
     want_values, _, _ = expected
     values_match = got_values.equals(want_values)
     mismatch_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
         feature, want_values, got_values)
     self.assertTrue(values_match, mismatch_message)
     # Indices were not requested, so none must be returned.
     self.assertIsNone(got_indices)
Esempio n. 19
0
def _get_example_value_presence(
        table: pa.Table, path: types.FeaturePath,
        boundaries: Optional[Iterable[float]]) -> Optional[pd.Series]:
    """Maps each example index to the distinct values it contains for a path.

    All values found under `path` within one example are treated as a set, so
    a value that occurs several times in the same example is reported once.

    For path 'p' on a table with the two records
    [{'p': ['a', 'a', 'b']}, {'p': ['a']}] the result is
    pd.Series(['a', 'b', 'a'], index=[0, 0, 1]).

    Args:
        table: The table in which to look up the path.
        path: The FeaturePath for which to fetch values.
        boundaries: Optional bin boundaries. When given, the flattened values
            are passed through bin_util.bin_array and the returned bins are
            reported in place of the raw values.

    Returns:
        A pandas Series of distinct (example index, value) pairs, with the
        values as the series values and the example indices as the series
        index. None if the array retrieved for `path` is null-typed.
    """
    values, row_ids = arrow_util.get_array(
        table, path, return_example_indices=True)
    if pa.types.is_null(values.type):
        return None

    flat_values = values.flatten()
    # Parent indices of the flattened values are used to look up the example
    # index that belongs to each flattened value.
    parent_positions = array_util.GetFlattenedArrayParentIndices(
        values).to_numpy()
    flat_row_ids = row_ids[parent_positions]

    if boundaries is None:
        index_column = flat_row_ids
        value_column = np.asarray(flat_values)
    else:
        # bin_array also returns which elements it produced a bin for, so the
        # example indices are narrowed to those elements.
        kept_positions, value_column = bin_util.bin_array(
            flat_values, boundaries)
        index_column = flat_row_ids[kept_positions]

    frame = pd.DataFrame({
        'example_indices': index_column,
        'values': value_column,
    })
    return frame.drop_duplicates().set_index('example_indices')['values']
Esempio n. 20
0
  def null_mask(self, path: types.FeaturePath) -> np.ndarray:
    """Builds a boolean mask marking the null rows of the referenced array.

    A path that cannot be found in the table (the lookup raises KeyError), or
    that resolves to a null-typed array, is considered null in every row of
    the table.

    Args:
      path: The path corresponding to the array from which to generate the null
        mask.

    Returns:
      A boolean ndarray derived from the array's null bitmap, or an all-True
      ndarray of length `self._table.num_rows` when the path is missing or
      null-typed.
    """
    try:
      values, _ = arrow_util.get_array(
          self._table, path, broadcast_column_name=None)
      # GetArrayNullBitmapAsByteArray is only useful for non-null type arrays.
      if not pa.types.is_null(values.type):
        null_bitmap = array_util.GetArrayNullBitmapAsByteArray(values)
        return np.asarray(null_bitmap, dtype=bool)
    except KeyError:
      # Missing path: fall through to the all-null mask below.
      pass
    return np.full(self._table.num_rows, True)
Esempio n. 21
0
def _get_example_value_presence(
        record_batch: pa.RecordBatch, path: types.FeaturePath,
        boundaries: Optional[Sequence[float]],
        weight_column_name: Optional[Text]) -> Optional[pd.DataFrame]:
    """Maps each example index to the distinct values it contains for a path.

    All values found under `path` within one example are treated as a set, so
    a value that occurs several times in the same example is reported once.

    For path 'p' on a record batch with the two records
    [{'p': ['a', 'a', 'b']}, {'p': ['a']}] the result has index [0, 0, 1] and
    a 'values' column holding ['a', 'b', 'a'].

    Args:
        record_batch: The RecordBatch in which to look up the path.
        path: The FeaturePath for which to fetch values.
        boundaries: Optional bin boundaries. When given, the flattened values
            are passed through bin_util.bin_array and the returned bins are
            reported in place of the raw values. Must be None for
            binary-like columns.
        weight_column_name: Optional name of a weight column whose per-example
            weight is attached to every returned row.

    Returns:
        A pandas DataFrame indexed by example index with a 'values' column of
        distinct values per example (a pd.Categorical for binary-like
        columns) and, if weight_column_name is given, a 'weights' column with
        the weight of the example each value came from. None if no feature
        type can be derived from the array's arrow type, or if there are no
        values at all.
    """
    values, row_ids = arrow_util.get_array(
        record_batch, path, return_example_indices=True)
    if stats_util.get_feature_type_from_arrow_type(path, values.type) is None:
        return None

    flat_values, parent_positions = arrow_util.flatten_nested(
        values, return_parent_indices=True)
    is_binary_like = arrow_util.is_binary_like(flat_values.type)
    assert not (boundaries is not None and is_binary_like), (
        'Boundaries can only be applied to numeric columns')

    categories = None
    if is_binary_like:
        # Dictionary-encode so that np.unique operates on integer codes
        # instead of an object array.
        encoded = flat_values.dictionary_encode()
        flat_values = encoded.indices
        categories = np.asarray(encoded.dictionary)

    flat_row_ids = row_ids[parent_positions]
    if boundaries is None:
        stacked = np.vstack([flat_row_ids, np.asarray(flat_values)])
    else:
        # bin_array also returns which elements it produced a bin for, so the
        # example indices are narrowed to those elements.
        kept_positions, bin_ids = bin_util.bin_array(flat_values, boundaries)
        stacked = np.vstack([flat_row_ids[kept_positions], bin_ids])
    if not stacked.size:
        return None

    # Deduplicate values which show up more than once in the same example.
    # This makes P(X=x|Y=y) in the standard lift definition behave as
    # P(x \in Xs | y \in Ys) if examples contain more than one value of X
    # and Y.
    distinct_pairs = np.unique(stacked, axis=1)
    unique_row_ids = distinct_pairs[0, :]
    unique_values = distinct_pairs[1, :]
    if is_binary_like:
        # Return binary-like values as a pd.Categorical. This makes
        # subsequent operations like pd.merge cheaper.
        unique_values = pd.Categorical.from_codes(
            unique_values, categories=categories)

    frame_columns = {
        'example_indices': unique_row_ids,
        'values': unique_values,
    }
    if weight_column_name:
        example_weights = arrow_util.get_weight_feature(
            record_batch, weight_column_name)
        frame_columns['weights'] = np.asarray(example_weights)[unique_row_ids]
    return pd.DataFrame(frame_columns).set_index('example_indices')