def testGetArrayEmptyPath(self):
    """Expects get_array to raise KeyError when given an empty query path."""
    single_column_table = pa.Table.from_arrays(
        [pa.array([[1], [2, 3]])], ["v"])
    empty_path = types.FeaturePath([])

    with self.assertRaisesRegex(KeyError, r"query_path must be non-empty.*"):
        arrow_util.get_array(
            single_column_table,
            query_path=empty_path,
            return_example_indices=False)
def testGetArrayStepMissing(self):
    """Expects a KeyError when the last path step is absent from the struct."""
    path_with_unknown_leaf = types.FeaturePath(["f2", "sf2", "ssf3"])
    expected_message = r'query_path step "ssf3" not in struct.*'

    with self.assertRaisesRegex(KeyError, expected_message):
        arrow_util.get_array(
            _INPUT_TABLE,
            query_path=path_with_unknown_leaf,
            broadcast_column_name=None)
def testGetArrayColumnMissing(self):
    """Expects a KeyError when the first path step is not a table column."""
    table_without_x = pa.Table.from_arrays([pa.array([[1], [2]])], ["y"])
    unknown_column_path = types.FeaturePath(["x"])
    expected_message = r'query_path step 0 "x" not in table.*'

    with self.assertRaisesRegex(KeyError, expected_message):
        arrow_util.get_array(
            table_without_x,
            query_path=unknown_column_path,
            broadcast_column_name=None)
def testGetArrayStepMissing(self):
    """Expects a KeyError when the last path step is absent from the struct."""
    path_with_unknown_leaf = types.FeaturePath(["f2", "sf2", "ssf3"])
    expected_message = r'query_path step "ssf3" not in struct.*'

    with self.assertRaisesRegex(KeyError, expected_message):
        arrow_util.get_array(
            _INPUT_RECORD_BATCH,
            query_path=path_with_unknown_leaf,
            return_example_indices=False)
def testGetArrayColumnMissing(self):
    """Expects a KeyError when the first path step is not a table column."""
    table_without_x = pa.Table.from_arrays([pa.array([[1], [2]])], ["y"])
    unknown_column_path = types.FeaturePath(["x"])
    expected_message = r'query_path step 0 "x" not in table.*'

    with self.assertRaisesRegex(KeyError, expected_message):
        arrow_util.get_array(
            table_without_x,
            query_path=unknown_column_path,
            return_example_indices=False)
def testGetArrayEmptyPath(self):
    """Expects get_array to raise KeyError when given an empty query path."""
    value_column = pa.array([[1], [2, 3]])
    weight_column = pa.array([[1], [2, 2]])
    two_column_table = pa.Table.from_arrays(
        [value_column, weight_column], ["v", "w"])
    empty_path = types.FeaturePath([])

    with self.assertRaisesRegex(KeyError, r"query_path must be non-empty.*"):
        arrow_util.get_array(
            two_column_table,
            query_path=empty_path,
            broadcast_column_name="w")
def testGetArraySubpathMissing(self):
    """Expects a KeyError when the path steps into a non-struct list array."""
    path_below_leaf = types.FeaturePath(["f2", "sf2", "ssf1", "sssf"])
    expected_message = (
        r'Cannot process .* "sssf" inside .* list<item: int64>.*')

    with self.assertRaisesRegex(KeyError, expected_message):
        arrow_util.get_array(
            _INPUT_RECORD_BATCH,
            query_path=path_below_leaf,
            return_example_indices=False)
def testGetArraySubpathMissing(self):
    """Expects a KeyError when the path steps into a non-struct list array."""
    path_below_leaf = types.FeaturePath(["f2", "sf2", "ssf1", "sssf"])
    expected_message = (
        r'Cannot process .* "sssf" inside .* list<item: list<item: int64>>.*')

    with self.assertRaisesRegex(KeyError, expected_message):
        arrow_util.get_array(
            _INPUT_TABLE,
            query_path=path_below_leaf,
            broadcast_column_name=None)
def testGetArrayReturnExampleIndices(self):
    """Checks the leaf array and example indices for a nested struct path."""
    # Row 0 holds two "sf" structs, row 1 holds one.
    nested_feature = pa.array([
        [{"sf": [{"ssf": [1]}, {"ssf": [2]}]}],
        [{"sf": [{"ssf": [3, 4]}]}],
    ])
    string_feature = pa.array([["one"], ["two"]])
    record_batch = pa.RecordBatch.from_arrays(
        [nested_feature, string_feature], ["f", "w"])
    feature = types.FeaturePath(["f", "sf", "ssf"])

    expected_arr = pa.array([[1], [2], [3, 4]])
    expected_indices = np.array([0, 0, 1])

    actual_arr, actual_indices = arrow_util.get_array(
        record_batch, feature, return_example_indices=True)

    arrays_match = actual_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, actual_arr)
    self.assertTrue(arrays_match, failure_message)
    np.testing.assert_array_equal(expected_indices, actual_indices)
def testGetArrayBroadcastString(self):
    """Checks that a string column is broadcast alongside a nested path."""
    # Row 0 holds two "sf" structs, row 1 holds one.
    nested_feature = pa.array([
        [{"sf": [{"ssf": [[1]]}, {"ssf": [[2]]}]}],
        [{"sf": [{"ssf": [[3], [4]]}]}],
    ])
    string_feature = pa.array([["one"], ["two"]])
    table = pa.Table.from_arrays([nested_feature, string_feature], ["f", "w"])
    feature = types.FeaturePath(["f", "sf", "ssf"])

    expected_arr = pa.array([[[1]], [[2]], [[3], [4]]])
    expected_weights = np.array(["one", "one", "two"])

    actual_arr, actual_weights = arrow_util.get_array(
        table, feature, broadcast_column_name="w")

    arrays_match = actual_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, actual_arr)
    self.assertTrue(arrays_match, failure_message)
    np.testing.assert_array_equal(expected_weights, actual_weights)
def list_lengths(self, path: types.FeaturePath) -> np.ndarray:
    """Returns the per-row list length of the array found at `path`.

    Results are memoized in `self._cache`, keyed on the path. When the path
    cannot be resolved (get_array raises KeyError) or the resolved array has
    null type, an all-zero array with one entry per table row is returned.

    Args:
      path: The path for which to return list lengths.

    Returns:
      An ndarray of list lengths, either computed from the referenced list
      array or, for a missing or null-typed path, zeros of shape
      (number of table rows,).

    Raises:
      ValueError: When the referenced array is neither a ListArray nor null.
    """
    cache_key = ('list_lengths({})', path)
    if cache_key in self._cache:
        return self._cache[cache_key]

    computed = None
    try:
        values, _ = arrow_util.get_array(
            self._table, path, broadcast_column_name=None)
        value_type = values.type
        if not pa.types.is_null(value_type):
            if not pa.types.is_list(value_type):
                raise ValueError(
                    'Can only compute list lengths on list arrays, found '
                    '{}'.format(value_type))
            computed = np.asarray(array_util.ListLengthsFromListArray(values))
    except KeyError:
        # A missing path is treated like a null array: handled below.
        pass

    if computed is None:
        computed = np.full(self._table.num_rows, 0)

    self._cache[cache_key] = computed
    return computed
def testGetArrayNoBroadcast(self, feature, expected):
    """Checks that no weights are returned when nothing is broadcast."""
    expected_arr = expected[0]

    result_arr, result_weights = arrow_util.get_array(
        _INPUT_TABLE, feature, broadcast_column_name=None)

    arrays_match = result_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, result_arr)
    self.assertTrue(arrays_match, failure_message)
    self.assertIsNone(result_weights)
def testGetArrayNoBroadcast(self, feature, expected):
    """Checks that no example indices are returned when not requested."""
    expected_arr = expected[0]

    result_arr, result_indices = arrow_util.get_array(
        _INPUT_TABLE, feature, return_example_indices=False)

    arrays_match = result_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, result_arr)
    self.assertTrue(arrays_match, failure_message)
    self.assertIsNone(result_indices)
def testGetArray(self, feature, expected):
    """Checks the array and example indices get_array returns for a path."""
    expected_arr, expected_indices = expected

    result_arr, result_indices = arrow_util.get_array(
        _INPUT_TABLE, feature, return_example_indices=True)

    arrays_match = result_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, result_arr)
    self.assertTrue(arrays_match, failure_message)
    np.testing.assert_array_equal(expected_indices, result_indices)
def testGetArray(self, feature, expected):
    """Checks the array and broadcast "w" values get_array returns."""
    expected_arr, expected_weights = expected

    result_arr, result_weights = arrow_util.get_array(
        _INPUT_TABLE, feature, broadcast_column_name="w")

    arrays_match = result_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, result_arr)
    self.assertTrue(arrays_match, failure_message)
    np.testing.assert_array_equal(expected_weights, result_weights)
def testGetArray(self, feature, expected):
    """Checks array and example indices without wrapping flat structs."""
    expected_arr, expected_indices = expected

    result_arr, result_indices = arrow_util.get_array(
        _INPUT_RECORD_BATCH,
        feature,
        return_example_indices=True,
        wrap_flat_struct_in_list=False)

    arrays_match = result_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, result_arr)
    self.assertTrue(arrays_match, failure_message)
    np.testing.assert_array_equal(expected_indices, result_indices)
def testGetArrayWrapFlatStructArray(self, feature, expected):
    """Checks get_array output when flat struct arrays are wrapped in lists."""
    expected_arr, expected_indices = expected

    result_arr, result_indices = arrow_util.get_array(
        _INPUT_RECORD_BATCH,
        feature,
        return_example_indices=True,
        wrap_flat_struct_in_list=True)

    # A struct-typed expectation is wrapped the same way before comparing.
    if pa.types.is_struct(expected_arr.type):
        expected_arr = array_util.ToSingletonListArray(expected_arr)

    arrays_match = result_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, result_arr)
    self.assertTrue(arrays_match, failure_message)
    np.testing.assert_array_equal(expected_indices, result_indices)
def testGetArrayNoBroadcast(self, feature, expected):
    """Checks that no example indices are returned when not requested."""
    # `expected` is a 3-tuple here; only the array component is used.
    expected_arr, _, _ = expected

    result_arr, result_indices = arrow_util.get_array(
        _INPUT_RECORD_BATCH,
        feature,
        return_example_indices=False,
        wrap_flat_struct_in_list=False)

    arrays_match = result_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, result_arr)
    self.assertTrue(arrays_match, failure_message)
    self.assertIsNone(result_indices)
def _get_example_value_presence(
    table: pa.Table, path: types.FeaturePath,
    boundaries: Optional[Iterable[float]]) -> Optional[pd.Series]:
    """Returns the distinct (example index, value) pairs found at `path`.

    All values for the path within one example are treated as a set, so a
    value that repeats inside an example is reported once for that example.

    For path 'p' on a table with the two records
    [{'p': ['a', 'a', 'b']}, {'p': ['a']}] the result is
    pd.Series(['a', 'b', 'a'], index=[0, 0, 1]).

    Args:
      table: The table in which to look up the path.
      path: The FeaturePath for which to fetch values.
      boundaries: Optionally, bin boundaries; when given, the values are
        binned with bin_util.bin_array and the bins are reported instead.

    Returns:
      A Pandas Series whose values are the array values (or bins) and whose
      index holds the example indices, or None if the array retrieved from
      get_array has null type.
    """
    nested_values, row_ids = arrow_util.get_array(
        table, path, return_example_indices=True)
    if pa.types.is_null(nested_values.type):
        return None

    flat_values = nested_values.flatten()
    parent_positions = array_util.GetFlattenedArrayParentIndices(
        nested_values).to_numpy()
    flat_row_ids = row_ids[parent_positions]

    if boundaries is None:
        frame = pd.DataFrame({
            'example_indices': flat_row_ids,
            'values': np.asarray(flat_values)
        })
    else:
        kept_positions, bins = bin_util.bin_array(flat_values, boundaries)
        frame = pd.DataFrame({
            'example_indices': flat_row_ids[kept_positions],
            'values': bins
        })

    return frame.drop_duplicates().set_index('example_indices')['values']
def null_mask(self, path: types.FeaturePath) -> np.ndarray:
    """Returns a boolean mask of rows which are null in the referenced array.

    A path that cannot be found in the table (get_array raises KeyError), or
    one that resolves to a null-typed array, is reported as null in every
    row of the table.

    Args:
      path: The path corresponding to the array from which to generate the
        null mask.

    Returns:
      A boolean ndarray built from the array's null bitmap, or an all-True
      array with one entry per table row for a missing or null-typed path.
    """
    try:
        column, _ = arrow_util.get_array(
            self._table, path, broadcast_column_name=None)
        # GetArrayNullBitmapAsByteArray is only useful for non-null type
        # arrays; null-typed arrays fall through to the all-null mask.
        if not pa.types.is_null(column.type):
            null_bytes = array_util.GetArrayNullBitmapAsByteArray(column)
            return np.asarray(null_bytes, dtype=bool)
    except KeyError:
        # Missing path: treated as null everywhere, handled below.
        pass
    return np.full(self._table.num_rows, True)
def _get_example_value_presence(
    record_batch: pa.RecordBatch, path: types.FeaturePath,
    boundaries: Optional[Sequence[float]],
    weight_column_name: Optional[Text]) -> Optional[pd.DataFrame]:
    """Returns the distinct (example index, value) pairs found at `path`.

    All values for the path within one example are treated as a set, so a
    value that repeats inside an example is reported once for that example.

    For path 'p' on a record batch with the two records
    [{'p': ['a', 'a', 'b']}, {'p': ['a']}] the result has index [0, 0, 1]
    and a 'values' column holding ['a', 'b', 'a'].

    Args:
      record_batch: The RecordBatch in which to look up the path.
      path: The FeaturePath for which to fetch values.
      boundaries: Optionally, bin boundaries; when given, the values are
        binned with bin_util.bin_array and the bins are reported instead.
        Must be None for binary-like columns.
      weight_column_name: Optionally, a weight column to return in addition
        to the value and example index.

    Returns:
      A Pandas DataFrame indexed by example index ('example_indices') with a
      'values' column. Binary-like values are returned as a pd.Categorical.
      If weight_column_name is provided, a 'weights' column holds the weight
      of the example each value came from. Returns None when
      stats_util.get_feature_type_from_arrow_type yields None for the array
      type, or when there are no (example index, value) pairs.
    """
    nested_values, row_ids = arrow_util.get_array(
        record_batch, path, return_example_indices=True)
    if stats_util.get_feature_type_from_arrow_type(
            path, nested_values.type) is None:
        return None

    flat_values, parent_positions = arrow_util.flatten_nested(
        nested_values, return_parent_indices=True)
    binary_like = arrow_util.is_binary_like(flat_values.type)
    assert boundaries is None or not binary_like, (
        'Boundaries can only be applied to numeric columns')

    categories = None
    if binary_like:
        # use dictionary_encode so we can use np.unique on object arrays
        encoded = flat_values.dictionary_encode()
        flat_values = encoded.indices
        categories = np.asarray(encoded.dictionary)

    flat_row_ids = row_ids[parent_positions]
    if boundaries is None:
        pairs = np.vstack([flat_row_ids, np.asarray(flat_values)])
    else:
        kept_positions, bins = bin_util.bin_array(flat_values, boundaries)
        pairs = np.vstack([flat_row_ids[kept_positions], bins])
    if pairs.size == 0:
        return None

    # Deduplicate values which show up more than once in the same example.
    # This makes P(X=x|Y=y) in the standard lift definition behave as
    # P(x \in Xs | y \in Ys) if examples contain more than one value of X
    # and Y.
    distinct_pairs = np.unique(pairs, axis=1)
    distinct_row_ids = distinct_pairs[0, :]
    distinct_values = distinct_pairs[1, :]
    if binary_like:
        # Return binary-like values as a pd.Categorical. This makes
        # subsequent operations like pd.merge cheaper.
        distinct_values = pd.Categorical.from_codes(
            distinct_values, categories=categories)

    frame_columns = {
        'example_indices': distinct_row_ids,
        'values': distinct_values,
    }
    if weight_column_name:
        example_weights = arrow_util.get_weight_feature(
            record_batch, weight_column_name)
        frame_columns['weights'] = np.asarray(
            example_weights)[distinct_row_ids]

    return pd.DataFrame(frame_columns).set_index('example_indices')