def testGetWeightFeatureNullArray(self):
  """Checks that a weight column made only of nulls raises a ValueError."""
  expected_error = r'Weight column "w" cannot be null\.'
  with self.assertRaisesRegex(ValueError, expected_error):
    feature_values = pa.array([[1], [2]])
    null_weights = pa.array([None, None])
    table = pa.Table.from_arrays([feature_values, null_weights], ["v", "w"])
    arrow_util.get_weight_feature(table, weight_column="w")
def testGetWeightFeatureNotFound(self):
  """Checks that asking for a weight column absent from the table raises."""
  expected_error = r'Weight column "w" not present in the input table\.'
  with self.assertRaisesRegex(ValueError, expected_error):
    # The table only has columns "u" and "v"; "w" is deliberately missing.
    first_column = pa.array([[1], [2]])
    second_column = pa.array([[1], [3]])
    table = pa.Table.from_arrays([first_column, second_column], ["u", "v"])
    arrow_util.get_weight_feature(table, weight_column="w")
def testGetWeightFeatureTooManyValues(self):
  """Checks that an example carrying two weight values raises a ValueError."""
  expected_error = (
      r'Weight column "w" must have exactly one value in each example\.')
  with self.assertRaisesRegex(ValueError, expected_error):
    feature_values = pa.array([[1], [2, 3]])
    # The second example has two weights instead of exactly one.
    weights = pa.array([[1], [2, 2]])
    record_batch = pa.RecordBatch.from_arrays(
        [feature_values, weights], ["v", "w"])
    arrow_util.get_weight_feature(record_batch, weight_column="w")
def testGetWeightFeatureMissingValue(self):
  """Checks that an example with an empty weight list raises a ValueError."""
  expected_error = (
      r'Weight column "w" must have exactly one value in each example\.')
  with self.assertRaisesRegex(ValueError, expected_error):
    feature_values = pa.array([[1], [2]])
    # The second example has no weight at all.
    weights = pa.array([[1], []])
    table = pa.Table.from_arrays([feature_values, weights], ["v", "w"])
    arrow_util.get_weight_feature(table, weight_column="w")
def _get_example_value_presence(
    record_batch: pa.RecordBatch, path: types.FeaturePath,
    boundaries: Optional[Sequence[float]],
    weight_column_name: Optional[Text]) -> Optional[pd.DataFrame]:
  """Returns information about which examples contained which values.

  This function treats all values for a given path within a single example as
  a set and returns a mapping between each example index and the distinct
  values which are present in that example.

  The result of calling this function for path 'p' on an arrow record batch
  with the two records [{'p': ['a', 'a', 'b']}, {'p': ['a']}] will be a
  DataFrame whose index (named 'example_indices') is [0, 0, 1] and whose
  'values' column holds ['a', 'b', 'a'].

  Args:
    record_batch: The RecordBatch in which to look up the path.
    path: The FeaturePath for which to fetch values.
    boundaries: Optionally, a set of bin boundaries to use for binning the
      array values. Must be None when the values are binary-like.
    weight_column_name: Optionally, a weight column to return in addition to
      the value and example index.

  Returns:
    A Pandas DataFrame containing distinct pairs of array values and example
    indices. The index holds the example indices and the values are stored in
    a column named 'values' (bin ids when boundaries are given, a
    pd.Categorical when the values are binary-like). If weight_column_name is
    provided, an additional column named 'weights' holds the weight of the
    example from which each value came.

    None is returned when stats_util.get_feature_type_from_arrow_type yields
    None for the type of the array found at `path`, or when there are no
    (example, value) pairs at all.
  """
  arr, example_indices = arrow_util.get_array(
      record_batch, path, return_example_indices=True)
  if stats_util.get_feature_type_from_arrow_type(path, arr.type) is None:
    return None

  arr_flat, parent_indices = arrow_util.flatten_nested(
      arr, return_parent_indices=True)
  is_binary_like = arrow_util.is_binary_like(arr_flat.type)
  assert boundaries is None or not is_binary_like, (
      'Boundaries can only be applied to numeric columns')
  if is_binary_like:
    # Use dictionary_encode so we can use np.unique on integer codes instead
    # of on object arrays.
    dict_array = arr_flat.dictionary_encode()
    arr_flat = dict_array.indices
    arr_flat_dict = np.asarray(dict_array.dictionary)
  # Map every flattened value back to the example it came from.
  example_indices_flat = example_indices[parent_indices]
  if boundaries is not None:
    element_indices, bins = bin_util.bin_array(arr_flat, boundaries)
    rows = np.vstack([example_indices_flat[element_indices], bins])
  else:
    rows = np.vstack([example_indices_flat, np.asarray(arr_flat)])
  if not rows.size:
    return None
  # Deduplicate values which show up more than once in the same example. This
  # makes P(X=x|Y=y) in the standard lift definition behave as
  # P(x \in Xs | y \in Ys) if examples contain more than one value of X and Y.
  unique_rows = np.unique(rows, axis=1)
  unique_example_indices = unique_rows[0, :]
  values = unique_rows[1, :]
  # np.vstack promotes both rows to one common dtype, so a floating point (or
  # uint64) value row turns the example indices into floats. Floats cannot be
  # used to index the weights array below, so restore an integer dtype. Rows
  # that are already integral are left untouched.
  if not np.issubdtype(unique_example_indices.dtype, np.integer):
    unique_example_indices = unique_example_indices.astype(np.int64)
  if is_binary_like:
    # Return binary-like values as a pd.Categorical. This makes subsequent
    # operations like pd.merge cheaper.
    values = pd.Categorical.from_codes(values, categories=arr_flat_dict)
  columns = {'example_indices': unique_example_indices, 'values': values}
  if weight_column_name:
    weights = arrow_util.get_weight_feature(record_batch, weight_column_name)
    # Broadcast the per-example weight onto every (example, value) pair.
    columns['weights'] = np.asarray(weights)[unique_example_indices]
  df = pd.DataFrame(columns)
  return df.set_index('example_indices')