def testIsBinaryLike(self):
  """Checks is_binary_like() on binary-like and non-binary-like types."""
  binary_like_types = [
      pa.binary(), pa.large_binary(), pa.string(), pa.large_string()
  ]
  for arrow_type in binary_like_types:
    self.assertTrue(arrow_util.is_binary_like(arrow_type))
  # List wrappers around binary-like types are themselves not binary-like.
  non_binary_like_types = [pa.list_(pa.binary()), pa.large_list(pa.string())]
  for arrow_type in non_binary_like_types:
    self.assertFalse(arrow_util.is_binary_like(arrow_type))
def DecodedExamplesToRecordBatch(
    decoded_examples: List[types.Example]) -> pa.RecordBatch:
  """Converts a list of types.Example to an Arrow RecordBatch.

  Here types.Example is Dict[Union[bytes, unicode], Union[None, np.ndarray]].
  The result record batch has M rows and N columns where M is the number of
  examples in the list and N is the number of unique features in the examples.
  Each column is either a ListArray<primitive|string|binary> or a NullArray.

  None and missing feature handling:
    - if a feature's value is None in an example, then its corresponding column
      in the result batch will have a null at the corresponding position.
    - if a feature's value is always None across all the examples in the input
      list, then its corresponding column in the result batch will be a
      NullArray.
    - if an example does not contain a feature (in the universe of features),
      then the column of that feature will have a null at the corresponding
      position.

  Args:
    decoded_examples: a list of types.Example, i.e. a list of
      Dict[Union[bytes, unicode], Union[None, np.ndarray]].

  Returns:
    a pa.RecordBatch.

  Raises:
    ValueError: when the conversion fails.
    TypeError: when some of the output columns are not of supported types.
  """
  if not decoded_examples:
    # No rows at all: an empty batch with zero columns.
    return pa.RecordBatch.from_arrays([], [])
  # pa.array infers a StructArray from a list of dicts; anything else means
  # the input was not shaped as expected.
  struct_array = pa.array(decoded_examples)
  if not pa.types.is_struct(struct_array.type):
    raise ValueError("Unexpected Arrow type created from input")
  field_names = [f.name for f in list(struct_array.type)]
  if not field_names:
    # Examples exist but contain no features: M rows, zero columns.
    return _GetEmptyRecordBatch(len(decoded_examples))
  # flatten() turns the StructArray into one child array per field.
  value_arrays = struct_array.flatten()
  for name, array in six.moves.zip(field_names, value_arrays):
    if pa.types.is_null(array.type):
      # All-None feature: a NullArray column is allowed as-is.
      continue
    if not arrow_util.is_list_like(array.type):
      raise TypeError(
          "Expected list arrays for field {} but got {}".format(
              name, array.type))
    # Only list-of-{int, float, binary-like, null} columns are supported.
    value_type = array.type.value_type
    if (not pa.types.is_integer(value_type) and
        not pa.types.is_floating(value_type) and
        not arrow_util.is_binary_like(value_type) and
        not pa.types.is_null(value_type)):
      raise TypeError("Type not supported: {} {}".format(
          name, array.type))
  return pa.RecordBatch.from_arrays(value_arrays, field_names)
def update(self, feature_array: pa.Array) -> None:
  """Update the partial string statistics using the input value."""
  # A pure-null column contributes nothing to the byte-length total.
  if pa.types.is_null(feature_array.type):
    return
  flat_values, _ = arrow_util.flatten_nested(feature_array)
  if arrow_util.is_binary_like(flat_values.type):
    # GetBinaryArrayTotalByteSize returns a Python long (to be compatible
    # with Python 3). To make sure we do cheaper integer arithmetics in
    # Python 2, we first convert it to int.
    byte_size = array_util.GetBinaryArrayTotalByteSize(flat_values)
    self.total_bytes_length += int(byte_size)
  elif flat_values:
    # flattened_values_array.to_numpy() is only valid when it's not empty.
    # This could be computed faster by taking log10 of the integer.
    def _len_after_conv(value):
      return len(str(value))

    lengths = np.vectorize(
        _len_after_conv, otypes=[np.int32])(np.asarray(flat_values))
    self.total_bytes_length += np.sum(lengths)
def get_feature_type_from_arrow_type(
    feature_path: types.FeaturePath,
    arrow_type: pa.DataType) -> Optional[types.FeatureNameStatisticsType]:
  """Get feature type from Arrow type.

  Args:
    feature_path: path of the feature.
    arrow_type: Arrow DataType.

  Returns:
    A statistics_pb2.FeatureNameStatistics.Type value or None if arrow_type
    is null (which means it cannot be determined for now).

  Raises:
    TypeError: if the type is not supported.
  """
  # A null column's type is undetermined for now.
  if pa.types.is_null(arrow_type):
    return None
  if not arrow_util.is_list_like(arrow_type):
    raise TypeError(
        'Expected feature column to be a '
        '(Large)List<primitive|struct> or null, but feature {} '
        'was {}.'.format(feature_path, arrow_type))
  # Classify by the innermost (leaf) value type of the nested list.
  value_type = arrow_util.get_innermost_nested_type(arrow_type)
  if pa.types.is_integer(value_type):
    return statistics_pb2.FeatureNameStatistics.INT
  if pa.types.is_floating(value_type):
    return statistics_pb2.FeatureNameStatistics.FLOAT
  if arrow_util.is_binary_like(value_type):
    return statistics_pb2.FeatureNameStatistics.STRING
  if pa.types.is_struct(value_type):
    return statistics_pb2.FeatureNameStatistics.STRUCT
  if pa.types.is_null(value_type):
    return None
  raise TypeError('Feature {} has unsupported arrow type: {}'.format(
      feature_path, arrow_type))
def _get_example_value_presence(
    record_batch: pa.RecordBatch, path: types.FeaturePath,
    boundaries: Optional[Sequence[float]],
    weight_column_name: Optional[Text]) -> Optional[pd.DataFrame]:
  """Returns information about which examples contained which values.

  This function treats all values for a given path within a single example
  as a set and returns a mapping between each example index and the distinct
  values which are present in that example.

  The result of calling this function for path 'p' on an arrow record batch
  with the two records [{'p': ['a', 'a', 'b']}, {'p': ['a']}] will be a
  DataFrame whose 'values' column holds ['a', 'b', 'a'] with example indices
  [0, 0, 1] as the index.

  If the array retrieved from get_array is null, this function returns None.

  Args:
    record_batch: The RecordBatch in which to look up the path.
    path: The FeaturePath for which to fetch values.
    boundaries: Optionally, a set of bin boundaries to use for binning the
      array values.
    weight_column_name: Optionally, a weight column to return in addition to
      the value and example index.

  Returns:
    A Pandas DataFrame containing distinct pairs of array values and example
    indices. The index will be the example indices and the values will be
    stored in a column named 'values'. If weight_column_name is provided, a
    second column named 'weights' will be returned containing the weight of
    the example from which each value came.
  """
  arr, example_indices = arrow_util.get_array(record_batch, path,
                                              return_example_indices=True)
  # A None feature type means the column is null-typed (undeterminable).
  if stats_util.get_feature_type_from_arrow_type(path, arr.type) is None:
    return None
  arr_flat, parent_indices = arrow_util.flatten_nested(
      arr, return_parent_indices=True)
  is_binary_like = arrow_util.is_binary_like(arr_flat.type)
  assert boundaries is None or not is_binary_like, (
      'Boundaries can only be applied to numeric columns')
  if is_binary_like:
    # Use dictionary_encode so we can use np.unique on object arrays.
    # arr_flat becomes the integer dictionary codes; the distinct values are
    # kept aside in arr_flat_dict for decoding later.
    dict_array = arr_flat.dictionary_encode()
    arr_flat = dict_array.indices
    arr_flat_dict = np.asarray(dict_array.dictionary)
  # Map each flattened value back to the example it came from.
  example_indices_flat = example_indices[parent_indices]
  if boundaries is not None:
    element_indices, bins = bin_util.bin_array(arr_flat, boundaries)
    rows = np.vstack([example_indices_flat[element_indices], bins])
  else:
    rows = np.vstack([example_indices_flat, np.asarray(arr_flat)])
  if not rows.size:
    return None
  # Deduplicate values which show up more than once in the same example. This
  # makes P(X=x|Y=y) in the standard lift definition behave as
  # P(x \in Xs | y \in Ys) if examples contain more than one value of X and Y.
  unique_rows = np.unique(rows, axis=1)
  example_indices = unique_rows[0, :]
  values = unique_rows[1, :]
  if is_binary_like:
    # Return binary-like values as a pd.Categorical wrapped in a Series. This
    # makes subsequent operations like pd.merge cheaper.
    values = pd.Categorical.from_codes(values, categories=arr_flat_dict)
  columns = {'example_indices': example_indices, 'values': values}
  if weight_column_name:
    weights = arrow_util.get_weight_feature(record_batch, weight_column_name)
    columns['weights'] = np.asarray(weights)[example_indices]
  df = pd.DataFrame(columns)
  return df.set_index('example_indices')