def process(
    self,
    sorted_features: List[int],
    sparse_data: StackedAssociativeArray,
    set_missing_value_to_zero: bool = False,
) -> Tuple[str, List[str]]:
    lengths_blob = sparse_data.lengths
    keys_blob = sparse_data.keys
    values_blob = sparse_data.values

    # Scalar blob used as the fill value for features absent from the sparse input.
    MISSING_SCALAR = C2.NextBlob("MISSING_SCALAR")
    missing_value = 0.0 if set_missing_value_to_zero else MISSING_VALUE
    workspace.FeedBlob(MISSING_SCALAR, np.array([missing_value], dtype=np.float32))
    C2.net().GivenTensorFill([], [MISSING_SCALAR], shape=[], values=[missing_value])
    parameters: List[str] = [MISSING_SCALAR]

    assert len(sorted_features) > 0, "Sorted features is empty"
    dense_input = C2.SparseToDenseMask(
        keys_blob, values_blob, MISSING_SCALAR, lengths_blob, mask=sorted_features
    )[0]
    return dense_input, parameters
def normalize_sparse_matrix(
    self,
    lengths_blob: str,
    keys_blob: str,
    values_blob: str,
    normalization_parameters: Dict[str, NormalizationParameters],
    blobname_prefix: str,
    split_expensive_feature_groups: bool = False,
) -> Tuple[str, List[str]]:
    sorted_features, _ = sort_features_by_normalization(normalization_parameters)
    int_features = [int(feature) for feature in sorted_features]
    dense_input, _ = C2.SparseToDenseMask(
        keys_blob, values_blob, self.MISSING_SCALAR, lengths_blob, mask=int_features
    )
    return self.normalize_dense_matrix(
        dense_input,
        sorted_features,
        normalization_parameters,
        blobname_prefix,
        split_expensive_feature_groups,
    )
def sparse_to_dense(
    lengths_blob: str, keys_blob: str, values_blob: str, sorted_features: List[int]
) -> Tuple[str, List[str]]:
    # Scalar blob used as the fill value for features absent from the sparse input.
    MISSING_SCALAR = C2.NextBlob("MISSING_SCALAR")
    workspace.FeedBlob(MISSING_SCALAR, np.array([MISSING_VALUE], dtype=np.float32))
    C2.net().GivenTensorFill([], [MISSING_SCALAR], shape=[], values=[MISSING_VALUE])
    parameters: List[str] = [MISSING_SCALAR]

    assert len(sorted_features) > 0, "Sorted features is empty"
    dense_input = C2.SparseToDenseMask(
        keys_blob, values_blob, MISSING_SCALAR, lengths_blob, mask=sorted_features
    )[0]
    return dense_input, parameters
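# A minimal, standalone sketch of the SparseToDenseMask pattern that the helpers
# above build through the repo's C2 utility, written against the plain
# caffe2.python API. Blob names and feature ids are made up for illustration;
# this is not part of the module itself.
import numpy as np
from caffe2.python import core, workspace

example_net = core.Net("sparse_to_dense_example")

# Two examples: the first has features 10 and 30, the second only feature 20.
workspace.FeedBlob("keys", np.array([10, 30, 20], dtype=np.int64))
workspace.FeedBlob("values", np.array([1.0, 3.0, 2.0], dtype=np.float32))
workspace.FeedBlob("lengths", np.array([2, 1], dtype=np.int32))
# Fill value for features absent from an example (one-element tensor,
# mirroring the MISSING_SCALAR blob fed above).
workspace.FeedBlob("missing", np.array([0.0], dtype=np.float32))

# Dense columns follow the mask order (features 10, 20, 30); missing entries
# are filled from the "missing" blob.
example_net.SparseToDenseMask(
    ["keys", "values", "missing", "lengths"], ["dense"], mask=[10, 20, 30]
)
workspace.RunNetOnce(example_net)
print(workspace.FetchBlob("dense"))  # [[1. 0. 3.]
                                     #  [0. 2. 0.]]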
def normalize_sparse_matrix(
    self,
    lengths_blob: str,
    keys_blob: str,
    values_blob: str,
    normalization_parameters: Dict[int, NormalizationParameters],
    blobname_prefix: str,
    split_sparse_to_dense: bool,
    split_expensive_feature_groups: bool,
    normalize: bool = True,
    sorted_features_override: List[int] = None,
) -> Tuple[str, List[str]]:
    if sorted_features_override:
        sorted_features = sorted_features_override
    else:
        sorted_features, _ = sort_features_by_normalization(normalization_parameters)
    int_features = [int(feature) for feature in sorted_features]

    # Optionally split the sparse input into several pieces so that the
    # expensive SparseToDenseMask op runs on smaller fragments.
    preprocess_num_batches = 8 if split_sparse_to_dense else 1

    lengths_batch = []
    keys_batch = []
    values_batch = []
    for _ in range(preprocess_num_batches):
        lengths_batch.append(C2.NextBlob(blobname_prefix + "_length_batch"))
        keys_batch.append(C2.NextBlob(blobname_prefix + "_key_batch"))
        values_batch.append(C2.NextBlob(blobname_prefix + "_value_batch"))

    C2.net().Split([lengths_blob], lengths_batch, axis=0)
    total_lengths_batch = []
    for x in range(preprocess_num_batches):
        total_lengths_batch.append(
            C2.Reshape(
                C2.ReduceBackSum(lengths_batch[x], num_reduce_dims=1), shape=[1]
            )[0]
        )
    total_lengths_batch_concat, _ = C2.Concat(*total_lengths_batch, axis=0)
    # Split keys/values using the per-batch length totals as the split sizes.
    C2.net().Split([keys_blob, total_lengths_batch_concat], keys_batch, axis=0)
    C2.net().Split([values_blob, total_lengths_batch_concat], values_batch, axis=0)

    dense_input_fragments = []
    parameters: List[str] = []
    MISSING_SCALAR = self._store_parameter(
        parameters, "MISSING_SCALAR", np.array([MISSING_VALUE], dtype=np.float32)
    )
    C2.net().GivenTensorFill([], [MISSING_SCALAR], shape=[], values=[MISSING_VALUE])

    for preprocess_batch in range(preprocess_num_batches):
        dense_input_fragment = C2.SparseToDenseMask(
            keys_batch[preprocess_batch],
            values_batch[preprocess_batch],
            MISSING_SCALAR,
            lengths_batch[preprocess_batch],
            mask=int_features,
        )[0]
        if normalize:
            normalized_fragment, p = self.normalize_dense_matrix(
                dense_input_fragment,
                sorted_features,
                normalization_parameters,
                blobname_prefix,
                split_expensive_feature_groups,
            )
            dense_input_fragments.append(normalized_fragment)
            parameters.extend(p)
        else:
            dense_input_fragments.append(dense_input_fragment)

    dense_input = C2.NextBlob(blobname_prefix + "_dense_input")
    dense_input_dims = C2.NextBlob(blobname_prefix + "_dense_input_dims")
    C2.net().Concat(dense_input_fragments, [dense_input, dense_input_dims], axis=0)
    return dense_input, parameters
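# The batched path above leans on Caffe2's Split op accepting an optional second
# input with explicit split sizes. Below is a minimal sketch of that pattern with
# the plain caffe2.python API; blob names and the 2-batch setup are made up for
# illustration and are not part of this module.
import numpy as np
from caffe2.python import core, workspace

split_net = core.Net("split_by_lengths_example")

# Four examples with 2, 1, 3 and 1 sparse entries each; 7 flat keys in total.
workspace.FeedBlob("lengths", np.array([2, 1, 3, 1], dtype=np.int32))
workspace.FeedBlob("keys", np.arange(7, dtype=np.int64))

# Split the per-example lengths evenly into two batches of two examples.
split_net.Split(["lengths"], ["lengths_b0", "lengths_b1"], axis=0)

# Sum each batch of lengths and reshape the result to shape [1] so the totals
# can be concatenated into a single tensor of split sizes.
split_net.ReduceBackSum(["lengths_b0"], ["sum_b0"], num_reduce_dims=1)
split_net.Reshape(["sum_b0"], ["total_b0", "old_shape_b0"], shape=[1])
split_net.ReduceBackSum(["lengths_b1"], ["sum_b1"], num_reduce_dims=1)
split_net.Reshape(["sum_b1"], ["total_b1", "old_shape_b1"], shape=[1])
split_net.Concat(["total_b0", "total_b1"], ["totals", "totals_split_info"], axis=0)

# Split the flat keys using those totals as explicit split sizes.
split_net.Split(["keys", "totals"], ["keys_b0", "keys_b1"], axis=0)

workspace.RunNetOnce(split_net)
print(workspace.FetchBlob("keys_b0"))  # [0 1 2]   -> entries of examples 1-2
print(workspace.FetchBlob("keys_b1"))  # [3 4 5 6] -> entries of examples 3-4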