def _training_examples_and_variables():
  """Returns dictionaries for training examples and variables."""
  batch_size = targets.get_shape()[0]

  # Iterate over all feature columns and create appropriate lists for dense
  # and sparse features as well as dense and sparse weights (variables) for
  # SDCA.
  # TODO(sibyl-vie3Poto): Reshape variables stored as values in column_to_variables
  # dict as 1-dimensional tensors.
  dense_features, sparse_features, sparse_feature_with_values = [], [], []
  dense_feature_weights = []
  sparse_feature_weights, sparse_feature_with_values_weights = [], []
  for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
    transformed_tensor = features[column]
    if isinstance(column, layers.feature_column._RealValuedColumn):  # pylint: disable=protected-access
      # A real-valued column corresponds to a dense feature in SDCA. A
      # transformed tensor corresponding to a RealValuedColumn should have
      # rank at most 2. In order to be passed to SDCA, its rank needs to be
      # exactly 2 (i.e., its shape should be [batch_size, column.dim]).
      check_rank_op = control_flow_ops.Assert(
          math_ops.less_equal(array_ops.rank(transformed_tensor), 2),
          ['transformed_tensor should have rank at most 2.'])
      # Reshape to [batch_size, dense_column_dimension].
      with ops.control_dependencies([check_rank_op]):
        transformed_tensor = array_ops.reshape(
            transformed_tensor, [array_ops.shape(transformed_tensor)[0], -1])

      dense_features.append(transformed_tensor)
      # For real valued columns, the variables list contains exactly one
      # element.
      dense_feature_weights.append(columns_to_variables[column][0])
    elif isinstance(column, layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
      # A bucketized column corresponds to a sparse feature in SDCA. The
      # bucketized feature is "sparsified" for SDCA by converting it to a
      # SparseFeatureColumn representing the one-hot encoding of the
      # bucketized feature.
      #
      # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
      # bucketized feature column to a dense feature in SDCA. This will
      # likely depend on the number of buckets.
      dense_bucket_tensor = column._to_dnn_input_layer(transformed_tensor)  # pylint: disable=protected-access
      sparse_feature_column = _dense_tensor_to_sparse_feature_column(
          dense_bucket_tensor)
      sparse_feature_with_values.append(sparse_feature_column)
      # If a partitioner was used during variable creation, we will have a
      # list of Variables here larger than 1.
      vars_to_append = columns_to_variables[column][0]
      if len(columns_to_variables[column]) > 1:
        vars_to_append = columns_to_variables[column]
      sparse_feature_with_values_weights.append(vars_to_append)
    elif isinstance(
        column,
        (
            layers.feature_column._WeightedSparseColumn,  # pylint: disable=protected-access
            layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
            layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
      if isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
        id_tensor = column.id_tensor(transformed_tensor)
        weight_tensor = array_ops.reshape(
            column.weight_tensor(transformed_tensor).values, [-1])
      else:
        id_tensor = transformed_tensor
        weight_tensor = array_ops.ones(
            [array_ops.shape(id_tensor.indices)[0]], dtypes.float32)

      example_ids = array_ops.reshape(id_tensor.indices[:, 0], [-1])
      flat_ids = array_ops.reshape(id_tensor.values, [-1])
      # Prune invalid IDs (< 0) from flat_ids, example_ids, and
      # weight_tensor. These can come from looking up an OOV entry in the
      # vocabulary (default value being -1).
      is_id_valid = math_ops.greater_equal(flat_ids, 0)
      flat_ids = array_ops.boolean_mask(flat_ids, is_id_valid)
      example_ids = array_ops.boolean_mask(example_ids, is_id_valid)
      weight_tensor = array_ops.boolean_mask(weight_tensor, is_id_valid)

      projection_length = math_ops.reduce_max(flat_ids) + 1
      # Project ids based on example ids so that we can dedup ids that
      # occur multiple times for a single example.
      projected_ids = projection_length * example_ids + flat_ids

      # Remove any redundant ids.
      ids, idx = array_ops.unique(projected_ids)
      # Keep only one example id per duplicated ids.
      example_ids_filtered = math_ops.unsorted_segment_min(
          example_ids, idx, array_ops.shape(ids)[0])

      # Reproject ids back into feature id space.
      reproject_ids = (ids - projection_length * example_ids_filtered)

      weights = array_ops.reshape(
          math_ops.unsorted_segment_sum(weight_tensor, idx,
                                        array_ops.shape(ids)[0]), [-1])
      sparse_feature_with_values.append(
          SparseFeatureColumn(example_ids_filtered, reproject_ids, weights))
      # If a partitioner was used during variable creation, we will have a
      # list of Variables here larger than 1.
      vars_to_append = columns_to_variables[column][0]
      if len(columns_to_variables[column]) > 1:
        vars_to_append = columns_to_variables[column]
      sparse_feature_with_values_weights.append(vars_to_append)
    else:
      raise ValueError('SDCAOptimizer does not support column type %s.' %
                       type(column).__name__)

  example_weights = array_ops.reshape(
      features[weight_column_name],
      shape=[-1]) if weight_column_name else array_ops.ones([batch_size])
  example_ids = features[self._example_id_column]
  sparse_feature_with_values.extend(sparse_features)
  sparse_feature_with_values_weights.extend(sparse_feature_weights)
  examples = dict(
      sparse_features=sparse_feature_with_values,
      dense_features=dense_features,
      example_labels=math_ops.cast(
          array_ops.reshape(targets, shape=[-1]), dtypes.float32),
      example_weights=example_weights,
      example_ids=example_ids)
  sdca_variables = dict(
      sparse_features_weights=sparse_feature_with_values_weights,
      dense_features_weights=dense_feature_weights)
  return examples, sdca_variables
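# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the projection/dedup
# trick above is easiest to see in isolation. The function below reimplements
# it with public TensorFlow ops; `_demo_dedup_sparse_ids` is a hypothetical
# name used only for this illustration, and eager execution (TF 2.x) is
# assumed.
def _demo_dedup_sparse_ids(example_ids, flat_ids, weights):
  """Merges duplicate (example_id, feature_id) pairs, summing their weights."""
  import tensorflow as tf  # Assumed available.

  # Drop invalid (OOV) ids, mirroring the pruning step above.
  is_id_valid = tf.greater_equal(flat_ids, 0)
  example_ids = tf.boolean_mask(example_ids, is_id_valid)
  weights = tf.boolean_mask(weights, is_id_valid)
  flat_ids = tf.boolean_mask(flat_ids, is_id_valid)

  # Encode each (example_id, feature_id) pair as a single integer. Since
  # 0 <= flat_id < projection_length, the encoding is collision-free.
  projection_length = tf.reduce_max(flat_ids) + 1
  projected = projection_length * example_ids + flat_ids

  # One bucket per distinct pair; `idx` maps each original entry to a bucket.
  unique_pairs, idx = tf.unique(projected)
  num_buckets = tf.shape(unique_pairs)[0]

  # All members of a bucket share the same example id, so segment-min simply
  # recovers it; duplicate weights are summed, matching the code above.
  bucket_example_ids = tf.math.unsorted_segment_min(example_ids, idx,
                                                    num_buckets)
  bucket_feature_ids = unique_pairs - projection_length * bucket_example_ids
  bucket_weights = tf.math.unsorted_segment_sum(weights, idx, num_buckets)
  return bucket_example_ids, bucket_feature_ids, bucket_weights


# Example: example 0 contains feature 3 twice (weights 1.0 and 2.0); after
# deduplication it appears once with weight 3.0:
#   _demo_dedup_sparse_ids(tf.constant([0, 0, 1]), tf.constant([3, 3, 5]),
#                          tf.constant([1.0, 2.0, 4.0]))
#   -> example ids [0, 1], feature ids [3, 5], weights [3.0, 4.0]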
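# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module):
# `_dense_tensor_to_sparse_feature_column` is defined elsewhere in this file.
# The sketch below shows one plausible way such a conversion can work with
# public TensorFlow ops; `_demo_dense_to_sparse` is a hypothetical name used
# only for this illustration and is not the library's actual helper.
def _demo_dense_to_sparse(dense_tensor):
  """Returns (example_ids, feature_ids, values) for non-zero entries.

  For a one-hot encoded bucketized feature of shape [batch_size, num_buckets],
  each row contributes one (example_id, bucket_id) pair with value 1.0.
  """
  import tensorflow as tf  # Assumed available.

  # Coordinates of all non-zero entries, shape [nnz, 2]: (row, column) pairs.
  sparse_indices = tf.where(tf.not_equal(dense_tensor, 0.0))
  values = tf.gather_nd(dense_tensor, sparse_indices)
  example_ids = sparse_indices[:, 0]  # Row index: example within the batch.
  feature_ids = sparse_indices[:, 1]  # Column index: bucket id.
  return example_ids, feature_ids, tf.cast(values, tf.float32)


# Example: a batch of two examples, each one-hot over 4 buckets:
#   _demo_dense_to_sparse(tf.constant([[0., 1., 0., 0.],
#                                      [0., 0., 0., 1.]]))
#   -> example ids [0, 1], feature ids [1, 3], values [1.0, 1.0]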