Example #1
    def _training_examples_and_variables():
        """Returns dictionaries for training examples and variables."""
        batch_size = targets.get_shape()[0]

        # Iterate over all feature columns and create appropriate lists for dense
        # and sparse features as well as dense and sparse weights (variables) for
        # SDCA.
        # TODO(sibyl-vie3Poto): Reshape variables stored as values in
        # columns_to_variables dict as 1-dimensional tensors.
        dense_features, sparse_features, sparse_feature_with_values = [], [], []
        dense_feature_weights = []
        sparse_feature_weights, sparse_feature_with_values_weights = [], []
        for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
            transformed_tensor = features[column]
            if isinstance(column, layers.feature_column._RealValuedColumn):  # pylint: disable=protected-access
                # A real-valued column corresponds to a dense feature in SDCA. A
                # transformed tensor corresponding to a RealValuedColumn has rank 2
                # (its shape is typically [batch_size, column.dimension]) and so it
                # can be passed to SDCA as is.
                dense_features.append(transformed_tensor)
                # For real valued columns, the variables list contains exactly one
                # element.
                dense_feature_weights.append(columns_to_variables[column][0])
            elif isinstance(column, layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
                # A bucketized column corresponds to a sparse feature in SDCA. The
                # bucketized feature is "sparsified" for SDCA by converting it to a
                # SparseFeatureColumn representing the one-hot encoding of the
                # bucketized feature.
                #
                # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
                # bucketized feature column to a dense feature in SDCA. This will likely
                # depend on the number of buckets.
                dense_bucket_tensor = column._to_dnn_input_layer(
                    transformed_tensor)  # pylint: disable=protected-access
                sparse_feature_column = _dense_tensor_to_sparse_feature_column(
                    dense_bucket_tensor)
                sparse_feature_with_values.append(sparse_feature_column)
                # For bucketized columns, the variables list contains exactly one
                # element.
                sparse_feature_with_values_weights.append(
                    columns_to_variables[column][0])
            elif isinstance(column,
                            (layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
                             layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
                sparse_features.append(
                    SparseFeatureColumn(
                        array_ops.reshape(
                            array_ops.split(value=transformed_tensor.indices,
                                            num_or_size_splits=2,
                                            axis=1)[0], [-1]),
                        array_ops.reshape(transformed_tensor.values, [-1]),
                        None))
                sparse_feature_weights.append(columns_to_variables[column][0])
            elif isinstance(column,
                            layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
                id_tensor = column.id_tensor(transformed_tensor)
                weight_tensor = column.weight_tensor(transformed_tensor)
                sparse_feature_with_values.append(
                    SparseFeatureColumn(
                        array_ops.reshape(
                            array_ops.split(value=id_tensor.indices,
                                            num_or_size_splits=2,
                                            axis=1)[0], [-1]),
                        array_ops.reshape(id_tensor.values, [-1]),
                        array_ops.reshape(weight_tensor.values, [-1])))
                sparse_feature_with_values_weights.append(
                    columns_to_variables[column][0])
            else:
                raise ValueError(
                    "SDCAOptimizer does not support column type {}".format(
                        type(column).__name__))

        example_weights = array_ops.reshape(
            features[weight_column_name],
            shape=[-1]) if weight_column_name else array_ops.ones([batch_size])
        example_ids = features[optimizer.example_id_column]
        sparse_feature_with_values.extend(sparse_features)
        sparse_feature_with_values_weights.extend(sparse_feature_weights)
        examples = dict(sparse_features=sparse_feature_with_values,
                        dense_features=dense_features,
                        example_labels=math_ops.to_float(
                            array_ops.reshape(targets, shape=[-1])),
                        example_weights=example_weights,
                        example_ids=example_ids)
        sdca_variables = dict(
            sparse_features_weights=sparse_feature_with_values_weights,
            dense_features_weights=dense_feature_weights)
        return examples, sdca_variables
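
The helper _dense_tensor_to_sparse_feature_column called above is not part of this snippet. Below is a minimal sketch of what such a helper plausibly looks like, reusing the same module aliases as the example and assuming the SparseFeatureColumn(example_indices, feature_indices, feature_values) constructor from tf.contrib.linear_optimizer: it keeps only the non-zero entries of the one-hot bucket tensor.

from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops


def _dense_tensor_to_sparse_feature_column(dense_tensor):
    """Sketch: converts a dense [batch_size, num_buckets] tensor into a
    SparseFeatureColumn holding only its non-zero entries."""
    # Indices of all non-zero entries; shape [nnz, 2] as (row, column) pairs.
    sparse_indices = array_ops.where(
        math_ops.not_equal(dense_tensor,
                           math_ops.cast(0.0, dense_tensor.dtype)))
    sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
    # Rows become example indices; columns become feature (bucket) indices.
    return SparseFeatureColumn(
        array_ops.reshape(sparse_indices[:, 0], [-1]),
        array_ops.reshape(sparse_indices[:, 1], [-1]),
        array_ops.reshape(math_ops.to_float(sparse_values), [-1]))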
Example #2
        def _training_examples_and_variables():
            """Returns dictionaries for training examples and variables."""
            batch_size = targets.get_shape()[0]

            # Iterate over all feature columns and create appropriate lists for dense
            # and sparse features as well as dense and sparse weights (variables) for
            # SDCA.
            # TODO(sibyl-vie3Poto): Reshape variables stored as values in
            # columns_to_variables dict as 1-dimensional tensors.
            dense_features, sparse_features, sparse_feature_with_values = [], [], []
            dense_feature_weights = []
            sparse_feature_weights, sparse_feature_with_values_weights = [], []
            for column in sorted(columns_to_variables.keys(),
                                 key=lambda x: x.key):
                transformed_tensor = features[column]
                if isinstance(column, layers.feature_column._RealValuedColumn):  # pylint: disable=protected-access
                    # A real-valued column corresponds to a dense feature in SDCA. A
                    # transformed tensor corresponding to a RealValuedColumn should have
                    # rank at most 2. In order to be passed to SDCA, its rank needs to be
                    # exactly 2 (i.e., its shape should be [batch_size, column.dim]).
                    check_rank_op = control_flow_ops.Assert(
                        math_ops.less_equal(array_ops.rank(transformed_tensor),
                                            2),
                        ['transformed_tensor should have rank at most 2.'])
                    # Reshape to [batch_size, dense_column_dimension].
                    with ops.control_dependencies([check_rank_op]):
                        transformed_tensor = array_ops.reshape(
                            transformed_tensor,
                            [array_ops.shape(transformed_tensor)[0], -1])

                    dense_features.append(transformed_tensor)
                    # For real valued columns, the variables list contains exactly one
                    # element.
                    dense_feature_weights.append(
                        columns_to_variables[column][0])
                elif isinstance(column,
                                layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
                    # A bucketized column corresponds to a sparse feature in SDCA. The
                    # bucketized feature is "sparsified" for SDCA by converting it to a
                    # SparseFeatureColumn representing the one-hot encoding of the
                    # bucketized feature.
                    #
                    # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
                    # bucketized feature column to a dense feature in SDCA. This will
                    # likely depend on the number of buckets.
                    dense_bucket_tensor = column._to_dnn_input_layer(
                        transformed_tensor)  # pylint: disable=protected-access
                    sparse_feature_column = _dense_tensor_to_sparse_feature_column(
                        dense_bucket_tensor)
                    sparse_feature_with_values.append(sparse_feature_column)
                    # If a partitioner was used during variable creation, we will have a
                    # list of Variables here larger than 1.
                    vars_to_append = columns_to_variables[column][0]
                    if len(columns_to_variables[column]) > 1:
                        vars_to_append = columns_to_variables[column]
                    sparse_feature_with_values_weights.append(vars_to_append)
                elif isinstance(column,
                                (layers.feature_column._WeightedSparseColumn,  # pylint: disable=protected-access
                                 layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
                                 layers.feature_column._SparseColumn)):  # pylint: disable=protected-access

                    if isinstance(column,
                                  layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
                        id_tensor = column.id_tensor(transformed_tensor)
                        weight_tensor = array_ops.reshape(
                            column.weight_tensor(transformed_tensor).values,
                            [-1])
                    else:
                        id_tensor = transformed_tensor
                        weight_tensor = array_ops.ones(
                            [array_ops.shape(id_tensor.indices)[0]],
                            dtypes.float32)

                    example_ids = array_ops.reshape(id_tensor.indices[:, 0],
                                                    [-1])

                    flat_ids = array_ops.reshape(id_tensor.values, [-1])
                    # Prune invalid IDs (< 0) from the flat_ids, example_ids, and
                    # weight_tensor.  These can come from looking up an OOV entry in the
                    # vocabulary (default value being -1).
                    is_id_valid = math_ops.greater_equal(flat_ids, 0)
                    flat_ids = array_ops.boolean_mask(flat_ids, is_id_valid)
                    example_ids = array_ops.boolean_mask(
                        example_ids, is_id_valid)
                    weight_tensor = array_ops.boolean_mask(
                        weight_tensor, is_id_valid)

                    projection_length = math_ops.reduce_max(flat_ids) + 1
                    # Project ids based on example ids so that we can dedup ids
                    # that occur multiple times for a single example.
                    projected_ids = projection_length * example_ids + flat_ids

                    # Remove any redundant ids.
                    ids, idx = array_ops.unique(projected_ids)
                    # Keep one example id per unique projected id.
                    example_ids_filtered = math_ops.unsorted_segment_min(
                        example_ids, idx,
                        array_ops.shape(ids)[0])

                    # Reproject the unique ids back to the feature id space.
                    reproject_ids = (ids -
                                     projection_length * example_ids_filtered)

                    weights = array_ops.reshape(
                        math_ops.unsorted_segment_sum(weight_tensor, idx,
                                                      array_ops.shape(ids)[0]),
                        [-1])
                    sparse_feature_with_values.append(
                        SparseFeatureColumn(example_ids_filtered,
                                            reproject_ids, weights))
                    # If a partitioner was used during variable creation, we will have a
                    # list of Variables here larger than 1.
                    vars_to_append = columns_to_variables[column][0]
                    if len(columns_to_variables[column]) > 1:
                        vars_to_append = columns_to_variables[column]
                    sparse_feature_with_values_weights.append(vars_to_append)
                else:
                    raise ValueError(
                        'SDCAOptimizer does not support column type %s.' %
                        type(column).__name__)

            example_weights = array_ops.reshape(
                features[weight_column_name],
                shape=[-1]) if weight_column_name else array_ops.ones([batch_size])
            example_ids = features[self._example_id_column]
            sparse_feature_with_values.extend(sparse_features)
            sparse_feature_with_values_weights.extend(sparse_feature_weights)
            examples = dict(sparse_features=sparse_feature_with_values,
                            dense_features=dense_features,
                            example_labels=math_ops.cast(
                                array_ops.reshape(targets, shape=[-1]),
                                dtypes.float32),
                            example_weights=example_weights,
                            example_ids=example_ids)
            sdca_variables = dict(
                sparse_features_weights=sparse_feature_with_values_weights,
                dense_features_weights=dense_feature_weights)
            return examples, sdca_variables
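
The projection trick in this variant (fold each (example id, feature id) pair into a single integer, deduplicate, sum the weights, then unfold) is easiest to follow with concrete numbers. Here is a self-contained numpy sketch of the same logic, with made-up inputs:

import numpy as np

# Hypothetical batch: example 0 mentions feature 3 twice (weights 1.0 and 2.0);
# example 1 mentions features 0 and 3 once each.
example_ids = np.array([0, 0, 1, 1])
flat_ids = np.array([3, 3, 0, 3])
weights = np.array([1.0, 2.0, 0.5, 1.5])

# Fold (example id, feature id) pairs into single integers so that duplicate
# pairs collide on the same value.
projection_length = flat_ids.max() + 1                      # 4
projected_ids = projection_length * example_ids + flat_ids  # [3, 3, 4, 7]

# Deduplicate; inverse plays the role of idx from array_ops.unique.
unique_ids, inverse = np.unique(projected_ids, return_inverse=True)

# Sum the weights of duplicated pairs (the unsorted_segment_sum step).
summed_weights = np.zeros(len(unique_ids))
np.add.at(summed_weights, inverse, weights)                 # [3.0, 0.5, 1.5]

# Unfold: integer division recovers the example ids (what unsorted_segment_min
# computes above), and the remainder recovers the feature ids (the "reproject"
# subtraction in the TF code is equivalent to this modulo).
example_ids_filtered = unique_ids // projection_length      # [0, 1, 1]
feature_ids = unique_ids % projection_length                # [3, 0, 3]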
Example #3
        def _training_examples_and_variables():
            """Returns dictionaries for training examples and variables."""
            batch_size = targets.get_shape()[0]

            # Iterate over all feature columns and create appropriate lists for dense
            # and sparse features as well as dense and sparse weights (variables) for
            # SDCA.
            # TODO(sibyl-vie3Poto): Reshape variables stored as values in
            # columns_to_variables dict as 1-dimensional tensors.
            dense_features, sparse_features, sparse_feature_with_values = [], [], []
            dense_feature_weights = []
            sparse_feature_weights, sparse_feature_with_values_weights = [], []
            for column in sorted(columns_to_variables.keys(),
                                 key=lambda x: x.key):
                transformed_tensor = features[column]
                if isinstance(column, layers.feature_column._RealValuedColumn):  # pylint: disable=protected-access
                    # A real-valued column corresponds to a dense feature in SDCA. A
                    # transformed tensor corresponding to a RealValuedColumn should have
                    # rank at most 2. In order to be passed to SDCA, its rank needs to be
                    # exactly 2 (i.e., its shape should be [batch_size, column.dim]).
                    check_rank_op = control_flow_ops.Assert(
                        math_ops.less_equal(array_ops.rank(transformed_tensor),
                                            2),
                        ['transformed_tensor should have rank at most 2.'])
                    # Reshape to [batch_size, dense_column_dimension].
                    with ops.control_dependencies([check_rank_op]):
                        transformed_tensor = array_ops.reshape(
                            transformed_tensor,
                            [array_ops.shape(transformed_tensor)[0], -1])

                    dense_features.append(transformed_tensor)
                    # For real valued columns, the variables list contains exactly one
                    # element.
                    dense_feature_weights.append(
                        columns_to_variables[column][0])
                elif isinstance(column,
                                layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
                    # A bucketized column corresponds to a sparse feature in SDCA. The
                    # bucketized feature is "sparsified" for SDCA by converting it to a
                    # SparseFeatureColumn representing the one-hot encoding of the
                    # bucketized feature.
                    #
                    # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
                    # bucketized feature column to a dense feature in SDCA. This will
                    # likely depend on the number of buckets.
                    dense_bucket_tensor = column._to_dnn_input_layer(
                        transformed_tensor)  # pylint: disable=protected-access
                    sparse_feature_column = _dense_tensor_to_sparse_feature_column(
                        dense_bucket_tensor)
                    sparse_feature_with_values.append(sparse_feature_column)
                    # For bucketized columns, the variables list contains exactly one
                    # element.
                    sparse_feature_with_values_weights.append(
                        columns_to_variables[column][0])
                elif isinstance(column,
                                (layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
                                 layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
                    sparse_features.append(
                        SparseFeatureColumn(
                            array_ops.reshape(
                                array_ops.split(
                                    value=transformed_tensor.indices,
                                    num_or_size_splits=2,
                                    axis=1)[0], [-1]),
                            array_ops.reshape(transformed_tensor.values, [-1]),
                            None))
                    sparse_feature_weights.append(
                        columns_to_variables[column][0])
                elif isinstance(column,
                                layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
                    id_tensor = column.id_tensor(transformed_tensor)
                    weight_tensor = column.weight_tensor(transformed_tensor)
                    sparse_feature_with_values.append(
                        SparseFeatureColumn(
                            array_ops.reshape(
                                array_ops.split(value=id_tensor.indices,
                                                num_or_size_splits=2,
                                                axis=1)[0], [-1]),
                            array_ops.reshape(id_tensor.values, [-1]),
                            array_ops.reshape(weight_tensor.values, [-1])))
                    sparse_feature_with_values_weights.append(
                        columns_to_variables[column][0])
                else:
                    raise ValueError(
                        'SDCAOptimizer does not support column type %s.' %
                        type(column).__name__)

            example_weights = array_ops.reshape(
                features[weight_column_name],
                shape=[-1]) if weight_column_name else array_ops.ones([batch_size])
            example_ids = features[self._example_id_column]
            sparse_feature_with_values.extend(sparse_features)
            sparse_feature_with_values_weights.extend(sparse_feature_weights)
            examples = dict(sparse_features=sparse_feature_with_values,
                            dense_features=dense_features,
                            example_labels=math_ops.to_float(
                                array_ops.reshape(targets, shape=[-1])),
                            example_weights=example_weights,
                            example_ids=example_ids)
            sdca_variables = dict(
                sparse_features_weights=sparse_feature_with_values_weights,
                dense_features_weights=dense_feature_weights)
            return examples, sdca_variables
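
In all three variants, the returned dictionaries feed an SDCA training step. Below is a hedged usage sketch of that last mile, assuming the SdcaModel(examples, variables, options) constructor from tf.contrib.linear_optimizer and that this code runs inside the closure where _training_examples_and_variables is defined; the option values are illustrative placeholders, not taken from the snippets above.

from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops

examples, sdca_variables = _training_examples_and_variables()
sdca_model = sdca_ops.SdcaModel(
    examples=examples,
    variables=sdca_variables,
    options=dict(symmetric_l1_regularization=0.0,
                 symmetric_l2_regularization=1.0,
                 num_loss_partitions=1,
                 loss_type='logistic_loss'))
train_op = sdca_model.minimize()  # One SDCA update over the batch.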