Example #1
def _get_transformed_features(features, feature_columns):
    """Gets the transformed features from features/feature_columns pair.

  Args:
    features: a dictionary of name to Tensor.
    feature_columns: a list/set of tf.feature_column.

  Returns:
    result_features: a list of the transformed features, sorted by the name.

  Raises:
    ValueError: when unsupported features/columns are passed.
  """
    # pylint:disable=protected-access
    for fc in feature_columns:
        if not isinstance(fc, feature_column_lib._BucketizedColumn):
            raise ValueError(
                'For now, only bucketized_column is supported but '
                'got: {}'.format(fc))
    transformed_features = feature_column_lib._transform_features(
        features, feature_columns)
    # pylint:enable=protected-access
    result_features = []
    for column in sorted(transformed_features, key=lambda tc: tc.name):
        source_name = column.source_column.name
        squeezed_tensor = array_ops.squeeze(transformed_features[column],
                                            axis=1)
        if len(squeezed_tensor.shape) > 1:
            raise ValueError(
                'For now, only supports features equivalent to rank 1 '
                'but column `{}` got: {}'.format(source_name,
                                                 features[source_name].shape))
        result_features.append(squeezed_tensor)
    return result_features
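A minimal usage sketch for the helper above, assuming the snippet's TF 1.x graph-mode environment and its module-level imports (feature_column_lib, array_ops); the column and feature names here are invented for illustration.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

# Hypothetical inputs: one numeric column, bucketized at 18/35/65.
age = tf.feature_column.numeric_column('age')
bucketized_age = tf.feature_column.bucketized_column(
    age, boundaries=[18, 35, 65])
features = {'age': tf.constant([[12.0], [40.0], [70.0]])}

# Each bucketized column yields one rank-1 tensor of bucket ids; with
# several columns the outputs come back sorted by column name.
buckets = _get_transformed_features(features, [bucketized_age])
with tf.Session() as sess:
    print(sess.run(buckets))  # expected: [array([0, 2, 3])]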
Example #2
def _get_transformed_features(features, feature_columns):
  """Gets the transformed features from features/feature_columns pair.

  Args:
    features: a dictionary of name to Tensor.
    feature_columns: a list/set of tf.feature_column.

  Returns:
    result_features: a list of the transformed features, sorted by the name.

  Raises:
    ValueError: when unsupported features/columns are passed.
  """
  # pylint:disable=protected-access
  for fc in feature_columns:
    if not isinstance(fc, feature_column_lib._BucketizedColumn):
      raise ValueError('For now, only bucketized_column is supported but '
                       'got: {}'.format(fc))
  transformed_features = feature_column_lib._transform_features(
      features, feature_columns)
  # pylint:enable=protected-access
  result_features = []
  for column in sorted(transformed_features, key=lambda tc: tc.name):
    source_name = column.source_column.name
    squeezed_tensor = array_ops.squeeze(transformed_features[column], axis=1)
    if len(squeezed_tensor.shape) > 1:
      raise ValueError('For now, only supports features equivalent to rank 1 '
                       'but column `{}` got: {}'.format(
                           source_name, features[source_name].shape))
    result_features.append(squeezed_tensor)
  return result_features
Example #3
def _get_transformed_features(features, sorted_feature_columns):
    """Gets the transformed features from features/feature_columns pair.

  Args:
    features: a dictionary of name to Tensor.
    sorted_feature_columns: a list/set of tf.feature_column, sorted by name.

  Returns:
    result_features: a list of the transformed features, sorted by the name.

  Raises:
    ValueError: when unsupported features/columns are passed.
  """
    # pylint:disable=protected-access
    transformed_features = feature_column_lib._transform_features(
        features, sorted_feature_columns)
    result_features = []
    for column in sorted_feature_columns:
        if isinstance(column, feature_column_lib._BucketizedColumn):
            source_name = column.source_column.name
            squeezed_tensor = array_ops.squeeze(transformed_features[column],
                                                axis=1)
            if len(squeezed_tensor.shape) > 1:
                raise ValueError(
                    'For now, only supports features equivalent to rank 1 '
                    'but column `{}` got: {}'.format(
                        source_name, features[source_name].shape))
            result_features.append(squeezed_tensor)
        elif isinstance(column, feature_column_lib._IndicatorColumn):
            source_name = column.categorical_column.name
            tensor = math_ops.to_int32(transformed_features[column])
            if len(tensor.shape) > 2:
                raise ValueError(
                    'Rank of indicator column must be no more than 2, '
                    'but column `{}` got: {}'.format(
                        source_name, features[source_name].shape))
            unstacked = array_ops.unstack(tensor, axis=1)
            result_features.extend(unstacked)
        else:
            raise ValueError(
                'For now, only bucketized_column and indicator_column are '
                'supported but got: {}'.format(column))
    # pylint:enable=protected-access

    return result_features
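The indicator_column branch above unstacks the one-hot tensor into one rank-1 tensor per vocabulary entry. A hedged sketch under the same TF 1.x assumptions, again with invented names:

pet = tf.feature_column.categorical_column_with_vocabulary_list(
    'pet', ['cat', 'dog', 'fish'])
pet_indicator = tf.feature_column.indicator_column(pet)
features = {'pet': tf.constant([['dog'], ['cat']])}

# The transformed [batch, 3] one-hot tensor is cast to int32 and unstacked
# along axis 1, so a 3-entry vocabulary contributes three tensors of shape
# [batch] to the result list.
result = _get_transformed_features(features, [pet_indicator])
assert len(result) == 3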
Example #4
def _get_transformed_features(features, sorted_feature_columns):
  """Gets the transformed features from features/feature_columns pair.

  Args:
    features: a dictionary of name to Tensor.
    sorted_feature_columns: a list/set of tf.feature_column, sorted by name.

  Returns:
    result_features: a list of the transformed features, sorted by the name.

  Raises:
    ValueError: when unsupported features/columns are passed.
  """
  # pylint:disable=protected-access
  transformed_features = feature_column_lib._transform_features(
      features, sorted_feature_columns)
  result_features = []
  for column in sorted_feature_columns:
    if isinstance(column, feature_column_lib._BucketizedColumn):
      source_name = column.source_column.name
      squeezed_tensor = array_ops.squeeze(transformed_features[column], axis=1)
      if len(squeezed_tensor.shape) > 1:
        raise ValueError('For now, only supports features equivalent to rank 1 '
                         'but column `{}` got: {}'.format(
                             source_name, features[source_name].shape))
      result_features.append(squeezed_tensor)
    elif isinstance(column, feature_column_lib._IndicatorColumn):
      source_name = column.categorical_column.name
      tensor = math_ops.to_int32(transformed_features[column])
      if len(tensor.shape) > 2:
        raise ValueError('Rank of indicator column must be no more than 2, '
                         'but column `{}` got: {}'.format(
                             source_name, features[source_name].shape))
      unstacked = array_ops.unstack(tensor, axis=1)
      result_features.extend(unstacked)
    else:
      raise ValueError(
          'For now, only bucketized_column and indicator_column are supported '
          'but got: {}'.format(column))
  # pylint:enable=protected-access

  return result_features
Example #5
    def _model_fn(features, labels, mode):
        """Function that returns predictions, training loss, and training op."""

        if (isinstance(features, ops.Tensor)
                or isinstance(features, sparse_tensor.SparseTensor)):
            features = {'features': features}
        if feature_columns:
            features = features.copy()

            if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
                features.update(
                    layers.transform_features(features, feature_columns))
            else:
                for fc in feature_columns:
                    tensor = fc_core._transform_features(features, [fc])[fc]  # pylint: disable=protected-access
                    features[fc.name] = tensor

        weights = None
        if weights_name and weights_name in features:
            weights = features.pop(weights_name)

        keys = None
        if keys_name and keys_name in features:
            keys = features.pop(keys_name)

        # If we're doing eval, optionally ignore device_assigner.
        # Also ignore device assigner if we're exporting (mode == INFER)
        dev_assn = device_assigner
        if (mode == model_fn_lib.ModeKeys.INFER
                or (local_eval and mode == model_fn_lib.ModeKeys.EVAL)):
            dev_assn = None

        graph_builder = graph_builder_class(params, device_assigner=dev_assn)

        logits, tree_paths, regression_variance = graph_builder.inference_graph(
            features)

        summary.scalar('average_tree_size', graph_builder.average_size())
        # For binary classification problems, convert probabilities to logits.
        # Includes hack to get around the fact that a probability might be 0 or 1.
        if not params.regression and params.num_classes == 2:
            class_1_probs = array_ops.slice(logits, [0, 1], [-1, 1])
            logits = math_ops.log(
                math_ops.maximum(
                    class_1_probs /
                    math_ops.maximum(1.0 - class_1_probs, EPSILON), EPSILON))

        # labels might be None if we're doing prediction (which brings up the
        # question of why we force everything to adhere to a single model_fn).
        training_graph = None
        training_hooks = []
        if labels is not None and mode == model_fn_lib.ModeKeys.TRAIN:
            with ops.control_dependencies([logits.op]):
                training_graph = control_flow_ops.group(
                    graph_builder.training_graph(features,
                                                 labels,
                                                 input_weights=weights,
                                                 num_trainers=num_trainers,
                                                 trainer_id=trainer_id),
                    state_ops.assign_add(training_util.get_global_step(), 1))

        # Put weights back in
        if weights is not None:
            features[weights_name] = weights

        # TensorForest's training graph isn't calculated directly from the loss
        # like many other models.
        def _train_fn(unused_loss):
            return training_graph

        # Ops are run in lexicographic order of their keys. Run the resource
        # clean-up op last.
        all_handles = graph_builder.get_all_resource_handles()
        ops_at_end = {
            '9: clean up resources':
            control_flow_ops.group(*[
                resource_variable_ops.destroy_resource_op(handle)
                for handle in all_handles
            ])
        }

        if report_feature_importances:
            ops_at_end['1: feature_importances'] = (
                graph_builder.feature_importances())

        training_hooks = [TensorForestRunOpAtEndHook(ops_at_end)]

        if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
            model_ops = model_head.create_model_fn_ops(features=features,
                                                       labels=labels,
                                                       mode=mode,
                                                       train_op_fn=_train_fn,
                                                       logits=logits,
                                                       scope=head_scope)

            if early_stopping_rounds:
                training_hooks.append(
                    TensorForestLossHook(early_stopping_rounds,
                                         early_stopping_loss_threshold=
                                         early_stopping_loss_threshold,
                                         loss_op=model_ops.loss))

            model_ops.training_hooks.extend(training_hooks)

            if keys is not None:
                model_ops.predictions[keys_name] = keys

            if params.inference_tree_paths:
                model_ops.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths

            model_ops.predictions[
                VARIANCE_PREDICTION_KEY] = regression_variance

            if include_all_in_serving:
                # In order to serve the variance we need to add the prediction dict
                # to output_alternatives dict.
                if not model_ops.output_alternatives:
                    model_ops.output_alternatives = {}
                model_ops.output_alternatives[ALL_SERVING_KEY] = (
                    constants.ProblemType.UNSPECIFIED, model_ops.predictions)

            return model_ops

        else:
            # Estimator spec
            estimator_spec = model_head.create_estimator_spec(
                features=features,
                mode=mode,
                labels=labels,
                train_op_fn=_train_fn,
                logits=logits)

            if early_stopping_rounds:
                training_hooks.append(
                    TensorForestLossHook(early_stopping_rounds,
                                         early_stopping_loss_threshold=
                                         early_stopping_loss_threshold,
                                         loss_op=estimator_spec.loss))

            estimator_spec = estimator_spec._replace(
                training_hooks=training_hooks +
                list(estimator_spec.training_hooks))
            if keys is not None:
                estimator_spec.predictions[keys_name] = keys
            if params.inference_tree_paths:
                estimator_spec.predictions[
                    TREE_PATHS_PREDICTION_KEY] = tree_paths
            estimator_spec.predictions[
                VARIANCE_PREDICTION_KEY] = regression_variance

            if include_all_in_serving:
                # In order to serve the variance we need to add the prediction
                # dict to the export_outputs dict.
                outputs = dict(estimator_spec.export_outputs or {})
                outputs[ALL_SERVING_KEY] = PredictOutput(
                    estimator_spec.predictions)
                estimator_spec = estimator_spec._replace(
                    export_outputs=outputs)

            return estimator_spec
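The probability-to-logit conversion in the binary-classification branch above is just a clamped log-odds. Restated in NumPy for clarity, with EPSILON as a stand-in for the module-level constant:

import numpy as np

EPSILON = 1e-8  # stand-in; the real module defines its own value

def prob_to_logit(p, eps=EPSILON):
    # log(p / (1 - p)), kept finite when p is exactly 0 or 1.
    return np.log(np.maximum(p / np.maximum(1.0 - p, eps), eps))

print(prob_to_logit(np.array([0.0, 0.5, 1.0])))
# approximately [-18.42, 0.0, 18.42], i.e. clamped to +/- log(1 / eps)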
Example #6
def extract_features(features, feature_columns, use_core_columns):
  """Extracts columns from a dictionary of features.

  Args:
    features: `dict` of `Tensor` objects.
    feature_columns: A list of feature_columns.
    use_core_columns: Whether to transform the features with core
      (tf.feature_column) columns instead of the contrib feature column ops.

  Returns:
    Eight values:
      - A list of all feature column names.
      - A list of dense floats.
      - A list of sparse float feature indices.
      - A list of sparse float feature values.
      - A list of sparse float feature shapes.
      - A list of sparse int feature indices.
      - A list of sparse int feature values.
      - A list of sparse int feature shapes.
  Raises:
    ValueError: if features is not valid.
  """
  if not features:
    raise ValueError("Features dictionary must be specified.")

  # Make a shallow copy of features to ensure downstream usage
  # is unaffected by modifications in the model function.
  features = copy.copy(features)
  if feature_columns:
    scope = "gbdt"
    with variable_scope.variable_scope(scope):
      feature_columns = list(feature_columns)
      transformed_features = collections.OrderedDict()
      for fc in feature_columns:
        # pylint: disable=protected-access
        if use_core_columns:
          tensor = fc_core._transform_features(features, [fc])[fc]
          transformed_features[fc.name] = tensor
        elif isinstance(fc, feature_column_lib._EmbeddingColumn):
          # pylint: enable=protected-access
          transformed_features[fc.name] = fc_core.input_layer(
              features, [fc], weight_collections=[scope])
        else:
          result = feature_column_ops.transform_features(features, [fc])
          if len(result) > 1:
            raise ValueError("Unexpected number of output features")
          transformed_features[fc.name] = result[list(result.keys())[0]]
    features = transformed_features

  dense_float_names = []
  dense_floats = []
  sparse_float_names = []
  sparse_float_indices = []
  sparse_float_values = []
  sparse_float_shapes = []
  sparse_int_names = []
  sparse_int_indices = []
  sparse_int_values = []
  sparse_int_shapes = []
  for key in sorted(features.keys()):
    tensor = features[key]
    if isinstance(tensor, sparse_tensor.SparseTensor):
      if tensor.values.dtype == dtypes.float32:
        sparse_float_names.append(key)
        sparse_float_indices.append(tensor.indices)
        sparse_float_values.append(tensor.values)
        sparse_float_shapes.append(tensor.dense_shape)
      elif tensor.values.dtype == dtypes.int64:
        sparse_int_names.append(key)
        sparse_int_indices.append(tensor.indices)
        sparse_int_values.append(tensor.values)
        sparse_int_shapes.append(tensor.dense_shape)
      else:
        raise ValueError("Unsupported sparse feature %s with dtype %s." %
                         (tensor.indices.name, tensor.dtype))
    else:
      if tensor.dtype == dtypes.float32:
        if len(tensor.shape) > 1 and tensor.shape[1] > 1:
          unstacked = array_ops.unstack(tensor, axis=1)
          for i in range(len(unstacked)):
            dense_float_names.append(_FEATURE_NAME_TEMPLATE % (key, i))
            dense_floats.append(array_ops.reshape(unstacked[i], [-1, 1]))
        else:
          dense_float_names.append(key)
          dense_floats.append(tensor)
      else:
        raise ValueError("Unsupported dense feature %s with dtype %s." %
                         (tensor.name, tensor.dtype))
  # Feature columns are logically organized into incrementing slots, starting
  # with dense floats, then sparse floats, then sparse ints.
  fc_names = (dense_float_names + sparse_float_names + sparse_int_names)
  return (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
          sparse_float_shapes, sparse_int_indices, sparse_int_values,
          sparse_int_shapes)
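A hedged usage sketch of the slot layout, assuming the snippet's TF 1.x imports. Passing feature_columns=None skips the transform branch, so only the partitioning logic runs; the feature names are invented:

features = {
    'income': tf.constant([[1.5], [2.5]]),  # dense float32
    'clicks': tf.SparseTensor(
        indices=[[0, 0]],
        values=tf.constant([7], dtype=tf.int64),
        dense_shape=[2, 1]),  # sparse int64
}
(fc_names, dense_floats,
 sparse_float_indices, sparse_float_values, sparse_float_shapes,
 sparse_int_indices, sparse_int_values, sparse_int_shapes) = extract_features(
     features, feature_columns=None, use_core_columns=False)

# fc_names == ['income', 'clicks']: dense float slots come first, then
# sparse float slots, then sparse int slots, regardless of dict key order.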
Example #7
  def _model_fn(features, labels, mode):
    """Function that returns predictions, training loss, and training op."""

    if (isinstance(features, ops.Tensor) or
        isinstance(features, sparse_tensor.SparseTensor)):
      features = {'features': features}
    if feature_columns:
      features = features.copy()

      if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
        features.update(layers.transform_features(features, feature_columns))
      else:
        for fc in feature_columns:
          tensor = fc_core._transform_features(features, [fc])[fc]  # pylint: disable=protected-access
          features[fc.name] = tensor

    weights = None
    if weights_name and weights_name in features:
      weights = features.pop(weights_name)

    keys = None
    if keys_name and keys_name in features:
      keys = features.pop(keys_name)

    # If we're doing eval, optionally ignore device_assigner.
    # Also ignore device assigner if we're exporting (mode == INFER)
    dev_assn = device_assigner
    if (mode == model_fn_lib.ModeKeys.INFER or
        (local_eval and mode == model_fn_lib.ModeKeys.EVAL)):
      dev_assn = None

    graph_builder = graph_builder_class(params,
                                        device_assigner=dev_assn)

    logits, tree_paths, regression_variance = graph_builder.inference_graph(
        features)

    summary.scalar('average_tree_size', graph_builder.average_size())
    # For binary classification problems, convert probabilities to logits.
    # Includes hack to get around the fact that a probability might be 0 or 1.
    if not params.regression and params.num_classes == 2:
      class_1_probs = array_ops.slice(logits, [0, 1], [-1, 1])
      logits = math_ops.log(
          math_ops.maximum(class_1_probs / math_ops.maximum(
              1.0 - class_1_probs, EPSILON), EPSILON))

    # labels might be None if we're doing prediction (which brings up the
    # question of why we force everything to adhere to a single model_fn).
    training_graph = None
    training_hooks = []
    if labels is not None and mode == model_fn_lib.ModeKeys.TRAIN:
      with ops.control_dependencies([logits.op]):
        training_graph = control_flow_ops.group(
            graph_builder.training_graph(
                features, labels, input_weights=weights,
                num_trainers=num_trainers,
                trainer_id=trainer_id),
            state_ops.assign_add(training_util.get_global_step(), 1))

    # Put weights back in
    if weights is not None:
      features[weights_name] = weights

    # TensorForest's training graph isn't calculated directly from the loss
    # like many other models.
    def _train_fn(unused_loss):
      return training_graph


    # Ops are run in lexicographic order of their keys. Run the resource
    # clean-up op last.
    all_handles = graph_builder.get_all_resource_handles()
    ops_at_end = {
        '9: clean up resources':
            control_flow_ops.group(*[
                resource_variable_ops.destroy_resource_op(handle)
                for handle in all_handles
            ])
    }

    if report_feature_importances:
      ops_at_end['1: feature_importances'] = (
          graph_builder.feature_importances())

    training_hooks = [TensorForestRunOpAtEndHook(ops_at_end)]

    if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
      model_ops = model_head.create_model_fn_ops(
          features=features,
          labels=labels,
          mode=mode,
          train_op_fn=_train_fn,
          logits=logits,
          scope=head_scope)

      if early_stopping_rounds:
        training_hooks.append(
            TensorForestLossHook(
                early_stopping_rounds,
                early_stopping_loss_threshold=early_stopping_loss_threshold,
                loss_op=model_ops.loss))

      model_ops.training_hooks.extend(training_hooks)

      if keys is not None:
        model_ops.predictions[keys_name] = keys

      if params.inference_tree_paths:
        model_ops.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths

      model_ops.predictions[VARIANCE_PREDICTION_KEY] = regression_variance

      if include_all_in_serving:
        # In order to serve the variance we need to add the prediction dict
        # to output_alternatives dict.
        if not model_ops.output_alternatives:
          model_ops.output_alternatives = {}
        model_ops.output_alternatives[ALL_SERVING_KEY] = (
            constants.ProblemType.UNSPECIFIED, model_ops.predictions)

      return model_ops

    else:
      # Estimator spec
      estimator_spec = model_head.create_estimator_spec(
          features=features,
          mode=mode,
          labels=labels,
          train_op_fn=_train_fn,
          logits=logits)

      if early_stopping_rounds:
        training_hooks.append(
            TensorForestLossHook(
                early_stopping_rounds,
                early_stopping_loss_threshold=early_stopping_loss_threshold,
                loss_op=estimator_spec.loss))

      estimator_spec = estimator_spec._replace(
          training_hooks=training_hooks + list(estimator_spec.training_hooks))
      if keys is not None:
        estimator_spec.predictions[keys_name] = keys
      if params.inference_tree_paths:
        estimator_spec.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths
      estimator_spec.predictions[VARIANCE_PREDICTION_KEY] = regression_variance

      if include_all_in_serving:
        # In order to serve the variance we need to add the prediction dict
        # to the export_outputs dict.
        outputs = dict(estimator_spec.export_outputs or {})
        outputs[ALL_SERVING_KEY] = PredictOutput(estimator_spec.predictions)
        estimator_spec = estimator_spec._replace(export_outputs=outputs)

      return estimator_spec
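One detail worth calling out in both _model_fn variants: the digit prefixes on the ops_at_end keys. Per the comment in the code, the end-of-run hook executes its ops in lexicographic key order, which is why resource clean-up is keyed '9: ...' so it runs last. A tiny illustration with placeholder values:

ops_at_end = {
    '9: clean up resources': 'destroy_resources_op',  # placeholder values
    '1: feature_importances': 'importances_op',
}
for key in sorted(ops_at_end):
    print(key)  # '1: feature_importances', then '9: clean up resources'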