Example #1
0
def _sparse_intersect_indices(sp_tensor, required_sp_tensor):
  """Restricts sp_tensor to the timesteps that required_sp_tensor also has.

  Both inputs are first padded (via _extend_with_dummy) with a placeholder
  string at each other's indices so that they share the same index set. In
  dense form one batch entry of a padded tensor could look like
  [dummy, dummy, 1]. Entries are then removed in two passes:
    1. entries where the padded required tensor only holds the placeholder,
    2. entries where the padded sp_tensor itself only holds the placeholder.
  Between the two passes empty timesteps are removed so that the remaining
  timesteps line up with the original required_sp_tensor.

  Args:
    sp_tensor: The SparseTensor to filter.
    required_sp_tensor: The SparseTensor whose indices decide what is kept.

  Returns:
    The filtered SparseTensor. When sp_tensor does not hold strings, the values
    are converted with tf.strings.to_number to sp_tensor's value dtype
    (presumably because _extend_with_dummy stringifies them - verify against
    that helper).
  """
  placeholder = 'n/a'
  padded_required = _extend_with_dummy(
      sp_tensor, required_sp_tensor, placeholder)
  padded_source = _extend_with_dummy(
      required_sp_tensor, sp_tensor, placeholder)

  # Pass 1: keep only positions where the required tensor has a real value.
  required_is_real = tf.logical_not(
      tf.equal(padded_required.values, placeholder))
  filtered = tf.sparse_retain(padded_source, required_is_real)

  # Drop empty timesteps first so that the timesteps align with the original
  # required_sp_tensor, then (pass 2) drop the placeholder entries that were
  # only introduced by padding sp_tensor.
  compacted = _remove_empty_timesteps(filtered)
  source_is_real = tf.logical_not(tf.equal(filtered.values, placeholder))
  result = tf.sparse_retain(compacted, source_is_real)

  if sp_tensor.values.dtype == tf.string:
    return result

  # Convert the string values back to the dtype of the input values.
  return tf.SparseTensor(
      indices=result.indices,
      dense_shape=result.dense_shape,
      values=tf.strings.to_number(
          result.values, out_type=sp_tensor.values.dtype))
Example #2
0
def sparse_dropout(x, keep_prob, noise_shape):
    """Apply dropout to the stored entries of a sparse tensor.

    Each stored entry of ``x`` is kept with probability ``keep_prob``; the
    surviving values are scaled by ``1 / keep_prob``. ``noise_shape`` is the
    shape of the random mask handed to ``tf.sparse_retain``, so it should
    describe a 1-D tensor with one element per stored value of ``x``.
    """
    # keep_prob + U[0, 1) lies in [keep_prob, 1 + keep_prob); its floor is 1
    # with probability keep_prob and 0 otherwise.
    uniform_noise = tf.random_uniform(noise_shape)
    keep_mask = tf.cast(tf.floor(keep_prob + uniform_noise), dtype=tf.bool)
    survivors = tf.sparse_retain(x, keep_mask)
    # Rescale the survivors by the inverse keep probability.
    return survivors * (1. / keep_prob)
Example #3
0
def sparse_dropout(x, keep_prob, noise_shape):
    """Drop stored entries of a sparse tensor at random and rescale the rest.

    Parameters
    ----------
    x : tf.sparse.SparseTensor
        Sparse tensor whose stored entries are subject to dropout.
    keep_prob : float
        Probability of keeping a stored entry. Kept values are multiplied
        by ``1 / keep_prob``.
    noise_shape : tuple
        Shape of the random mask. The mask is passed to
        ``tf.sparse_retain``, so it should be 1-D with one element per
        stored value of ``x``.

    Returns
    -------
    tf.sparse.SparseTensor
        The sparse tensor after applying dropout.

    Notes
    -----
    Recent versions of ``tf.nn.dropout`` take ``rate`` instead of
    ``keep_prob``.

    See Also
    --------
    tf.compat.v1.nn.dropout : Computes dropout. (deprecated arguments)
    """
    # Shift uniform noise by keep_prob so that flooring yields 1 (keep) with
    # probability keep_prob and 0 (drop) otherwise.
    shifted_noise = keep_prob + tf.random_uniform(noise_shape)
    retain_flags = tf.cast(tf.floor(shifted_noise), dtype=tf.bool)
    kept = tf.sparse_retain(x, retain_flags)
    inverse_keep = 1. / keep_prob
    return kept * inverse_keep
Example #4
0
def dropout_sparse(x, keep_prob, num_nonzero_elems):
    """Dropout for sparse tensors.

    Keeps each stored entry of ``x`` with probability ``keep_prob`` and
    scales the kept values by ``1 / keep_prob``. ``num_nonzero_elems`` is the
    length of the random mask, i.e. it should equal the number of stored
    values in ``x``.

    Per the original author's note, this currently fails for very large
    sparse tensors (>1M elements).
    """
    # One uniform sample per stored entry; floor(keep_prob + u) is 1 with
    # probability keep_prob, else 0.
    samples = tf.random_uniform([num_nonzero_elems])
    keep_flags = tf.cast(tf.floor(keep_prob + samples), dtype=tf.bool)
    retained = tf.sparse_retain(x, keep_flags)
    return retained * (1. / keep_prob)
Example #5
0
    def _dropout_sparse(self, X, keep_prob, n_nonzero_elems):
        """
        Dropout for sparse tensors.

        Keeps each stored entry of ``X`` with probability ``keep_prob`` and
        multiplies the kept values by ``1 / keep_prob``. ``n_nonzero_elems``
        is the length of the random mask and should equal the number of
        stored values in ``X``.
        """
        # floor(keep_prob + U[0, 1)) is 1 with probability keep_prob, else 0.
        noise = tf.random_uniform([n_nonzero_elems])
        keep_mask = tf.cast(tf.floor(keep_prob + noise), dtype=tf.bool)
        kept = tf.sparse_retain(X, keep_mask)

        rescale = tf.div(1., keep_prob)
        return kept * rescale
def construct_input(sequence_feature_map, categorical_values,
                    categorical_seq_feature, feature_value, mode, normalize,
                    momentum, min_value, max_value, input_keep_prob):
  """Builds the model input tensors from the sequence features.

  Invalid observation values (and, during training, randomly dropped ones) are
  removed from both the value and the code feature, the remaining pairs are
  combined via combine_observation_code_and_values, and a time-difference
  feature is derived from the 'deltaTime' feature. The passed-in dictionary
  itself is not modified.

  Args:
    sequence_feature_map: A dictionary of (Sparse)Tensors of dense shape
      [batch_size, max_sequence_length, None] keyed by the feature name.
    categorical_values: Potential values of the categorical_seq_feature.
    categorical_seq_feature: Name of feature of observation code.
    feature_value: Name of feature of observation value.
    mode: The execution mode, as defined in tf.estimator.ModeKeys.
    normalize: Whether to normalize each lab test.
    momentum: For the batch normalization mean and variance will be updated as
      momentum*old_value + (1-momentum) * new_value.
    min_value: Observation values smaller than this will be capped to min_value.
    max_value: Observation values larger than this will be capped to max_value.
    input_keep_prob: Keep probability for input observation values. Dropout is
      only applied in TRAIN mode.

  Returns:
    A tuple (diff_delta_time, obs_values, indicator):
    - diff_delta_time: Tensor of shape [batch_size, max_seq_length, 1]
      with the deltaTime of the previous step minus the deltaTime of the
      current step (zero for the first step), divided by 60 * 60.
    - obs_values: A dense representation of the observation_values with
                  obs_values[b, t, :] has at most one non-zero value at the
                  position of the corresponding lab test from obs_code_ids with
                  the value of the lab result. A padded Tensor of shape
                  [batch_size, max_sequence_length, vocab_size] of type float32
                  of possibly normalized observation values.
    - indicator: A one-hot encoding of whether a value in obs_values comes from
                 observation_values or is just filled in to be 0. A Tensor of
                 shape [batch_size, max_sequence_length, vocab_size] and type
                 float32.
  """
  with tf.variable_scope('input'):
    # Work on a fresh dictionary in which every SparseTensor is reordered;
    # dense tensors are passed through unchanged.
    features = {}
    for name, tensor in sequence_feature_map.items():
      if isinstance(tensor, tf.SparseTensor):
        tensor = tf.sparse_reorder(tensor)
      features[name] = tensor

    # Filter out invalid values with a sparse retain, so that they are not
    # considered in the normalization.
    raw_values = features[feature_value]
    raw_codes = features[categorical_seq_feature]
    # Future work: Create a flag for the missing value indicator.
    keep_mask = tf.abs(raw_values.values - 9999999.0) > TOLERANCE

    # Input dropout.
    if input_keep_prob < 1.0:
      noise = tf.random_uniform(tf.shape(raw_values.values))
      # The floor is 0. for sums in [input_keep_prob, 1.0) and 1. for sums in
      # [1.0, 1.0 + input_keep_prob).
      dropout_mask = tf.floor(input_keep_prob + noise)
      if mode == tf.estimator.ModeKeys.TRAIN:
        keep_mask = (tf.to_float(keep_mask) * dropout_mask) > 0.5

    # The same mask is applied to values and codes so they stay aligned.
    features[feature_value] = tf.sparse_retain(raw_values, keep_mask)
    features[categorical_seq_feature] = tf.sparse_retain(raw_codes, keep_mask)

    # 1. Construct the sequence of observation values to feed into the RNN
    #    and their indicator.
    # Each observation code gets an id from 0 to vocab_size-1. At each timestep
    # the id of the observation code is looked up and combined with the value
    # of the lab test into a vector that is all zeros except for the lab test
    # value at the id-th position.
    codes = features[categorical_seq_feature]
    vocab_table = contrib_lookup.index_table_from_tensor(
        tuple(categorical_values), num_oov_buckets=0, name='vocab_lookup')
    code_ids = vocab_table.lookup(codes.values)
    code_ids_sparse = tf.sparse_reorder(
        tf.SparseTensor(
            values=code_ids,
            indices=codes.indices,
            dense_shape=codes.dense_shape))
    values_sparse = tf.sparse_reorder(features[feature_value])
    obs_values, indicator = combine_observation_code_and_values(
        code_ids_sparse, values_sparse, len(categorical_values), mode,
        normalize, momentum, min_value, max_value)

    # 2. Compute the diff_delta_time as additional sequence feature.
    # Note, the LSTM is very sensitive to how you encode time.
    delta_time = features['deltaTime']
    # Shift the sequence by one step (repeating the first entry) to get the
    # previous step's value for every position.
    previous_delta_time = tf.concat(
        [delta_time[:, :1, :], delta_time[:, :-1, :]], axis=1)
    diff_delta_time = previous_delta_time - delta_time
    diff_delta_time = tf.to_float(diff_delta_time) / (60.0 * 60.0)

  return diff_delta_time, obs_values, indicator
Example #7
0
    def _process(examples):
        """Supplies input to our model.

        This function supplies input to our model after parsing. It
        optionally adds an age feature, builds the requested crossed
        sequence features, and splits every non-context feature into one
        sparse feature per time window.

        Note that `examples` is modified in place: several entries are
        popped or deleted and the crossed features are added to it.

        Args:
          examples: The dictionary from key to (Sparse)Tensors with context
            and sequence features.

        Returns:
          A dictionary of tensors whose keys are the feature names. Context
          features are passed through under their original key; every other
          feature is emitted once per time window under the key
          '<key>-til-<min_age>'.
        """

        def _index_set(indices, dense_shape):
            """Wraps `indices` in a reordered, zero-valued SparseTensor."""
            return tf.sparse_reorder(
                tf.SparseTensor(
                    indices=indices,
                    values=tf.zeros([tf.shape(indices)[0]], dtype=tf.int32),
                    dense_shape=dense_shape))

        # Combine into a single dictionary.
        feature_map = {}
        # Add age if requested.
        if include_age:
            age_in_seconds = (
                examples[CONTEXT_KEY_PREFIX + 'timestamp'] -
                examples.pop(CONTEXT_KEY_PREFIX + 'Patient.birthDate'))
            age_in_years = tf.to_float(age_in_seconds) / (60 * 60 * 24 * 365.0)
            feature_map[CONTEXT_KEY_PREFIX + AGE_KEY] = age_in_years

        sequence_length = examples.pop(CONTEXT_KEY_PREFIX + 'sequenceLength')
        # Cross the requested features.
        for cross in time_crossed_features:
            # The features may be missing at different rates - we take the union
            # of the indices supplying defaults.
            extended_features = dict()
            dense_shape = tf.concat(
                [[tf.to_int64(tf.shape(sequence_length)[0])],
                 [tf.reduce_max(sequence_length)],
                 tf.constant([1], dtype=tf.int64)],
                axis=0)
            for i, feature in enumerate(cross):
                sp_tensor = examples[SEQUENCE_KEY_PREFIX + feature]
                additional_indices = []
                covered_indices = sp_tensor.indices
                for j, other_feature in enumerate(cross):
                    if i == j:
                        continue
                    other_indices = examples[SEQUENCE_KEY_PREFIX +
                                             other_feature].indices
                    # Indices of the other feature that are not covered yet
                    # by this feature or by previously added indices.
                    additional_indices.append(
                        tf.sets.set_difference(
                            _index_set(other_indices, dense_shape),
                            _index_set(covered_indices,
                                       dense_shape)).indices)
                    covered_indices = tf.concat([sp_tensor.indices] +
                                                additional_indices,
                                                axis=0)

                additional_indices = tf.concat(additional_indices, axis=0)

                # Supply defaults for all other indices.
                default = tf.tile(tf.constant(['n/a']),
                                  multiples=[tf.shape(additional_indices)[0]])

                # Values are joined as strings below, so convert if needed.
                string_value = sp_tensor.values
                if string_value.dtype != tf.string:
                    string_value = tf.as_string(string_value)

                extended_features[feature] = tf.sparse_reorder(
                    tf.SparseTensor(
                        indices=tf.concat(
                            [sp_tensor.indices, additional_indices], axis=0),
                        values=tf.concat([string_value, default], axis=0),
                        dense_shape=dense_shape))

            new_values = tf.strings.join(
                [extended_features[f].values for f in cross], separator='-')
            crossed_sp_tensor = tf.sparse_reorder(
                tf.SparseTensor(
                    indices=extended_features[cross[0]].indices,
                    values=new_values,
                    dense_shape=extended_features[cross[0]].dense_shape))
            examples[SEQUENCE_KEY_PREFIX + '_'.join(cross)] = crossed_sp_tensor
        # Remove unwanted features that are used in the cross but should not be
        # considered outside the cross.
        for cross in time_crossed_features:
            for feature in cross:
                if (feature not in sequence_features
                        and SEQUENCE_KEY_PREFIX + feature in examples):
                    del examples[SEQUENCE_KEY_PREFIX + feature]

        # Flatten sparse tensor to compute event age. This dense tensor also
        # contains padded values. These will not be used when gathering elements
        # from the dense tensor since each sparse feature won't have a value
        # defined for the padding.
        padded_event_age = (
            # Broadcast current time along sequence dimension.
            tf.expand_dims(examples.pop(CONTEXT_KEY_PREFIX + 'timestamp'), 1)
            # Subtract time of events.
            - examples.pop(SEQUENCE_KEY_PREFIX + 'eventId'))

        # Consecutive entries of time_windows form one (max_age, min_age]
        # window each.
        for max_age, min_age in zip(time_windows[:-1], time_windows[1:]):
            padded_in_time_window = tf.logical_and(padded_event_age <= max_age,
                                                   padded_event_age > min_age)

            # .items() instead of the Python-2-only .iteritems(); `examples`
            # is not modified inside this loop.
            for k, v in examples.items():
                if k.startswith(CONTEXT_KEY_PREFIX):
                    continue
                # For each sparse feature entry, look up whether it is in the time
                # window or not.
                in_time_window = tf.gather_nd(padded_in_time_window,
                                              v.indices[:, 0:2])
                v = tf.sparse_retain(v, in_time_window)
                # Collapse all but the first dimension into one.
                sp_tensor = tf.sparse_reshape(v, [v.dense_shape[0], -1])
                if dedup:
                    sp_tensor = _dedup_tensor(sp_tensor)

                feature_map[k + '-til-%d' % min_age] = sp_tensor

        # Context features are passed through unchanged.
        for k, v in examples.items():
            if k.startswith(CONTEXT_KEY_PREFIX):
                feature_map[k] = v
        return feature_map