def testSelectLastActivations(self):
    """Check `select_last_activations` against a numpy reference.

    Random activations and per-example lengths are generated with a fixed
    seed, the op is evaluated in a session, and every returned row is
    compared with `activations[i, sequence_length[i] - 1, :]`.
    """
    batch_size = 4
    padded_length = 6
    num_classes = 4

    # The draw order (lengths first, activations second) is kept so the seeded
    # fixtures stay the same.
    np.random.seed(4444)
    sequence_length = np.random.randint(0, padded_length + 1, batch_size)
    activations = np.random.rand(batch_size, padded_length, num_classes)
    # NOTE(review): a drawn length of 0 would make the numpy reference below
    # read index -1 (the last padded step) - confirm the seed never yields 0.

    activations_t = constant_op.constant(activations, dtype=dtypes.float32)
    lengths_t = constant_op.constant(sequence_length, dtype=dtypes.int32)
    last_activations_t = rnn_common.select_last_activations(
        activations_t, lengths_t)

    with session.Session() as sess:
        last_activations = sess.run(last_activations_t)

    expected_activations_shape = [batch_size, num_classes]
    shape_message = 'Wrong activations shape. Expected {}; got {}.'.format(
        expected_activations_shape, last_activations.shape)
    np.testing.assert_equal(
        expected_activations_shape, last_activations.shape, shape_message)

    for i, length in enumerate(sequence_length):
        expected_activations = activations[i, length - 1, :]
        actual_activations = last_activations[i, :]
        row_message = ('Unexpected logit value at index [{}, :].'
                       ' Expected {}; got {}.').format(
                           i, expected_activations, actual_activations)
        np.testing.assert_almost_equal(
            expected_activations, actual_activations, err_msg=row_message)
def testSelectLastActivations(self):
    """Check `select_last_activations` against a numpy reference.

    Random activations and per-example lengths are generated with a fixed
    seed, the op is evaluated in a session, and every returned row is
    compared with `activations[i, sequence_length[i] - 1, :]`.
    """
    batch_size = 4
    padded_length = 6
    num_classes = 4

    # The draw order (lengths first, activations second) is kept so the seeded
    # fixtures stay the same.
    np.random.seed(4444)
    sequence_length = np.random.randint(0, padded_length + 1, batch_size)
    activations = np.random.rand(batch_size, padded_length, num_classes)
    # NOTE(review): a drawn length of 0 would make the numpy reference below
    # read index -1 (the last padded step) - confirm the seed never yields 0.

    activations_t = constant_op.constant(activations, dtype=dtypes.float32)
    lengths_t = constant_op.constant(sequence_length, dtype=dtypes.int32)
    last_activations_t = rnn_common.select_last_activations(
        activations_t, lengths_t)

    with session.Session() as sess:
        last_activations = sess.run(last_activations_t)

    expected_activations_shape = [batch_size, num_classes]
    shape_message = 'Wrong activations shape. Expected {}; got {}.'.format(
        expected_activations_shape, last_activations.shape)
    np.testing.assert_equal(
        expected_activations_shape, last_activations.shape, shape_message)

    for i, length in enumerate(sequence_length):
        expected_activations = activations[i, length - 1, :]
        actual_activations = last_activations[i, :]
        row_message = ('Unexpected logit value at index [{}, :].'
                       ' Expected {}; got {}.').format(
                           i, expected_activations, actual_activations)
        np.testing.assert_almost_equal(
            expected_activations, actual_activations, err_msg=row_message)
def model_fn(features, labels, mode):
    """Build the stacked-LSTM regression graph for one execution mode.

    The sparse feature under `COLOR_NAME_KEY` is densified, one-hot encoded
    and fed through a stack of LSTM cells; the output at each sequence's
    last step goes through ReLU dense layers and a final linear layer.

    Args:
      features: Mapping holding the `COLOR_NAME_KEY` sparse tensor and the
        `SEQUENCE_LENGTH_KEY` lengths.
      labels: Regression targets; only read when `mode` is not INFER.
      mode: One of `tf.contrib.learn.ModeKeys`.

    Returns:
      A `tf.contrib.learn.ModelFnOps` with predictions, and with loss /
      train_op populated according to `mode`.
    """
    color_name = features[COLOR_NAME_KEY]
    sequence_length = features[SEQUENCE_LENGTH_KEY]

    # Padding positions get the extra index len(CHARACTERS), hence the one-hot
    # depth of len(CHARACTERS) + 1.
    num_characters = len(CHARACTERS)
    dense_color_name = tf.sparse_tensor_to_dense(
        color_name, default_value=num_characters)
    color_name_onehot = tf.one_hot(dense_color_name, depth=num_characters + 1)

    # One LSTM cell per requested layer size, stacked into a single cell.
    stacked_cell = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.LSTMCell(size) for size in rnn_cell_sizes])

    # See https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn
    outputs, _ = tf.nn.dynamic_rnn(
        cell=stacked_cell,
        inputs=color_name_onehot,
        sequence_length=sequence_length,
        dtype=tf.float32)

    # Keep only the output at the last valid step of every sequence.
    hidden = rnn_common.select_last_activations(outputs, sequence_length)

    # Fully connected layers on top of the last RNN output.
    for units in dnn_layer_sizes:
        hidden = tf.layers.dense(hidden, units, activation=tf.nn.relu)

    # Linear output layer.
    predictions = tf.layers.dense(hidden, label_dimension)

    loss = None
    train_op = None
    if mode != tf.contrib.learn.ModeKeys.INFER:
        loss = tf.losses.mean_squared_error(labels, predictions)
        if mode == tf.contrib.learn.ModeKeys.TRAIN:
            train_op = tf.contrib.layers.optimize_loss(
                loss,
                tf.contrib.framework.get_global_step(),
                optimizer=optimizer,
                learning_rate=learning_rate)

    return tf.contrib.learn.ModelFnOps(
        mode, predictions=predictions, loss=loss, train_op=train_op)
def compute_attention(seq_output, last_output, hidden_layer_dim, seq_mask,
                      sequence_length):
    """Constructs attention of the last_output as query and the sequence output.

    The attention is the dot-product of the last_output (the final RNN
    output), with the seq_output (the RNN's output at each step). Here the
    final RNN output is considered as the "query" or "context" vector. The
    final attention output is a weighted sum of the RNN's outputs at all
    steps.

    Details:
      alpha_i = seq_output_i * last_output
      beta is then obtained by normalizing alpha over the non-padded steps:
      beta_i = exp(alpha_i) / sum_j exp(alpha_j)
      The new attention vector is then the beta-weighted sum over the
      seq_output:
      attention_vector = sum_i beta_i * seq_output_i

    If hidden_layer_dim > 0 then before computing alpha the seq_output and
    the last_output are sent through two separate hidden layers, and the
    weighted sum is taken over the transformed seq_output.
      seq_output = hidden_layer(seq_output)
      last_output = hidden_layer(last_output)

    Args:
      seq_output: The raw rnn output of shape [batch_size,
        max_sequence_length, rnn_size].
      last_output: The last output of the rnn of shape [batch_size, rnn_size].
      hidden_layer_dim: If 0 no hidden layer is applied before multiplying
        the last_output with the seq_output.
      seq_mask: A float Tensor of shape [batch_size, max_sequence_length, 1]
        that is 1.0 on real timesteps and 0.0 on padded ones (the form
        produced by `tf.sequence_mask`).
      sequence_length: Sequence length (before padding), Tensor of shape
        [batch_size].

    Returns:
      Attention output with shape [batch_size, rnn_size] (or
      [batch_size, hidden_layer_dim] when a hidden layer is applied).
      The attention beta tensor of shape [batch_size, max_sequence_length, 1].
    """
    # Logit assigned to padded steps so that softmax gives them ~0 weight.
    masked_logit = -1e9

    # Compute the weights.
    if hidden_layer_dim > 0:
        last_output = tf.layers.dense(
            last_output, hidden_layer_dim, activation=tf.nn.relu6)
        seq_output = tf.layers.dense(
            seq_output, hidden_layer_dim, activation=tf.nn.relu6)
    last_output = tf.expand_dims(last_output, 1)  # [batch_size, 1, rnn_size]
    tmp = tf.multiply(seq_output, last_output)  # dim 1: broadcast
    alpha_tensor = tf.reduce_sum(tmp, 2)  # [b, max_seq_len]

    # Multiplying a logit by 0 does NOT remove it from the softmax (exp(0) is
    # 1), so padded steps used to keep a share of the attention. Pushing them
    # to a large negative value excludes them; real steps are unchanged.
    step_mask = tf.squeeze(seq_mask, axis=2)  # [b, max_seq_len]
    alpha_tensor = alpha_tensor * step_mask + (1.0 - step_mask) * masked_logit

    beta_tensor = tf.nn.softmax(alpha_tensor)  # using default dim -1
    beta_tensor = tf.expand_dims(beta_tensor, -1)  # [b, max_seq_len, 1]

    # Compute weighted sum of the (possibly transformed) rnn outputs over all
    # steps.
    tmp = seq_output * beta_tensor  # last dim: use "broadcast"
    rnn_outputs_weighted_sum = tf.reduce_sum(tmp, 1)  # [b, rnn_size]

    # Summary only: the attention weight placed on each sequence's last step.
    last_beta = rnn_common.select_last_activations(
        beta_tensor, tf.to_int32(sequence_length))
    tf.summary.histogram('last_beta_attention', last_beta)
    return rnn_outputs_weighted_sum, beta_tensor
def _single_value_predictions(activations,
                              sequence_length,
                              target_column,
                              problem_type,
                              predict_probabilities):
    """Turns RNN `activations` into predictions for single value models.

    Only the activation at each sequence's last step is used. The returned
    `dict` always has one prediction entry, keyed by `PredictionKey.CLASSES`
    for classification problems and `PredictionKey.SCORES` otherwise. When
    `predict_probabilities` is `True` it additionally has a
    `PredictionKey.PROBABILITIES` entry, and the prediction entry is the
    argmax of those probabilities along axis 1.

    Args:
      activations: Output from an RNN. Should have dtype `float32` and shape
        `[batch_size, padded_length, ?]`.
      sequence_length: A `Tensor` with shape `[batch_size]` and dtype `int32`
        containing the length of each sequence in the batch. If `None`,
        sequences are assumed to be unpadded.
      target_column: An initialized `TargetColumn`, used to calculate
        predictions.
      problem_type: Either `ProblemType.CLASSIFICATION` or
        `ProblemType.LINEAR_REGRESSION`.
      predict_probabilities: A Python boolean, indicating whether
        probabilities should be returned. Should only be set to `True` for
        classification/logistic regression problems.

    Returns:
      A `dict` mapping strings to `Tensors`.
    """
    with ops.name_scope('SingleValuePrediction'):
        last_activations = rnn_common.select_last_activations(
            activations, sequence_length)

        if problem_type == constants.ProblemType.CLASSIFICATION:
            predictions_name = prediction_key.PredictionKey.CLASSES
        else:
            predictions_name = prediction_key.PredictionKey.SCORES

        if not predict_probabilities:
            predictions = target_column.logits_to_predictions(
                last_activations, proba=False)
            return {predictions_name: predictions}

        probabilities = target_column.logits_to_predictions(
            last_activations, proba=True)
        return {
            prediction_key.PredictionKey.PROBABILITIES: probabilities,
            predictions_name: math_ops.argmax(probabilities, 1),
        }
def construct_logits(diff_delta_time, obs_values, indicator, sequence_length,
                     seq_mask, hparams, reuse):
    """Builds the RNN logits, optionally followed by an attention head.

    Args:
      diff_delta_time: Difference between two consecutive time steps.
      obs_values: A dense representation of the observation_values with
        obs_values[b, t, :] has at most one non-zero value at the position of
        the corresponding lab test from obs_code_ids with the value of the
        lab result. A padded Tensor of shape [batch_size,
        max_sequence_length, vocab_size] of type float32 of possibly
        normalized observation values.
      indicator: A one-hot encoding of whether a value in obs_values comes
        from observation_values or is just filled in to be 0. A Tensor of
        shape [batch_size, max_sequence_length, vocab_size] and type float32.
      sequence_length: Sequence length (before padding), Tensor of shape
        [batch_size].
      seq_mask: A Tensor of shape [batch_size, max_sequence_length, 1]
        indicating which timesteps are padded.
      hparams: Hyper parameters.
      reuse: Boolean indicator of whether to re-use the variables.

    Returns:
      - Logits: A Tensor of shape [batch, {max_sequence_length,1}, 1].
      - Weights: Defaults to None. Only populated to a Tensor of shape
        [batch, max_sequence_length, 1] if hparams.use_rnn_attention is True.
    """
    logits, raw_output = construct_rnn_logits(
        diff_delta_time,
        obs_values,
        indicator,
        sequence_length,
        hparams.rnn_size,
        hparams.variational_recurrent_keep_prob,
        hparams.variational_input_keep_prob,
        hparams.variational_output_keep_prob,
        reuse)

    # Without attention the per-step RNN logits are returned untouched.
    if not hparams.use_rnn_attention:
        return logits, None

    with tf.variable_scope('logits/rnn/attention', reuse=reuse) as sc:
        # The RNN output at the last valid step is the attention query.
        query = rnn_common.select_last_activations(
            raw_output, tf.to_int32(sequence_length))
        weighted_final_output, weight = compute_attention(
            raw_output, query, hparams.attention_hidden_layer_dim, seq_mask,
            sequence_length)
        # Single-unit linear layer on the attention output; the scope object
        # is passed as the layer name, as in the original call.
        attention_logits = tf.layers.dense(
            weighted_final_output, 1, name=sc, reuse=reuse, activation=None)
        return attention_logits, weight
def _single_value_predictions(activations,
                              sequence_length,
                              target_column,
                              problem_type,
                              predict_probabilities):
    """Turns RNN `activations` into predictions for single value models.

    Only the activation at each sequence's last step is used. The returned
    `dict` always has one prediction entry, keyed by `PredictionKey.CLASSES`
    for classification problems and `PredictionKey.SCORES` otherwise. When
    `predict_probabilities` is `True` it additionally has a
    `PredictionKey.PROBABILITIES` entry, and the prediction entry is the
    argmax of those probabilities along axis 1.

    Args:
      activations: Output from an RNN. Should have dtype `float32` and shape
        `[batch_size, padded_length, ?]`.
      sequence_length: A `Tensor` with shape `[batch_size]` and dtype `int32`
        containing the length of each sequence in the batch. If `None`,
        sequences are assumed to be unpadded.
      target_column: An initialized `TargetColumn`, used to calculate
        predictions.
      problem_type: Either `ProblemType.CLASSIFICATION` or
        `ProblemType.LINEAR_REGRESSION`.
      predict_probabilities: A Python boolean, indicating whether
        probabilities should be returned. Should only be set to `True` for
        classification/logistic regression problems.

    Returns:
      A `dict` mapping strings to `Tensors`.
    """
    with ops.name_scope('SingleValuePrediction'):
        last_activations = rnn_common.select_last_activations(
            activations, sequence_length)

        if problem_type == constants.ProblemType.CLASSIFICATION:
            predictions_name = prediction_key.PredictionKey.CLASSES
        else:
            predictions_name = prediction_key.PredictionKey.SCORES

        if not predict_probabilities:
            predictions = target_column.logits_to_predictions(
                last_activations, proba=False)
            return {predictions_name: predictions}

        probabilities = target_column.logits_to_predictions(
            last_activations, proba=True)
        return {
            prediction_key.PredictionKey.PROBABILITIES: probabilities,
            predictions_name: math_ops.argmax(probabilities, 1),
        }
def _single_value_loss(
        activations, labels, sequence_length, target_column, features):
    """Maps `activations` from the RNN to loss for single value models.

    The activation at each sequence's last step is selected and handed to
    `target_column.loss` together with `labels` and `features`.

    Args:
      activations: Output from an RNN. Should have dtype `float32` and shape
        `[batch_size, padded_length, ?]`.
      labels: A `Tensor` with length `[batch_size]`.
      sequence_length: A `Tensor` with shape `[batch_size]` and dtype `int32`
        containing the length of each sequence in the batch. If `None`,
        sequences are assumed to be unpadded.
      target_column: An initialized `TargetColumn`, used to calculate the
        loss.
      features: A `dict` containing the input and (optionally) sequence
        length information and initial state.

    Returns:
      A scalar `Tensor` containing the loss.
    """
    with ops.name_scope('SingleValueLoss'):
        final_step_activations = rnn_common.select_last_activations(
            activations, sequence_length)
        loss = target_column.loss(final_step_activations, labels, features)
    return loss
def _single_value_loss(
        activations, labels, sequence_length, target_column, features):
    """Maps `activations` from the RNN to loss for single value models.

    The activation at each sequence's last step is selected and handed to
    `target_column.loss` together with `labels` and `features`.

    Args:
      activations: Output from an RNN. Should have dtype `float32` and shape
        `[batch_size, padded_length, ?]`.
      labels: A `Tensor` with length `[batch_size]`.
      sequence_length: A `Tensor` with shape `[batch_size]` and dtype `int32`
        containing the length of each sequence in the batch. If `None`,
        sequences are assumed to be unpadded.
      target_column: An initialized `TargetColumn`, used to calculate the
        loss.
      features: A `dict` containing the input and (optionally) sequence
        length information and initial state.

    Returns:
      A scalar `Tensor` containing the loss.
    """
    with ops.name_scope('SingleValueLoss'):
        final_step_activations = rnn_common.select_last_activations(
            activations, sequence_length)
        loss = target_column.loss(final_step_activations, labels, features)
    return loss
def model_fn(features, labels, mode):
    """Creates the prediction, loss, and train ops.

    In TRAIN mode the model either predicts at every step (labels are
    repeated over time) or only at the last step. In EVAL and PREDICT a
    single prediction per example is made from the last step (or from the
    attention head), and in PREDICT the requested attributions are added to
    the predictions.

    Args:
      features: A dictionary of tensors keyed by the feature name.
      labels: A dictionary of label tensors keyed by the label key. In TRAIN
        mode with sequence prediction the entry under `hparams.label_key` is
        replaced in place by its time-tiled version.
      mode: The execution mode, as defined in tf.estimator.ModeKeys.

    Returns:
      EstimatorSpec with the mode, prediction, loss, train_op,
      eval_metric_ops and export_outputs (a dictionary specifying the output
      for a serving request, only populated in PREDICT mode).

    Raises:
      ValueError: If an attribution is not a `tf.SparseTensor`.
    """
    # 1. Construct input to RNN
    sequence_feature_map = {
        k: features[input_fn.SEQUENCE_KEY_PREFIX + k]
        for k in hparams.sequence_features
    }
    sequence_length = tf.squeeze(
        features[input_fn.CONTEXT_KEY_PREFIX + 'sequenceLength'],
        axis=1,
        name='sq_seq_len')
    tf.summary.scalar('sequence_length', tf.reduce_mean(sequence_length))
    diff_delta_time, obs_values, indicator = construct_input(
        sequence_feature_map, hparams.categorical_values,
        hparams.categorical_seq_feature, hparams.feature_value, mode,
        hparams.normalize, hparams.momentum, hparams.min_value,
        hparams.max_value, hparams.input_keep_prob)

    # 1.0 on real timesteps, 0.0 on padding; shape [batch, max_seq_len, 1].
    seq_mask = tf.expand_dims(
        tf.sequence_mask(sequence_length, dtype=tf.float32), axis=2)
    logits, weights = construct_logits(
        diff_delta_time,
        obs_values,
        indicator,
        sequence_length,
        seq_mask,
        hparams,
        reuse=False)

    all_attribution_dict = {}
    if mode == tf.estimator.ModeKeys.TRAIN:
        if hparams.sequence_prediction:
            assert not hparams.use_rnn_attention
            # If we train a sequence_prediction we repeat the labels over
            # time.
            label_tensor = labels[hparams.label_key]
            labels[hparams.label_key] = tf.tile(
                tf.expand_dims(label_tensor, 2),
                multiples=[1, tf.shape(logits)[1], 1])
            if hparams.volatility_loss_factor > 0.0:
                volatility = tf.reduce_sum(
                    tf.square(seq_mask *
                              compute_prediction_diff_attribution(logits)))
                tf.add_to_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES,
                    volatility * hparams.volatility_loss_factor)
        elif not hparams.use_rnn_attention:
            logits = rnn_common.select_last_activations(
                logits, tf.to_int32(sequence_length))
    else:
        # The attention head already yields one logit per example; every
        # other configuration yields per-step logits whose last valid step
        # must be selected. This mirrors the TRAIN branch above and
        # compute_path_integrated_gradient_attribution. The previous test on
        # hparams.sequence_prediction left non-sequence, non-attention models
        # with unselected per-step logits at eval/predict time.
        if hparams.use_rnn_attention:
            last_logits = logits
        else:
            last_logits = rnn_common.select_last_activations(
                logits, tf.to_int32(sequence_length))

        if mode == tf.estimator.ModeKeys.PREDICT:
            delta_time = sequence_feature_map['deltaTime']
            all_attributions = {}
            if hparams.include_gradients_attribution:
                all_attributions['gradient_last'] = (
                    compute_gradient_attribution(
                        last_logits, obs_values, indicator))
            if hparams.include_gradients_sum_time_attribution:
                assert not hparams.use_rnn_attention
                all_attributions['gradient_sum'] = (
                    compute_gradient_attribution(
                        _predictions_for_gradients(
                            logits,
                            seq_mask,
                            delta_time,
                            hparams.attribution_max_delta_time,
                            averaged=False), obs_values, indicator))
            if hparams.include_gradients_avg_time_attribution:
                assert not hparams.use_rnn_attention
                all_attributions['gradient_avg'] = (
                    compute_gradient_attribution(
                        _predictions_for_gradients(
                            logits,
                            seq_mask,
                            delta_time,
                            hparams.attribution_max_delta_time,
                            averaged=True), obs_values, indicator))
            if hparams.include_path_integrated_gradients_attribution:
                all_attributions['integrated_gradient'] = (
                    compute_path_integrated_gradient_attribution(
                        obs_values, indicator, diff_delta_time, delta_time,
                        sequence_length, seq_mask, hparams))
            if hparams.use_rnn_attention:
                all_attributions['rnn_attention'] = weights
            if hparams.include_diff_sequence_prediction_attribution:
                all_attributions['diff_sequence'] = (
                    compute_prediction_diff_attribution(logits))

            all_attribution_dict = {}
            for attribution_name, attribution in all_attributions.items():
                attribution_dict = convert_attribution(
                    attribution,
                    sequence_feature_map,
                    seq_mask,
                    delta_time,
                    hparams.attribution_threshold,
                    hparams.attribution_max_delta_time,
                    prefix=attribution_name + '-')
                all_attribution_dict.update(attribution_dict)
            if hparams.include_sequence_prediction:
                # Add the predictions at each time step to the attention
                # dictionary.
                attribution_indices = tf.where(seq_mask > 0.5)
                all_attribution_dict['predictions'] = tf.sparse.expand_dims(
                    tf.SparseTensor(
                        indices=attribution_indices,
                        values=tf.gather_nd(
                            tf.sigmoid(logits), attribution_indices),
                        dense_shape=tf.to_int64(tf.shape(delta_time))),
                    axis=1)
        # At test/inference time we only make a single prediction even if we
        # did sequence_prediction during training.
        logits = last_logits
        seq_mask = None

    probabilities = tf.sigmoid(logits)
    classes = probabilities > 0.5
    predictions = {
        PredictionKeys.LOGITS: logits,
        PredictionKeys.PROBABILITIES: probabilities,
        PredictionKeys.CLASSES: classes
    }

    # Calculate the loss for TRAIN and EVAL, but not PREDICT.
    if mode == tf.estimator.ModeKeys.PREDICT:
        loss = None
    else:
        loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=labels[hparams.label_key],
            logits=predictions[PredictionKeys.LOGITS])
        # The per-step mask only exists in TRAIN. In EVAL a single prediction
        # is scored and seq_mask has been reset to None above, so multiplying
        # by it would fail.
        if hparams.sequence_prediction and seq_mask is not None:
            loss *= seq_mask
        loss = tf.reduce_mean(loss)
        regularization_losses = tf.losses.get_regularization_losses()
        if regularization_losses:
            tf.summary.scalar('loss/prior_regularization', loss)
            regularization_loss = tf.add_n(regularization_losses)
            tf.summary.scalar('loss/regularization_loss',
                              regularization_loss)
            loss += regularization_loss
        tf.summary.scalar('loss', loss)

    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(
            learning_rate=hparams.learning_rate,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-8)
        optimizer = tf.contrib.estimator.clip_gradients_by_norm(
            optimizer, 6.0)
        train_op = tf.contrib.training.create_train_op(
            total_loss=loss, optimizer=optimizer, summarize_gradients=False)

    if mode != tf.estimator.ModeKeys.TRAIN:
        # Sparse attributions are flattened into three dense prediction
        # entries (indices, values, shape) each.
        for k, v in all_attribution_dict.items():
            if not isinstance(v, tf.SparseTensor):
                raise ValueError('Expect attributions to be in SparseTensor, '
                                 'getting %s for feature %s' %
                                 (v.__class__.__name__, k))
            predictions['attention_attribution,%s,indices' % k] = v.indices
            predictions['attention_attribution,%s,values' % k] = v.values
            predictions['attention_attribution,%s,shape' % k] = v.dense_shape

    eval_metric_ops = {}
    if mode == tf.estimator.ModeKeys.EVAL:
        # Labels are only tiled over time in TRAIN, so in EVAL this is the
        # original label. Binding it here fixes the unbound `label_tensor`
        # that previously made every EVAL run fail.
        label_tensor = labels[hparams.label_key]
        auc = tf.metrics.auc
        prob_k = PredictionKeys.PROBABILITIES
        class_k = PredictionKeys.CLASSES
        m = 'careful_interpolation'
        metric_fn_dict = {
            'auc-roc':
                lambda l, p: auc(
                    l, p[prob_k], curve='ROC', summation_method=m),
            'auc-pr':
                lambda l, p: auc(
                    l, p[prob_k], curve='PR', summation_method=m),
            'accuracy':
                lambda l, p: tf.metrics.accuracy(l, p[class_k]),
        }
        for (k, f) in metric_fn_dict.items():
            eval_metric_ops[k] = f(label_tensor, predictions)

    # Define the output for serving.
    export_outputs = {}
    if mode == tf.estimator.ModeKeys.PREDICT:
        export_outputs = {
            'mortality': tf.estimator.export.PredictOutput(predictions)
        }

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metric_ops,
        export_outputs=export_outputs)
def compute_path_integrated_gradient_attribution(
        obs_values,
        indicator,
        diff_delta_time,
        delta_time,
        sequence_length,
        seq_mask,
        hparams,
        construct_logits_fn=None):
    """Constructs the attribution of what inputs result in a higher prediction.

    Attribution here refers to the integrated gradients as defined here
    https://arxiv.org/pdf/1703.01365.pdf and approximated for the j-th
    variable via
      (x-x') * 1/num_steps * sum_i of the derivative of
      F(x'+(x-x')*alpha_i) w.r.t. its j-th input,
    where alpha_i = i / (num_steps - 1) for i = 0..num_steps-1 (a single
    step uses alpha = 1), x' is the most recent value before
    attribution_max_delta_time and x the subsequent observation values from
    the same lab test. x'+(x-x')*alpha is the linear interpolation between
    x' and x.

    Args:
      obs_values: A dense representation of the observation_values with
        obs_values[b, t, :] has at most one non-zero value at the position of
        the corresponding lab test from obs_code_ids with the value of the
        lab result. A padded Tensor of shape [batch_size,
        max_sequence_length, vocab_size] of type float32 of possibly
        normalized observation values.
      indicator: A one-hot encoding of whether a value in obs_values comes
        from observation_values or is just filled in to be 0. A Tensor of
        shape [batch_size, max_sequence_length, vocab_size] and type float32.
      diff_delta_time: Difference between two consecutive time steps.
      delta_time: A Tensor of shape [batch_size, max_sequence_length]
        describing the time to prediction.
      sequence_length: Sequence length (before padding), Tensor of shape
        [batch_size].
      seq_mask: A Tensor of shape [batch_size, max_sequence_length, 1]
        indicating which timesteps are padded.
      hparams: Hyper parameters.
      construct_logits_fn: A method constructing the logits with the same
        signature as construct_logits. If None, construct_logits is used.

    Returns:
      A Tensor of shape [batch, max_sequence_length, 1] of the gradient of
      the prediction as a function of the lab result at that batch-entry
      time.

    Raises:
      ValueError: If hparams.path_integrated_gradients_num_steps is < 1.
    """
    num_steps = hparams.path_integrated_gradients_num_steps
    if num_steps < 1:
        # Previously this only surfaced as an opaque tf.add_n([]) ValueError.
        raise ValueError(
            'path_integrated_gradients_num_steps must be >= 1, got %s' %
            num_steps)
    # Resolve the default once instead of on every loop iteration.
    if not construct_logits_fn:
        construct_logits_fn = construct_logits

    last_obs_values_0 = _most_recent_obs_value(
        obs_values, indicator, delta_time, hparams.attribution_max_delta_time)
    gradients = []
    # We need to limit the diff over the base to timesteps after base.
    last_obs_values = last_obs_values_0 * (
        tf.to_float(indicator) *
        tf.to_float(delta_time < hparams.attribution_max_delta_time))
    obs_values_with_last_replaced = obs_values * tf.to_float(
        delta_time >= hparams.attribution_max_delta_time) + last_obs_values
    diff_over_base = obs_values - obs_values_with_last_replaced

    for i in range(num_steps):
        # Interpolation coefficient going from the baseline (0) to the actual
        # input (1). A single step evaluates at the actual input rather than
        # dividing by zero.
        if num_steps > 1:
            alpha = 1.0 * i / (num_steps - 1)
        else:
            alpha = 1.0
        step_obs_values = (
            obs_values_with_last_replaced + diff_over_base * alpha)
        logits, _ = construct_logits_fn(
            diff_delta_time,
            step_obs_values,
            indicator,
            sequence_length,
            seq_mask,
            hparams,
            reuse=True)
        if hparams.use_rnn_attention:
            last_logits = logits
        else:
            last_logits = rnn_common.select_last_activations(
                logits, tf.to_int32(sequence_length))
        # Ideally, we'd like to get the gradients of the change in
        # value over the previous one to attribute it to both and not just a
        # single value.
        gradient = compute_gradient_attribution(last_logits, step_obs_values,
                                                indicator)
        gradients.append(
            tf.reduce_sum(diff_over_base, axis=2, keepdims=True) * gradient)

    return tf.add_n(gradients) / tf.to_float(num_steps)
def _model_fn(features, labels, mode):
    """Build the character-level LSTM regression graph as an EstimatorSpec.

    The string characters under `COLOR_NAME_KEY` are mapped to indices with a
    lookup table over `CHARACTERS`, one-hot encoded, and run through stacked
    LSTM cells; the output at each sequence's last step feeds ReLU dense
    layers and a final linear layer.

    Args:
      features: Mapping holding the `COLOR_NAME_KEY` characters and the
        `SEQUENCE_LENGTH_KEY` lengths.
      labels: Regression targets; only read when `mode` is not PREDICT.
      mode: One of `tf.estimator.ModeKeys`.

    Returns:
      An `EstimatorSpec` with predictions, and with loss / train_op populated
      according to `mode`.
    """
    color_name = features[COLOR_NAME_KEY]
    # The lengths are cast to int32 before being used by the RNN.
    sequence_length = tf.cast(features[SEQUENCE_LENGTH_KEY], dtype=tf.int32)

    # ----------- Preparing input --------------------
    # Constant lookup table implementing the char -> index mapping.
    char_table = tf.contrib.lookup.index_table_from_tensor(
        tf.constant(CHARACTERS, name='mapping'), dtype=tf.string)
    int_color_name = char_table.lookup(color_name)
    # One-hot representation of every character.
    color_name_onehot = tf.one_hot(int_color_name, depth=len(CHARACTERS) + 1)

    # ---------- RNN -------------------
    # One LSTM cell per requested layer size, stacked into a single cell.
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell(
        [tf.nn.rnn_cell.LSTMCell(size) for size in rnn_cell_sizes])

    # See https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn
    outputs, _ = tf.nn.dynamic_rnn(
        cell=stacked_cell,
        inputs=color_name_onehot,
        sequence_length=sequence_length,
        dtype=tf.float32)

    # Keep only the output at the last valid step of every sequence.
    hidden = rnn_common.select_last_activations(outputs, sequence_length)

    # ------------ Dense layers -------------------
    for units in dnn_layer_sizes:
        hidden = tf.layers.dense(hidden, units, activation=tf.nn.relu)

    # Linear output layer.
    predictions = tf.layers.dense(hidden, label_dimension)

    # ----------- Loss and Optimizer ----------------
    loss = None
    train_op = None
    if mode != tf.estimator.ModeKeys.PREDICT:
        loss = tf.losses.mean_squared_error(labels, predictions)
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = tf.contrib.layers.optimize_loss(
                loss,
                tf.contrib.framework.get_global_step(),
                optimizer=optimizer,
                learning_rate=learning_rate)

    return model_fn_lib.EstimatorSpec(
        mode, predictions=predictions, loss=loss, train_op=train_op)
def model_fn(features, labels, mode):
    """Build the embedding + stacked-LSTM binary classifier graph.

    Token ids under `features['x']` are embedded, run through stacked LSTM
    cells (with dropout when one keep probability per layer is configured),
    and the output at each sequence's last step feeds ReLU dense layers and
    a final linear layer whose softmax is the prediction.

    Args:
      features: Mapping holding the inputs under `'x'` and the sequence
        lengths under `rnn_common.RNNKeys.SEQUENCE_LENGTH_KEY`.
      labels: Class labels; one-hot encoded with depth 2. Only read when
        `mode` is not PREDICT.
      mode: One of `tf.estimator.ModeKeys`.

    Returns:
      A `tf.estimator.EstimatorSpec` with softmax predictions, and with
      loss, train_op and an accuracy metric populated according to `mode`.
    """
    x = features['x']
    sequence_length = tf.cast(
        features[rnn_common.RNNKeys.SEQUENCE_LENGTH_KEY], tf.int32)

    # Creating embedding for the reviews.
    embedding = tf.contrib.layers.embed_sequence(
        x, vocab_size=num_words, embed_dim=embed_dim)

    # Each RNN layer will consist of a LSTM cell.
    if len(dropout_keep_probabilities) == len(rnn_cell_sizes):
        # Dropout is only active while training; outside of training every
        # keep probability is 1. The wrapper is kept in both cases, as in the
        # original code. The non-training branch used to unpack
        # `size, keep_prob` from `rnn_cell_sizes` alone, which raises a
        # TypeError on a list of ints.
        if mode == tf.estimator.ModeKeys.TRAIN:
            keep_probs = dropout_keep_probabilities
        else:
            keep_probs = [1] * len(rnn_cell_sizes)
        rnn_layers = [
            rnn.DropoutWrapper(
                rnn.LSTMCell(size),
                input_keep_prob=keep_prob,
                output_keep_prob=keep_prob,
                state_keep_prob=keep_prob)
            for size, keep_prob in zip(rnn_cell_sizes, keep_probs)
        ]
    else:
        rnn_layers = [rnn.LSTMCell(size) for size in rnn_cell_sizes]

    # Construct the layers.
    multi_rnn_cell = rnn.MultiRNNCell(rnn_layers)

    # Runs the RNN model dynamically; more about it at:
    # https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn
    outputs, _ = tf.nn.dynamic_rnn(
        cell=multi_rnn_cell,
        inputs=embedding,
        sequence_length=sequence_length,
        dtype=tf.float32)

    # Slice to keep only the last cell of the RNN.
    last_activations = rnn_common.select_last_activations(
        outputs, sequence_length)

    # Construct dense layers on top of the last cell of the RNN.
    for units in dnn_layer_sizes:
        last_activations = tf.layers.dense(
            last_activations, units, activation=tf.nn.relu)

    # Final dense layer for prediction.
    predictions = tf.layers.dense(last_activations, label_dimension)
    predictions_softmax = tf.nn.softmax(predictions)

    loss = None
    train_op = None
    eval_op = None

    if mode != tf.estimator.ModeKeys.PREDICT:
        labels_onehot = tf.one_hot(labels, 2)
        eval_op = {
            'accuracy': tf.metrics.accuracy(
                tf.argmax(input=predictions_softmax, axis=1),
                tf.argmax(input=labels_onehot, axis=1))
        }
        loss = tf.losses.softmax_cross_entropy(labels_onehot, predictions)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.contrib.framework.get_global_step(),
            optimizer=optimizer,
            learning_rate=learning_rate)

    return tf.estimator.EstimatorSpec(
        mode,
        predictions=predictions_softmax,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_op)
def makeqnetwork(self, input_size, rnnshape, ffnshape, num_actions,
                 training_input=None, training_sequence_lengths=None,
                 inference_input=None, inference_hidden_state=None,
                 scope_name="RNN"):
    """
    Construct graph.

    Two heads share one stacked GRU cell and one set of dense layers: a
    sequence head (dynamic RNN over whole sequences, last valid output fed to
    the dense layers) and a single-step inference head (one cell call from a
    given hidden state). Placeholders are created for whichever pair of
    inputs is not supplied in full.

    :return: (sequence input, sequence lengths, sequence output layer),
        (inference input, inference state, inference output layer,
        new inference hidden state), list of variables
    """
    # Build brain model
    with tf.name_scope(scope_name + '/') as ns:
        multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.GRUCell(units) for units in rnnshape])

        # Sequence inputs: use the caller's tensors only when both are given.
        if training_input is None or training_sequence_lengths is None:
            state_in = tf.placeholder(shape=[None, None, input_size],
                                      dtype=tf.float32)
            sequence_lengths = tf.placeholder(shape=[None], dtype=tf.int32)
        else:
            state_in = training_input
            sequence_lengths = training_sequence_lengths

        outputs, hidden = tf.nn.dynamic_rnn(
            cell=multi_rnn_cell,
            inputs=state_in,
            sequence_length=sequence_lengths,
            dtype=tf.float32)
        layer = rnn_common.select_last_activations(outputs, sequence_lengths)

        # Single-step inputs: same rule, both must be supplied to be used.
        if inference_input is None or inference_hidden_state is None:
            inference_in = tf.placeholder(shape=[None, input_size],
                                          dtype=tf.float32)
            inference_state = multi_rnn_cell.zero_state(
                tf.shape(inference_in)[0], dtype=tf.float32)
        else:
            inference_in = inference_input
            inference_state = inference_hidden_state

        inference_layer, inference_hidden = multi_rnn_cell(
            inference_in, inference_state)

        # Dense stack for the sequence head creates the variables ...
        for i, units in enumerate(ffnshape):
            layer = tf.layers.dense(layer, units, activation=tf.nn.relu,
                                    reuse=None, name="ffn_{}".format(i))
        # ... and the inference head reuses them by name.
        for i, units in enumerate(ffnshape):
            inference_layer = tf.layers.dense(
                inference_layer, units, activation=tf.nn.relu, reuse=True,
                name="ffn_{}".format(i))

        # Make output layer without relu
        layer = tf.layers.dense(layer, num_actions, activation=None,
                                reuse=None, name="ffn_last")
        inference_layer = tf.layers.dense(
            inference_layer, num_actions, activation=None, reuse=True,
            name="ffn_last")

        # NOTE(review): tf.name_scope does not prefix the names of variables
        # created through tf.get_variable, so this scope filter may not match
        # the cell / dense variables - confirm the returned list is not empty.
        variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=ns)

    sequence_head = (state_in, sequence_lengths, layer)
    inference_head = (inference_in, inference_state, inference_layer,
                      inference_hidden)
    return sequence_head, inference_head, variables