def test_crf_viterbi(self, num_tags, num_words):
    model = CNNModelHelper(name='external')
    predictions = np.random.randn(num_words, num_tags).astype(np.float32)
    transitions = np.random.uniform(
        low=-1, high=1,
        size=(num_tags + 2, num_tags + 2)).astype(np.float32)
    predictions_blob, transitions_blob = model.net.AddExternalInputs(
        'predictions', 'crf_transitions')
    workspace.FeedBlob(str(transitions_blob), transitions)
    workspace.FeedBlob(str(predictions_blob), predictions)
    crf_layer = crf.CRFWithLoss(model, num_tags, transitions_blob)
    updated_predictions = crf_update_predictions(
        model, crf_layer, predictions_blob)
    ref_predictions = crf_layer.update_predictions(predictions_blob)
    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(model.net)
    updated_predictions = workspace.FetchBlob(str(updated_predictions))
    ref_predictions = workspace.FetchBlob(str(ref_predictions))
    np.testing.assert_allclose(
        updated_predictions,
        ref_predictions,
        atol=1e-4, rtol=1e-4,
        err_msg='Mismatch in CRF predictions')
def test_crf_gradient(self, num_tags, num_words):
    base_model = CNNModelHelper(name='base_model')
    transitions = np.random.randn(
        num_tags + 2, num_tags + 2).astype(np.float32)
    predictions = np.random.randn(
        num_words, 1, num_tags + 2).astype(np.float32)
    initial = np.random.randn(1, num_tags + 2).astype(np.float32)
    predictions_blob, transitions_blob, initial_blob = (
        base_model.net.AddExternalInputs(
            'predictions_blob', 'crf_transitions', 'initial_blob'))
    workspace.FeedBlob(str(predictions_blob), predictions)
    workspace.FeedBlob(str(transitions_blob), transitions)
    workspace.FeedBlob(str(initial_blob), initial)
    crf_layer = crf.CRFWithLoss(base_model, num_tags, transitions_blob)
    crf_layer.build_crf_net(
        predictions_blob, initial_blob, transitions_blob)
    op = base_model.net._net.op[-1]
    workspace.RunNetOnce(base_model.param_init_net)
    gradients_to_check = (
        index
        for (index, input_name) in enumerate(op.input)
        if input_name != "crf_net/zero_segment_id"
    )
    inputs = [workspace.FetchBlob(name) for name in op.input]
    for param in gradients_to_check:
        self.assertGradientChecks(
            device_option=hu.cpu_do,
            op=op,
            inputs=inputs,
            outputs_to_check=param,
            outputs_with_grads=[1],
            threshold=0.05,
            stepsize=0.001,
        )
def lstm(self, create_lstm, t, n, d, ref, outputs_with_grads):
    model = CNNModelHelper(name='external')
    input_blob, seq_lengths, hidden_init, cell_init = (
        model.net.AddExternalInputs(
            'input_blob', 'seq_lengths', 'hidden_init', 'cell_init'))
    create_lstm(
        model, input_blob, seq_lengths, (hidden_init, cell_init),
        d, d, scope="external/recurrent",
        outputs_with_grads=outputs_with_grads)
    op = model.net._net.op[-1]
    workspace.RunNetOnce(model.param_init_net)
    input_blob = op.input[0]

    def generate_random_state(n, d):
        ndim = int(np.random.choice(3, 1)) + 1
        if ndim == 1:
            return np.random.randn(1, n, d).astype(np.float32)
        random_state = np.random.randn(n, d).astype(np.float32)
        if ndim == 3:
            random_state = random_state.reshape([1, n, d])
        return random_state

    workspace.FeedBlob(
        str(input_blob), np.random.randn(t, n, d * 4).astype(np.float32))
    workspace.FeedBlob("hidden_init", generate_random_state(n, d))
    workspace.FeedBlob("cell_init", generate_random_state(n, d))
    workspace.FeedBlob(
        "seq_lengths",
        np.random.randint(1, t + 1, size=(n,)).astype(np.int32))
    inputs = [workspace.FetchBlob(name) for name in op.input]
    print(op.input)
    print(inputs)
    self.assertReferenceChecks(
        hu.cpu_do,
        op,
        inputs,
        ref,
        outputs_to_check=range(4),
    )
    # Checking for input, gates_t_w and gates_t_b gradients
    for param in range(5):
        self.assertGradientChecks(
            device_option=hu.cpu_do,
            op=op,
            inputs=inputs,
            outputs_to_check=param,
            outputs_with_grads=outputs_with_grads,
            threshold=0.01,
            stepsize=0.005,
        )
def test_cnn_model_helper_deprecated(self):
    X = np.random.rand(64, 32, 32, 3).astype(np.float32) - 0.5
    workspace.FeedBlob("x", X)
    # CNNModelHelper is going to be deprecated soon. This test is only
    # covering some CNNModelHelper logic
    model = CNNModelHelper(name="test_model", order='NHWC')
    self.assertEqual(model.arg_scope['order'], 'NHWC')
def test_lstm_new(self, t, n, d):
    model = CNNModelHelper(name='external')

    def create_lstm(
            model, input_blob, seq_lengths, init, dim_in, dim_out, scope):
        recurrent.LSTM(
            model, input_blob, seq_lengths, init, dim_in, dim_out,
            scope="external/recurrent")

    self.lstm(model, create_lstm, t, n, d, lstm_reference,
              gradients_to_check=[0, 1, 2, 3, 4],
              outputs_to_check=[0, 1, 2, 3])
def test_crf_with_loss_op(self, num_tags, num_words):
    model = CNNModelHelper(name='external')
    embeddings_dim = 200
    embeddings = np.random.randn(
        num_words, embeddings_dim).astype(np.float32)
    transitions = np.random.uniform(
        low=-1, high=1,
        size=(num_tags + 2, num_tags + 2)).astype(np.float32)
    labels = np.random.randint(num_tags, size=(num_words)).astype(np.int64)
    embeddings_blob, labels_blob, transitions_blob = (
        model.net.AddExternalInputs(
            'embeddings_blob', 'labels_blob', 'crf_transitions'))
    workspace.FeedBlob(str(embeddings_blob), embeddings)
    workspace.FeedBlob(str(labels_blob), labels)
    workspace.FeedBlob(str(transitions_blob), transitions)
    predictions_blob = model.FC(
        embeddings_blob, "fc_0", embeddings_dim, num_tags,
        ('UniformFill', {'min': -1.0, 'max': 1.0}),
        ('UniformFill', {'min': -1.0, 'max': 1.0}))
    crf_layer = crf.CRFWithLoss(model, num_tags, transitions_blob)
    crf_loss = crf_layer.crf_loss(predictions_blob, labels_blob)
    model.net.AddGradientOperators([crf_loss])
    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(model.net)
    loss = workspace.FetchBlob(str(crf_loss))
    predictions = workspace.FetchBlob(str(predictions_blob))
    np.testing.assert_allclose(
        loss,
        self._compute_loss_manual(
            predictions, num_tags, labels, transitions),
        atol=0.001, rtol=0.001,
        err_msg='CRF loss does not match the reference')
def test_lstm_old(self, t, n, d):
    model = CNNModelHelper(name='external')

    def create_lstm(
            model, input_blob, seq_lengths, init, dim_in, dim_out, scope):
        model.LSTM(
            input_blob, seq_lengths, init, dim_in, dim_out,
            scope="external/recurrent")

    # CNNModelHelper.LSTM returns only 3 outputs. But the operator itself
    # returns 5. We ignore the rest.
    self.lstm(model, create_lstm, t, n, d, old_lstm_reference,
              gradients_to_check=[0, 2, 3, 4, 5],
              outputs_to_check=[0, 3, 4])
def test_milstm(self, t, n, d):
    for outputs_with_grads in [[0], [1], [0, 1, 2, 3]]:
        model = CNNModelHelper(name='external')

        def create_milstm(
                model, input_blob, seq_lengths, init, dim_in, dim_out,
                scope):
            recurrent.MILSTM(
                model, input_blob, seq_lengths, init, dim_in, dim_out,
                scope="external/recurrent",
                outputs_with_grads=outputs_with_grads)

        self.lstm(model, create_milstm, t, n, d, milstm_reference,
                  gradients_to_check=[0, 1, 2, 3, 4],
                  outputs_to_check=[0, 1, 2, 3],
                  outputs_with_grads=outputs_with_grads)
def apply_over_sequence(
    self,
    model,
    inputs,
    seq_lengths,
    initial_states,
    outputs_with_grads=None,
):
    preprocessed_inputs = self.prepare_input(model, inputs)
    step_model = CNNModelHelper(name=self.name, param_model=model)
    input_t, timestep = step_model.net.AddScopedExternalInputs(
        'input_t',
        'timestep',
    )
    states_prev = step_model.net.AddScopedExternalInputs(*[
        s + '_prev' for s in self.get_state_names()
    ])
    states = self._apply(
        model=step_model,
        input_t=input_t,
        seq_lengths=seq_lengths,
        states=states_prev,
        timestep=timestep,
    )

    if outputs_with_grads is None:
        outputs_with_grads = self.get_outputs_with_grads()

    # states_for_all_steps consists of the states gathered for all steps
    # interleaved with the final states. It looks like this:
    # (state_1_all, state_1_final, state_2_all, state_2_final, ...)
    states_for_all_steps = recurrent.recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[(input_t, preprocessed_inputs)],
        initial_cell_inputs=zip(states_prev, initial_states),
        links=dict(zip(states_prev, states)),
        timestep=timestep,
        scope=self.name,
        outputs_with_grads=outputs_with_grads,
        recompute_blobs_on_backward=self.recompute_blobs,
        forward_only=self.forward_only,
    )

    output = self._prepare_output_sequence(
        model,
        states_for_all_steps,
        outputs_with_grads,
    )
    return output, states_for_all_steps
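# Illustrative note on the interleaved tuple returned above, assuming an
# LSTM-style cell whose get_state_names() returns ('hidden', 'cell'); the
# variable names below are assumptions, not part of the source:
#
#   hidden_all, hidden_last, cell_all, cell_last = states_for_all_steps
#
# hidden_all gathers the hidden state for every step (T x N x D), while
# hidden_last is the final state (1 x N x D); the same pattern repeats for
# each additional recurrent state.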
def _prepare_lstm(t, n, d, create_lstm, outputs_with_grads,
                  memory_optim, forget_bias, forward_only, drop_states):
    print("Dims: ", t, n, d)
    model = CNNModelHelper(name='external')
    input_blob, seq_lengths, hidden_init, cell_init = (
        model.net.AddExternalInputs(
            'input_blob', 'seq_lengths', 'hidden_init', 'cell_init'))
    create_lstm(
        model, input_blob, seq_lengths, (hidden_init, cell_init), d, d,
        scope="external/recurrent",
        outputs_with_grads=outputs_with_grads,
        memory_optimization=memory_optim,
        forget_bias=forget_bias,
        forward_only=forward_only,
        drop_states=drop_states,
    )
    workspace.RunNetOnce(model.param_init_net)

    def generate_random_state(n, d):
        ndim = int(np.random.choice(3, 1)) + 1
        if ndim == 1:
            return np.random.randn(1, n, d).astype(np.float32)
        random_state = np.random.randn(n, d).astype(np.float32)
        if ndim == 3:
            random_state = random_state.reshape([1, n, d])
        return random_state

    workspace.FeedBlob("hidden_init", generate_random_state(n, d))
    workspace.FeedBlob("cell_init", generate_random_state(n, d))
    workspace.FeedBlob(
        "seq_lengths",
        np.random.randint(1, t + 1, size=(n,)).astype(np.int32))
    return model.net
def apply_over_sequence(
    self,
    model,
    inputs,
    seq_lengths,
    initial_states,
    outputs_with_grads=None,
):
    preprocessed_inputs = self.prepare_input(model, inputs)
    step_model = CNNModelHelper(name=self.name, param_model=model)
    input_t, timestep = step_model.net.AddScopedExternalInputs(
        'input_t',
        'timestep',
    )
    states_prev = step_model.net.AddScopedExternalInputs(
        *[s + '_prev' for s in self.get_state_names()])
    states = self._apply(
        model=step_model,
        input_t=input_t,
        seq_lengths=seq_lengths,
        states=states_prev,
        timestep=timestep,
    )
    return recurrent.recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[(input_t, preprocessed_inputs)],
        initial_cell_inputs=zip(states_prev, initial_states),
        links=dict(zip(states_prev, states)),
        timestep=timestep,
        scope=self.name,
        outputs_with_grads=(
            outputs_with_grads
            if outputs_with_grads is not None
            else self.get_outputs_with_grads()
        ),
        recompute_blobs_on_backward=self.recompute_blobs,
        forward_only=self.forward_only,
    )
def MILSTM(model, input_blob, seq_lengths, initial_states, dim_in, dim_out,
           scope, outputs_with_grads=(0,), memory_optimization=False,
           forget_bias=0.0):
    '''
    Adds the MI flavor of the standard LSTM recurrent network operator to a
    model. See https://arxiv.org/pdf/1606.06630.pdf

    model: CNNModelHelper object new operators would be added to

    input_blob: the input sequence in a format T x N x D
    where T is sequence size, N - batch size and D - input dimension

    seq_lengths: blob containing sequence lengths which would be passed to
    LSTMUnit operator

    initial_states: a tuple of (hidden_input_blob, cell_input_blob)
    which are going to be inputs to the cell net on the first iteration

    dim_in: input dimension

    dim_out: output dimension

    outputs_with_grads: position indices of output blobs which will receive
    an external error gradient during backpropagation

    memory_optimization: if enabled, the LSTM step is recomputed on the
    backward step so that we don't need to store forward activations for
    each timestep. Saves memory at the cost of computation.
    '''

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    """ initial bulk fully-connected """
    input_blob = model.FC(
        input_blob, s('i2h'), dim_in=dim_in, dim_out=4 * dim_out, axis=2)

    """ the step net """
    step_model = CNNModelHelper(name='milstm_cell', param_model=model)
    input_t, timestep, cell_t_prev, hidden_t_prev = (
        step_model.net.AddScopedExternalInputs(
            'input_t', 'timestep', 'cell_t_prev', 'hidden_t_prev'))
    # hU^T
    # Shape: [1, batch_size, 4 * hidden_size]
    prev_t = step_model.FC(
        hidden_t_prev, s('prev_t'), dim_in=dim_out,
        dim_out=4 * dim_out, axis=2)
    # defining MI parameters
    alpha = step_model.param_init_net.ConstantFill(
        [], [s('alpha')], shape=[4 * dim_out], value=1.0)
    beta1 = step_model.param_init_net.ConstantFill(
        [], [s('beta1')], shape=[4 * dim_out], value=1.0)
    beta2 = step_model.param_init_net.ConstantFill(
        [], [s('beta2')], shape=[4 * dim_out], value=1.0)
    b = step_model.param_init_net.ConstantFill(
        [], [s('b')], shape=[4 * dim_out], value=0.0)
    model.params.extend([alpha, beta1, beta2, b])
    # alpha * (xW^T * hU^T)
    # Shape: [1, batch_size, 4 * hidden_size]
    alpha_tdash = step_model.net.Mul([prev_t, input_t], s('alpha_tdash'))
    # Shape: [batch_size, 4 * hidden_size]
    alpha_tdash_rs, _ = step_model.net.Reshape(
        alpha_tdash,
        [s('alpha_tdash_rs'), s('alpha_tdash_old_shape')],
        shape=[-1, 4 * dim_out],
    )
    alpha_t = step_model.net.Mul(
        [alpha_tdash_rs, alpha], s('alpha_t'),
        broadcast=1, use_grad_hack=1)
    # beta1 * hU^T
    # Shape: [batch_size, 4 * hidden_size]
    prev_t_rs, _ = step_model.net.Reshape(
        prev_t,
        [s('prev_t_rs'), s('prev_t_old_shape')],
        shape=[-1, 4 * dim_out],
    )
    beta1_t = step_model.net.Mul(
        [prev_t_rs, beta1], s('beta1_t'),
        broadcast=1, use_grad_hack=1)
    # beta2 * xW^T
    # Shape: [batch_size, 4 * hidden_size]
    input_t_rs, _ = step_model.net.Reshape(
        input_t,
        [s('input_t_rs'), s('input_t_old_shape')],
        shape=[-1, 4 * dim_out],
    )
    beta2_t = step_model.net.Mul(
        [input_t_rs, beta2], s('beta2_t'),
        broadcast=1, use_grad_hack=1)
    # Add 'em all up
    gates_tdash = step_model.net.Sum(
        [alpha_t, beta1_t, beta2_t], s('gates_tdash'))
    gates_t = step_model.net.Add(
        [gates_tdash, b], s('gates_t'),
        broadcast=1, use_grad_hack=1)
    # Shape: [1, batch_size, 4 * hidden_size]
    gates_t_rs, _ = step_model.net.Reshape(
        gates_t,
        [s('gates_t_rs'), s('gates_t_old_shape')],
        shape=[1, -1, 4 * dim_out],
    )

    hidden_t, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t_rs, seq_lengths, timestep],
        [s('hidden_t'), s('cell_t')],
        forget_bias=forget_bias,
    )
    step_model.net.AddExternalOutputs(cell_t, hidden_t)

    """ recurrent network """
    (hidden_input_blob, cell_input_blob) = initial_states
    output, last_output, all_states, last_state = recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[(input_t, input_blob)],
        initial_cell_inputs=[
            (hidden_t_prev, hidden_input_blob),
            (cell_t_prev, cell_input_blob),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
        recompute_blobs_on_backward=(
            [gates_t] if memory_optimization else None))
    return output, last_output, all_states, last_state
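# A minimal usage sketch for the MILSTM helper above, following the same
# blob-feeding pattern as the tests in this file. The dimensions (t, n, d)
# and blob names here are illustrative assumptions, not part of the source.
import numpy as np
from caffe2.python import workspace
from caffe2.python.cnn import CNNModelHelper

t, n, d = 5, 2, 4
model = CNNModelHelper(name='external')
input_blob, seq_lengths, hidden_init, cell_init = (
    model.net.AddExternalInputs(
        'input_blob', 'seq_lengths', 'hidden_init', 'cell_init'))
output, last_output, all_states, last_state = MILSTM(
    model, input_blob, seq_lengths, (hidden_init, cell_init),
    dim_in=d, dim_out=d, scope="external/recurrent")
workspace.FeedBlob(
    'input_blob', np.random.randn(t, n, d).astype(np.float32))
workspace.FeedBlob(
    'hidden_init', np.random.randn(1, n, d).astype(np.float32))
workspace.FeedBlob(
    'cell_init', np.random.randn(1, n, d).astype(np.float32))
workspace.FeedBlob(
    'seq_lengths',
    np.random.randint(1, t + 1, size=(n,)).astype(np.int32))
workspace.RunNetOnce(model.param_init_net)
workspace.RunNetOnce(model.net)
# output holds the per-step hidden states; last_output is the final one.
hidden_all = workspace.FetchBlob(str(output))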
def LSTMWithAttention(
    model,
    decoder_inputs,
    decoder_input_lengths,
    initial_decoder_hidden_state,
    initial_decoder_cell_state,
    initial_attention_weighted_encoder_context,
    encoder_output_dim,
    encoder_outputs,
    decoder_input_dim,
    decoder_state_dim,
    scope,
    attention_type=AttentionType.Regular,
    outputs_with_grads=(0, 4),
    weighted_encoder_outputs=None,
    lstm_memory_optimization=False,
    attention_memory_optimization=False,
    forget_bias=0.0,
):
    '''
    Adds an LSTM with an attention mechanism to a model.

    The implementation is based on https://arxiv.org/abs/1409.0473, with
    a small difference in the order in which we compute the new attention
    context and the new hidden state, similarly to
    https://arxiv.org/abs/1508.04025.

    The model uses encoder-decoder naming conventions, where the decoder
    is the sequence the op is iterating over, while computing the attention
    context over the encoder.

    model: CNNModelHelper object new operators would be added to

    decoder_inputs: the input sequence in a format T x N x D
    where T is sequence size, N - batch size and D - input dimension

    decoder_input_lengths: blob containing sequence lengths
    which would be passed to LSTMUnit operator

    initial_decoder_hidden_state: initial hidden state of LSTM

    initial_decoder_cell_state: initial cell state of LSTM

    initial_attention_weighted_encoder_context: initial attention context

    encoder_output_dim: dimension of encoder outputs

    encoder_outputs: the sequence, on which we compute the attention context
    at every iteration

    decoder_input_dim: input dimension (last dimension on decoder_inputs)

    decoder_state_dim: size of hidden states of LSTM

    attention_type: One of: AttentionType.Regular, AttentionType.Recurrent.
    Determines which type of attention mechanism to use.

    outputs_with_grads: position indices of output blobs which will receive
    an external error gradient during backpropagation

    weighted_encoder_outputs: encoder outputs to be used to compute attention
    weights. In the basic case it's just a linear transformation of encoder
    outputs (that's the default, when weighted_encoder_outputs is None).
    However, it can be something more complicated - like a separate encoder
    network (for example, in case of a convolutional encoder)

    lstm_memory_optimization: recompute LSTM activations on backward pass, so
    we don't need to store their values in forward passes

    attention_memory_optimization: recompute attention for backward pass
    '''

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    decoder_inputs = model.FC(
        decoder_inputs,
        s('i2h'),
        dim_in=decoder_input_dim,
        dim_out=4 * decoder_state_dim,
        axis=2,
    )
    # [batch_size, encoder_output_dim, encoder_length]
    encoder_outputs_transposed = model.Transpose(
        encoder_outputs,
        s('encoder_outputs_transposed'),
        axes=[1, 2, 0],
    )
    if weighted_encoder_outputs is None:
        weighted_encoder_outputs = model.FC(
            encoder_outputs,
            s('weighted_encoder_outputs'),
            dim_in=encoder_output_dim,
            dim_out=encoder_output_dim,
            axis=2,
        )
    step_model = CNNModelHelper(
        name='lstm_with_attention_cell',
        param_model=model,
    )
    (
        input_t,
        timestep,
        cell_t_prev,
        hidden_t_prev,
        attention_weighted_encoder_context_t_prev,
    ) = (
        step_model.net.AddScopedExternalInputs(
            'input_t',
            'timestep',
            'cell_t_prev',
            'hidden_t_prev',
            'attention_weighted_encoder_context_t_prev',
        )
    )
    step_model.net.AddExternalInputs(
        encoder_outputs_transposed, weighted_encoder_outputs)

    gates_concatenated_input_t, _ = step_model.net.Concat(
        [hidden_t_prev, attention_weighted_encoder_context_t_prev],
        [
            s('gates_concatenated_input_t'),
            s('_gates_concatenated_input_t_concat_dims'),
        ],
        axis=2,
    )
    gates_t = step_model.FC(
        gates_concatenated_input_t,
        s('gates_t'),
        dim_in=decoder_state_dim + encoder_output_dim,
        dim_out=4 * decoder_state_dim,
        axis=2,
    )
    step_model.net.Sum([gates_t, input_t], gates_t)

    hidden_t_intermediate, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t, decoder_input_lengths,
         timestep],
        ['hidden_t_intermediate', s('cell_t')],
        forget_bias=forget_bias,
    )
    if attention_type == AttentionType.Recurrent:
        attention_weighted_encoder_context_t, _, attention_blobs = \
            apply_recurrent_attention(
                model=step_model,
                encoder_output_dim=encoder_output_dim,
                encoder_outputs_transposed=encoder_outputs_transposed,
                weighted_encoder_outputs=weighted_encoder_outputs,
                decoder_hidden_state_t=hidden_t_intermediate,
                decoder_hidden_state_dim=decoder_state_dim,
                scope=scope,
                attention_weighted_encoder_context_t_prev=(
                    attention_weighted_encoder_context_t_prev
                ),
            )
    else:
        attention_weighted_encoder_context_t, _, attention_blobs = \
            apply_regular_attention(
                model=step_model,
                encoder_output_dim=encoder_output_dim,
                encoder_outputs_transposed=encoder_outputs_transposed,
                weighted_encoder_outputs=weighted_encoder_outputs,
                decoder_hidden_state_t=hidden_t_intermediate,
                decoder_hidden_state_dim=decoder_state_dim,
                scope=scope,
            )
    hidden_t = step_model.Copy(hidden_t_intermediate, s('hidden_t'))
    step_model.net.AddExternalOutputs(
        cell_t,
        hidden_t,
        attention_weighted_encoder_context_t,
    )

    recompute_blobs = []
    if attention_memory_optimization:
        recompute_blobs.extend(attention_blobs)
    if lstm_memory_optimization:
        recompute_blobs.extend([gates_t])

    return recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[
            (input_t, decoder_inputs),
        ],
        initial_cell_inputs=[
            (hidden_t_prev, initial_decoder_hidden_state),
            (cell_t_prev, initial_decoder_cell_state),
            (
                attention_weighted_encoder_context_t_prev,
                initial_attention_weighted_encoder_context,
            ),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
            attention_weighted_encoder_context_t_prev: (
                attention_weighted_encoder_context_t
            ),
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
        recompute_blobs_on_backward=recompute_blobs,
    )
def LSTM(model, input_blob, seq_lengths, initial_states, dim_in, dim_out,
         scope, outputs_with_grads=(0,), return_params=False,
         memory_optimization=False, forget_bias=0.0):
    '''
    Adds a standard LSTM recurrent network operator to a model.

    model: CNNModelHelper object new operators would be added to

    input_blob: the input sequence in a format T x N x D
    where T is sequence size, N - batch size and D - input dimension

    seq_lengths: blob containing sequence lengths which would be passed to
    LSTMUnit operator

    initial_states: a tuple of (hidden_input_blob, cell_input_blob)
    which are going to be inputs to the cell net on the first iteration

    dim_in: input dimension

    dim_out: output dimension

    outputs_with_grads: position indices of output blobs which will receive
    an external error gradient during backpropagation

    return_params: if True, will return a dictionary of parameters of the
    LSTM

    memory_optimization: if enabled, the LSTM step is recomputed on the
    backward step so that we don't need to store forward activations for
    each timestep. Saves memory at the cost of computation.
    '''

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    """ initial bulk fully-connected """
    input_blob = model.FC(
        input_blob, s('i2h'), dim_in=dim_in, dim_out=4 * dim_out, axis=2)

    """ the step net """
    step_model = CNNModelHelper(name='lstm_cell', param_model=model)
    input_t, timestep, cell_t_prev, hidden_t_prev = (
        step_model.net.AddScopedExternalInputs(
            'input_t', 'timestep', 'cell_t_prev', 'hidden_t_prev'))
    gates_t = step_model.FC(
        hidden_t_prev, s('gates_t'), dim_in=dim_out,
        dim_out=4 * dim_out, axis=2)
    step_model.net.Sum([gates_t, input_t], gates_t)
    hidden_t, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t, seq_lengths, timestep],
        [s('hidden_t'), s('cell_t')],
        forget_bias=forget_bias,
    )
    step_model.net.AddExternalOutputs(cell_t, hidden_t)

    """ recurrent network """
    (hidden_input_blob, cell_input_blob) = initial_states
    output, last_output, all_states, last_state = recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[(input_t, input_blob)],
        initial_cell_inputs=[
            (hidden_t_prev, hidden_input_blob),
            (cell_t_prev, cell_input_blob),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
        recompute_blobs_on_backward=(
            [gates_t] if memory_optimization else None))
    if return_params:
        params = {
            'input': {
                'weights': input_blob + "_w",
                'biases': input_blob + '_b'
            },
            'recurrent': {
                'weights': gates_t + "_w",
                'biases': gates_t + '_b'
            }
        }
        return output, last_output, all_states, last_state, params
    else:
        return output, last_output, all_states, last_state
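# A minimal sketch of LSTM with return_params=True; the dimension and blob
# names are illustrative assumptions, not part of the source.
from caffe2.python.cnn import CNNModelHelper

d = 4
model = CNNModelHelper(name='external')
input_blob, seq_lengths, hidden_init, cell_init = (
    model.net.AddExternalInputs(
        'input_blob', 'seq_lengths', 'hidden_init', 'cell_init'))
output, last_output, all_states, last_state, params = LSTM(
    model, input_blob, seq_lengths, (hidden_init, cell_init),
    dim_in=d, dim_out=d, scope="external/recurrent",
    return_params=True)
# params['input'] names the i2h (input-to-gates) weight/bias blobs;
# params['recurrent'] names the hidden-to-gates parameters of the step net.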
def LSTM(model, input_blob, seq_lengths, initial_states, dim_in, dim_out,
         scope, outputs_with_grads=(0,)):
    '''
    Adds a standard LSTM recurrent network operator to a model.

    model: CNNModelHelper object new operators would be added to

    input_blob: the input sequence in a format T x N x D
    where T is sequence size, N - batch size and D - input dimension

    seq_lengths: blob containing sequence lengths which would be passed to
    LSTMUnit operator

    initial_states: a tuple of (hidden_input_blob, cell_input_blob)
    which are going to be inputs to the cell net on the first iteration

    dim_in: input dimension

    dim_out: output dimension

    outputs_with_grads: position indices of output blobs which will receive
    an external error gradient during backpropagation
    '''

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    """ initial bulk fully-connected """
    input_blob = model.FC(
        input_blob, s('i2h'), dim_in=dim_in, dim_out=4 * dim_out, axis=2)

    """ the step net """
    step_model = CNNModelHelper(name='lstm_cell', param_model=model)
    input_t, timestep, cell_t_prev, hidden_t_prev = (
        step_model.net.AddScopedExternalInputs(
            'input_t', 'timestep', 'cell_t_prev', 'hidden_t_prev'))
    gates_t = step_model.FC(
        hidden_t_prev, s('gates_t'), dim_in=dim_out,
        dim_out=4 * dim_out, axis=2)
    step_model.net.Sum([gates_t, input_t], gates_t)
    hidden_t, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t, seq_lengths, timestep],
        [s('hidden_t'), s('cell_t')],
    )
    step_model.net.AddExternalOutputs(cell_t, hidden_t)

    """ recurrent network """
    (hidden_input_blob, cell_input_blob) = initial_states
    output, last_output, all_states, last_state = recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[(input_t, input_blob)],
        initial_cell_inputs=[
            (hidden_t_prev, hidden_input_blob),
            (cell_t_prev, cell_input_blob),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
    )
    return output, last_output, all_states, last_state
def build_crf_net(self, input_blob, initial_state, transitions):
    '''
    Adds the crf_net recurrent operator to the model.

    model: CNNModelHelper object new operators would be added to

    input_blob: the input sequence in a format T x N x D
    where T is sequence size, N - batch size and D - input dimension
    ##Only supports batch-size 1##

    seq_lengths: blob containing sequence lengths (unused)
    '''

    scope = 'crf_net'

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    step_model = CNNModelHelper(name='crf_step', param_model=self.model)
    input_t, cell_t_prev, _ = (
        step_model.net.AddExternalInputs(
            'input_t', 'cell_t_prev', transitions))
    zero_segment_id = step_model.param_init_net.ConstantFill(
        [],
        [s('zero_segment_id')],
        value=0,
        shape=[self.num_classes_padded],
        dtype=core.DataType.INT32,
    )

    # A hack to bypass model cloning for test
    step_model.param_init_net.AddExternalOutput(zero_segment_id)
    """ the CRF step """
    # Do tile
    prev_transpose = step_model.Transpose(
        cell_t_prev,
        [s('prev_transpose')],
        axes=(0, 2, 1),
    )
    prev_tiled = step_model.net.Tile(
        prev_transpose,
        [s('prev_tiled')],
        tiles=self.num_classes_padded,
        axis=2,
    )
    input_t_tiled = step_model.net.Tile(
        input_t,
        [s('input_t_tiled')],
        tiles=self.num_classes_padded,
        axis=1,
    )
    input_with_prev = step_model.net.Add(
        [prev_tiled, input_t_tiled],
        [s('input_with_prev')])
    all_with_transitions = step_model.net.Add(
        [input_with_prev, transitions],
        [s('prev_with_transitions')],
        broadcast=1,
        use_grad_hack=1,
    )
    all_with_transitions_reshaped, _ = step_model.net.Reshape(
        all_with_transitions,
        [s('all_with_transitions_reshaped'),
         s('all_with_transitions_orig')],
        shape=(self.num_classes_padded, self.num_classes_padded))
    cell_t = step_model.net.SortedSegmentRangeLogSumExp(
        [all_with_transitions_reshaped, zero_segment_id],
        [s('cell_t')],
    )
    step_model.net.AddExternalOutputs(cell_t)
    """ recurrent network """
    cell_input_blob = initial_state
    out_all, out_last = recurrent.recurrent_net(
        net=self.model.net,
        cell_net=step_model.net,
        inputs=[(input_t, input_blob)],
        initial_cell_inputs=[
            (cell_t_prev, cell_input_blob),
        ],
        links={
            cell_t_prev: cell_t,
        },
        scope=scope,
        outputs_with_grads=(1,))
    return out_last
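# A NumPy sketch of the forward-recursion step that the CRF step net above
# implements: add the running scores to the emission and transition scores,
# then log-sum-exp out the previous tag. This is an illustration of the
# standard CRF forward algorithm, not code from the source; the exact blob
# orientation in the step net may differ.
import numpy as np

def crf_forward_step(prev, emissions_t, transitions):
    # prev: (num_tags,) running forward log-scores
    # emissions_t: (num_tags,) emission scores at the current position
    # transitions: (num_tags, num_tags) transition log-scores
    # scores[i, j] = prev[i] + emissions_t[j] + transitions[i, j]
    scores = prev[:, None] + emissions_t[None, :] + transitions
    # Stable log-sum-exp over the previous tag (axis 0), which is what
    # SortedSegmentRangeLogSumExp computes with all-zero segment ids.
    m = scores.max(axis=0)
    return m + np.log(np.exp(scores - m).sum(axis=0))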
    with open(args.params, 'r') as f:
        init_def.ParseFromString(f.read())
    init_def.device_option.CopyFrom(device_opts)
    workspace.RunNetOnce(init_def)
    net_def = caffe2_pb2.NetDef()
    with open(net_path, 'r') as f:
        net_def.ParseFromString(f.read())
    net_def.device_option.CopyFrom(device_opts)
    for op in net_def.op:
        op.engine = 'CUDNN'
    workspace.CreateNet(net_def)
elif args.network.startswith('resnet') or args.network == 'alexnet':
    if args.network.startswith('resnet'):
        model = CNNModelHelper(
            order='NCHW',
            name=args.network,
            use_cudnn=True,
            cudnn_exhaustive_search=True)
        num_layers = int(args.network[6:])
        softmax = create_resnet(
            model, 'data',
            num_layers=num_layers,
            num_input_channels=3,
            num_labels=1000,
            label=None,
            no_bias=True,
            no_loss=True)
    elif args.network == 'alexnet':
        model = CNNModelHelper(
            order='NCHW',
            name=args.network,
            # use_cudnn=True,
def test_multi_lstm(
    self,
    input_length,
    dim_in,
    max_num_units,
    num_layers,
    batch_size,
    gc,
    dc,
):
    model = CNNModelHelper(name='external')
    with core.DeviceScope(gc):
        (
            input_sequence,
            seq_lengths,
        ) = model.net.AddExternalInputs(
            'input_sequence',
            'seq_lengths',
        )
        dim_out = [
            np.random.randint(1, max_num_units + 1)
            for _ in range(num_layers)
        ]
        h_all, h_last, c_all, c_last = rnn_cell.LSTM(
            model=model,
            input_blob=input_sequence,
            seq_lengths=seq_lengths,
            initial_states=None,
            dim_in=dim_in,
            dim_out=dim_out,
            scope='test',
            outputs_with_grads=(0,),
            return_params=False,
            memory_optimization=False,
            forget_bias=0.0,
            forward_only=False,
            return_last_layer_only=True,
        )

    workspace.RunNetOnce(model.param_init_net)

    seq_lengths_val = np.random.randint(
        1,
        input_length + 1,
        size=(batch_size),
    ).astype(np.int32)
    input_sequence_val = np.random.randn(
        input_length,
        batch_size,
        dim_in,
    ).astype(np.float32)
    workspace.FeedBlob(seq_lengths, seq_lengths_val)
    workspace.FeedBlob(input_sequence, input_sequence_val)

    hidden_input_list = []
    cell_input_list = []
    i2h_w_list = []
    i2h_b_list = []
    gates_w_list = []
    gates_b_list = []
    for i in range(num_layers):
        hidden_input_list.append(
            workspace.FetchBlob(
                'test/initial_hidden_state_{}'.format(i)),
        )
        cell_input_list.append(
            workspace.FetchBlob('test/initial_cell_state_{}'.format(i)),
        )
        i2h_w_list.append(
            workspace.FetchBlob('test/layer_{}/i2h_w'.format(i)),
        )
        i2h_b_list.append(
            workspace.FetchBlob('test/layer_{}/i2h_b'.format(i)),
        )
        gates_w_list.append(
            workspace.FetchBlob('test/layer_{}/gates_t_w'.format(i)),
        )
        gates_b_list.append(
            workspace.FetchBlob('test/layer_{}/gates_t_b'.format(i)),
        )

    workspace.RunNetOnce(model.net)
    h_all_calc = workspace.FetchBlob(h_all)
    h_last_calc = workspace.FetchBlob(h_last)
    c_all_calc = workspace.FetchBlob(c_all)
    c_last_calc = workspace.FetchBlob(c_last)

    h_all_ref, h_last_ref, c_all_ref, c_last_ref = multi_lstm_reference(
        input_sequence_val,
        hidden_input_list,
        cell_input_list,
        i2h_w_list,
        i2h_b_list,
        gates_w_list,
        gates_b_list,
        seq_lengths_val,
        forget_bias=0.0,
    )

    h_all_delta = np.abs(h_all_ref - h_all_calc).sum()
    h_last_delta = np.abs(h_last_ref - h_last_calc).sum()
    c_all_delta = np.abs(c_all_ref - c_all_calc).sum()
    c_last_delta = np.abs(c_last_ref - c_last_calc).sum()
    self.assertAlmostEqual(h_all_delta, 0.0, places=5)
    self.assertAlmostEqual(h_last_delta, 0.0, places=5)
    self.assertAlmostEqual(c_all_delta, 0.0, places=5)
    self.assertAlmostEqual(c_last_delta, 0.0, places=5)

    input_values = {
        'input_sequence': input_sequence_val,
        'seq_lengths': seq_lengths_val,
    }
    for param in model.GetParams():
        value = workspace.FetchBlob(param)
        input_values[str(param)] = value

    output_sum = model.net.SumElements(
        [h_all],
        'output_sum',
        average=True,
    )
    fake_loss = model.net.Tanh(output_sum)
    for param in model.GetParams():
        gradient_checker.NetGradientChecker.Check(
            model.net,
            outputs_with_grad=[fake_loss],
            input_values=input_values,
            input_to_check=str(param),
            print_net=False,
            step_size=0.0001,
            threshold=0.05,
        )
def test_convolution_sync(self, net_type, num_workers, do, engine):
    from caffe2.python.cnn import CNNModelHelper
    m = CNNModelHelper()
    n = 1
    d = 2
    depth = 3
    iters = 5
    h = 5
    w = 5
    workspace.ResetWorkspace()
    np.random.seed(1701)
    # Build a binary tree of conv layers, summing at each node.
    for i in reversed(range(depth)):
        for j in range(2 ** i):
            bottom_1 = "{}_{}".format(i + 1, 2 * j)
            bottom_2 = "{}_{}".format(i + 1, 2 * j + 1)
            mid_1 = "{}_{}_m".format(i + 1, 2 * j)
            mid_2 = "{}_{}_m".format(i + 1, 2 * j + 1)
            top = "{}_{}".format(i, j)
            w1, b1, w2, b2 = np.random.randn(4).tolist()
            m.Conv(
                bottom_1, mid_1,
                dim_in=d, dim_out=d, kernel=3,
                weight_init=m.ConstantInit(w1),
                bias_init=m.ConstantInit(b1),
                cudnn_state=np.random.randint(0, 3),
                stride=1, pad=1,
                deterministic=1,
                engine=engine)
            m.Conv(
                bottom_2, mid_2,
                dim_in=d, dim_out=d, kernel=3,
                stride=1, pad=1,
                weight_init=m.ConstantInit(w2),
                bias_init=m.ConstantInit(b2),
                deterministic=1,
                cudnn_state=np.random.randint(0, 3),
                engine=engine)
            m.net.Sum([mid_1, mid_2], top)
    m.net.Flatten(["0_0"], ["0_0_flat"])
    m.net.SquaredL2Distance(["0_0_flat", "label"], "xent")
    m.net.AveragedLoss("xent", "loss")
    input_to_grad = m.AddGradientOperators(["loss"])
    m.Proto().device_option.CopyFrom(do)
    m.param_init_net.Proto().device_option.CopyFrom(do)
    m.Proto().type = net_type
    m.Proto().num_workers = num_workers
    self.ws.run(m.param_init_net)

    def run():
        import numpy as np
        np.random.seed(1701)
        input_blobs = ["{}_{}".format(depth, j) for j in range(2 ** depth)]
        for input_blob in input_blobs:
            self.ws.create_blob(input_blob).feed(
                np.random.randn(n, d, h, w).astype(np.float32),
                device_option=do)
            self.ws.create_blob("label").feed(
                np.random.randn(n, d * h * w).astype(np.float32),
                device_option=do)
        self.ws.run(m.net)
        gradients = [
            self.ws.blobs[str(input_to_grad[input_blob])].fetch()
            for input_blob in input_blobs
        ]
        return gradients

    outputs = [run() for _ in range(iters)]
    for output in outputs[1:]:
        np.testing.assert_array_equal(outputs[0], output)
        np.testing.assert_allclose(
            np.sum(np.square(output)),
            1763719461732352.0,
            rtol=1e-5)
def test_lstm_with_recurrent_attention(
    self,
    encoder_output_length,
    encoder_output_dim,
    decoder_input_length,
    decoder_state_dim,
    batch_size,
    gc,
    dc,
):
    with core.DeviceScope(gc):
        model = CNNModelHelper(name="external")
        (
            encoder_outputs,
            decoder_inputs,
            decoder_input_lengths,
            initial_decoder_hidden_state,
            initial_decoder_cell_state,
            initial_attention_weighted_encoder_context,
        ) = model.net.AddExternalInputs(
            "encoder_outputs",
            "decoder_inputs",
            "decoder_input_lengths",
            "initial_decoder_hidden_state",
            "initial_decoder_cell_state",
            "initial_attention_weighted_encoder_context",
        )
        recurrent.LSTMWithAttention(
            model=model,
            decoder_inputs=decoder_inputs,
            decoder_input_lengths=decoder_input_lengths,
            initial_decoder_hidden_state=initial_decoder_hidden_state,
            initial_decoder_cell_state=initial_decoder_cell_state,
            initial_attention_weighted_encoder_context=(
                initial_attention_weighted_encoder_context),
            encoder_output_dim=encoder_output_dim,
            encoder_outputs=encoder_outputs,
            decoder_input_dim=decoder_state_dim,
            decoder_state_dim=decoder_state_dim,
            scope='external/LSTMWithAttention',
            attention_type=AttentionType.Recurrent)
        op = model.net._net.op[-1]
        workspace.RunNetOnce(model.param_init_net)

        # This is the original decoder_inputs after the linear layer
        decoder_input_blob = op.input[0]

        workspace.FeedBlob(
            decoder_input_blob,
            np.random.randn(
                decoder_input_length,
                batch_size,
                decoder_state_dim * 4,
            ).astype(np.float32))
        workspace.FeedBlob(
            "external/LSTMWithAttention/encoder_outputs_transposed",
            np.random.randn(
                batch_size,
                encoder_output_dim,
                encoder_output_length,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            "external/LSTMWithAttention/weighted_encoder_outputs",
            np.random.randn(
                encoder_output_length,
                batch_size,
                encoder_output_dim,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            decoder_input_lengths,
            np.random.randint(
                0, decoder_input_length + 1,
                size=(batch_size,)).astype(np.int32))
        workspace.FeedBlob(
            initial_decoder_hidden_state,
            np.random.randn(
                1, batch_size, decoder_state_dim).astype(np.float32))
        workspace.FeedBlob(
            initial_decoder_cell_state,
            np.random.randn(
                1, batch_size, decoder_state_dim).astype(np.float32))
        workspace.FeedBlob(
            initial_attention_weighted_encoder_context,
            np.random.randn(
                1, batch_size, encoder_output_dim).astype(np.float32))
        inputs = [workspace.FetchBlob(name) for name in op.input]
        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=inputs,
            reference=lstm_with_recurrent_attention_reference,
            grad_reference=None,
            output_to_grad=None,
            outputs_to_check=range(6),
        )
        gradients_to_check = [
            index
            for (index, input_name) in enumerate(op.input)
            if input_name != "decoder_input_lengths"
        ]
        for param in gradients_to_check:
            self.assertGradientChecks(
                device_option=gc,
                op=op,
                inputs=inputs,
                outputs_to_check=param,
                outputs_with_grads=[0, 4],
                threshold=0.01,
                stepsize=0.001,
            )
def test_mul_rnn(self, T, n, d):
    model = CNNModelHelper(name='external')
    one_blob = model.param_init_net.ConstantFill(
        [], value=1.0, shape=[1, n, d])
    input_blob = model.net.AddExternalInput('input')

    step = ModelHelperBase(name='step', param_model=model)
    input_t, output_t_prev = step.net.AddExternalInput(
        'input_t', 'output_t_prev')
    output_t = step.net.Mul([input_t, output_t_prev])
    step.net.AddExternalOutput(output_t)

    recurrent.recurrent_net(
        net=model.net,
        cell_net=step.net,
        inputs=[(input_t, input_blob)],
        initial_cell_inputs=[(output_t_prev, one_blob)],
        links={output_t_prev: output_t},
        scope="test_mul_rnn",
    )
    workspace.FeedBlob(
        str(input_blob), np.random.randn(T, n, d).astype(np.float32))
    workspace.RunNetOnce(model.param_init_net)

    op = model.net._net.op[-1]

    def reference(input, initial_input):
        recurrent_input = initial_input
        result = np.zeros(shape=input.shape)
        for t_cur in range(T):
            recurrent_input = recurrent_input * input[t_cur]
            result[t_cur] = recurrent_input
        shape = list(input.shape)
        shape[0] = 1
        return (result, result[-1].reshape(shape))

    def grad_reference(output_grad, ref_output, inputs):
        input = inputs[0]
        output = ref_output[0]
        initial_input = inputs[1]
        input_grad = np.zeros(shape=input.shape)
        right_grad = 0
        for t_cur in range(T - 1, -1, -1):
            prev_output = (
                output[t_cur - 1] if t_cur > 0 else initial_input)
            input_grad[t_cur] = (
                (output_grad[t_cur] + right_grad) * prev_output)
            right_grad = input[t_cur] * (output_grad[t_cur] + right_grad)
        return (input_grad, right_grad.reshape([1, n, d]))

    self.assertReferenceChecks(
        device_option=hu.cpu_do,
        op=op,
        inputs=[
            workspace.FetchBlob(name)
            for name in [input_blob, one_blob]
        ],
        reference=reference,
        grad_reference=grad_reference,
        output_to_grad=op.output[0],
        outputs_to_check=[0, 1],
    )
def test_stateful_convolution_forward_only(
    self,
    sequence_length,
    conv_window,
    batch_size,
    state_size,
):
    '''
    This unit test demonstrates another way of using RecurrentNetwork.

    Imagine that you want to compute a convolution over a sequence, but
    the sequence elements are not given to you from the beginning, so you
    have to loop over the sequence and compute the convolution for each
    element separately. This situation can occur during the
    inference/generation step of a neural network.

    First of all, you have to provide the actual input via recurrent
    states, since the input of RecurrentNetwork should be known in
    advance. Here, we use `fake_inputs` as the input, and it's used by
    the op to extract the batch size and sequence length. The actual
    input sequence is stored in the recurrent state `input_state`. At
    every step we generate a new element via input_state_t (in this
    example, input_state_t is generated at random, but in a real
    situation it can be created using the convolution output from the
    previous step).

    A few important differences from the regular RecurrentNetwork use
    case:

    1. input_state_t_prev is not only a single previous element of the
    input_state sequence. It is the last conv_window elements including
    (!) the current one - input_state_t. We specify that using the
    `link_window` argument of RecurrentNetwork. We need that many
    elements to compute a single convolution step. Also, note that
    `link_window` specifies how many elements to link starting at the
    `timestep` + `link_offset` position.

    2. The first few steps might require additional zero padding from the
    left, since not enough elements of the input_state sequence are
    available. So the initial state for input_state contains several
    elements (exactly as many pads as we need for the first step). Also,
    because of that, all offsetting over the input_state sequence is
    shifted by the length of initial_input_state: see the `link_offset`
    and `alias_offset` arguments of RecurrentNetwork.

    In this test, we assert that we get the same result if we apply the
    convolution over all elements simultaneously, since the whole
    input_state sequence was generated at the end.
    '''
    model = CNNModelHelper(name='model')
    fake_inputs = model.param_init_net.UniformFill(
        [],
        'fake_inputs',
        min=-1.0,
        max=1.0,
        shape=[sequence_length, batch_size, state_size],
    )
    initial_input_state = model.param_init_net.ConstantFill(
        [],
        'initial_input_state',
        value=0.0,
        shape=[conv_window - 1, batch_size, state_size],
    )
    initial_output_state = model.param_init_net.ConstantFill(
        [],
        'initial_output_state',
        value=0.0,
        shape=[1, batch_size, state_size],
    )
    step_model = CNNModelHelper(name='step_model', param_model=model)
    (
        fake_input_t,
        timestep,
        input_state_t_prev,
    ) = step_model.net.AddExternalInputs(
        'fake_input_t',
        'timestep',
        'input_state_t_prev',
    )
    conv_filter = step_model.param_init_net.XavierFill(
        [],
        'conv_filter',
        shape=[state_size, 1, conv_window, state_size],
    )
    conv_bias = step_model.param_init_net.ConstantFill(
        [],
        'conv_bias',
        shape=[state_size],
        value=0.0,
    )
    step_model.params.extend([conv_filter, conv_bias])
    input_state_t = step_model.net.UniformFill(
        [],
        'input_state_t',
        min=-1.0,
        max=1.0,
        shape=[1, batch_size, state_size],
    )
    output_state_t = self._convolution_1d(
        model=step_model,
        inputs=input_state_t_prev,
        conv_window=conv_window,
        conv_filter=conv_filter,
        conv_bias=conv_bias,
        output_name='output_state_t',
        left_pad=False,
    )
    initial_recurrent_states = [initial_input_state, initial_output_state]
    all_inputs = (
        [fake_inputs] + step_model.params + initial_recurrent_states)
    all_outputs = ['input_state_all', 'output_state_all']
    recurrent_states = ['input_state', 'output_state']
    input_state_all, output_state_all, _ = model.net.RecurrentNetwork(
        all_inputs,
        all_outputs + ['step_workspaces'],
        param=map(all_inputs.index, step_model.params),
        alias_src=recurrent_states,
        alias_dst=all_outputs,
        alias_offset=[conv_window - 1, 1],
        recurrent_states=recurrent_states,
        initial_recurrent_state_ids=map(
            all_inputs.index,
            initial_recurrent_states,
        ),
        link_internal=map(
            str,
            [input_state_t_prev, input_state_t, output_state_t],
        ),
        link_external=['input_state', 'input_state', 'output_state'],
        link_offset=[0, conv_window - 1, 1],
        link_window=[conv_window, 1, 1],
        backward_link_internal=[],
        backward_link_external=[],
        backward_link_offset=[],
        step_net=str(step_model.net.Proto()),
        backward_step_net='',
        timestep='timestep' if timestep is None else str(timestep),
        outputs_with_grads=[],
    )
    output_states_2 = self._convolution_1d(
        model=model,
        inputs=input_state_all,
        conv_window=conv_window,
        conv_filter=conv_filter,
        conv_bias=conv_bias,
        output_name='output_states_2',
        left_pad=True,
    )
    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(model.net)
    np.testing.assert_almost_equal(
        workspace.FetchBlob(output_state_all),
        workspace.FetchBlob(output_states_2),
        decimal=3,
    )
def lstm_base(self, lstm_type, outputs_with_grads, memory_optim,
              input_tensor, forget_bias, fwd_only, drop_states):
    print("LSTM test parameters: ", locals())
    create_lstm, ref = lstm_type
    t, n, d = input_tensor.shape
    assert d % 4 == 0
    d = d // 4
    print("Dims: ", t, n, d)
    ref = partial(ref, forget_bias=forget_bias, drop_states=drop_states)

    model = CNNModelHelper(name='external')
    input_blob, seq_lengths, hidden_init, cell_init = (
        model.net.AddExternalInputs(
            'input_blob', 'seq_lengths', 'hidden_init', 'cell_init'))
    create_lstm(
        model, input_blob, seq_lengths, (hidden_init, cell_init), d, d,
        scope="external/recurrent",
        outputs_with_grads=outputs_with_grads,
        memory_optimization=memory_optim,
        forget_bias=forget_bias,
        forward_only=fwd_only,
        drop_states=drop_states,
    )
    op = model.net._net.op[-1]
    workspace.RunNetOnce(model.param_init_net)
    input_blob = op.input[0]

    def generate_random_state(n, d):
        ndim = int(np.random.choice(3, 1)) + 1
        if ndim == 1:
            return np.random.randn(1, n, d).astype(np.float32)
        random_state = np.random.randn(n, d).astype(np.float32)
        if ndim == 3:
            random_state = random_state.reshape([1, n, d])
        return random_state

    workspace.FeedBlob(
        str(input_blob), np.random.randn(t, n, d * 4).astype(np.float32))
    workspace.FeedBlob("hidden_init", generate_random_state(n, d))
    workspace.FeedBlob("cell_init", generate_random_state(n, d))
    workspace.FeedBlob(
        "seq_lengths",
        np.random.randint(1, t + 1, size=(n,)).astype(np.int32)
    )
    inputs = [workspace.FetchBlob(name) for name in op.input]

    self.assertReferenceChecks(
        hu.cpu_do,
        op,
        inputs,
        ref,
        outputs_to_check=range(4),
    )

    # Checking for input, gates_t_w and gates_t_b gradients
    if not fwd_only:
        for param in range(5):
            self.assertGradientChecks(
                device_option=hu.cpu_do,
                op=op,
                inputs=inputs,
                outputs_to_check=param,
                outputs_with_grads=outputs_with_grads,
                threshold=0.01,
                stepsize=0.005,
            )