# Symbolic graph inputs: two int32 vectors (T.ivector) and one float32 vector
# (T.fvector).
word_indices = T.ivector()
cnn_features = T.fvector()
true_dist = T.ivector()

# Forward pass. The word indices and the CNN features are embedded by separate
# layers, combined by row_stack_layer (CNN embedding passed first), and then
# passed through the embedding dropout layer.
# NOTE(review): presumably the CNN embedding becomes the first row of the
# stacked sequence -- confirm against row_stack_layer.
word_embedings = word_embedding_layer.get_output_expr(word_indices)
cnn_embedings = cnn_embedding_layer.get_output_expr(cnn_features)
embedings = row_stack_layer.get_output_expr(cnn_embedings, word_embedings)
masked_embedings = embedding_dropout_layer.get_output_expr(embedings)
h = lstm_layer.get_output_expr(masked_embedings)
# Only h[1:] is fed forward, i.e. the first entry of h is dropped.
# NOTE(review): presumably h[0] is the initial LSTM state rather than a
# prediction step -- confirm against lstm_layer.
masked_h = hidden_states_dropout_layer.get_output_expr(h[1:])
unnormalized_probs = pre_softmax_layer.get_output_expr(masked_h)
probs = softmax_layer.get_output_expr(unnormalized_probs)

# Training objective: mean categorical cross-entropy of probs against
# true_dist.
# NOTE(review): true_dist is an integer vector, so it is presumably a vector of
# target class indices (one per row of probs), not one-hot rows -- confirm.
loss = T.mean(T.nnet.categorical_crossentropy(probs, true_dist))

# Update rules for training. The word embedding layer's parameters are passed
# as sparse_parameters; the parameters of every other layer are concatenated
# and passed as dense_parameters.
updates = get_nesterov_momentum_updates(loss_expr=loss,
                                        dense_parameters=cnn_embedding_layer.get_parameters() + \
                                                         row_stack_layer.get_parameters() + \
                                                         embedding_dropout_layer.get_parameters() + \
                                                         lstm_layer.get_parameters() + \
                                                         hidden_states_dropout_layer.get_parameters() + \
                                                         pre_softmax_layer.get_parameters() + \
                                                         softmax_layer.get_parameters(),
                                        sparse_parameters=word_embedding_layer.get_parameters(),
                                        learning_rate=learning_rate,
                                        momentum=0.9)

# compile model training function
# The compiled function takes three int32 scalars and returns the loss.
# NOTE(review): presumably these select one CNN feature vector and one caption
# span from data stored elsewhere -- confirm against the remaining arguments of
# this call.
cnn_features_idx = T.iscalar()
caption_begin = T.iscalar()
caption_end = T.iscalar()
train_model = theano.function(
    inputs=[cnn_features_idx, caption_begin, caption_end],
    outputs=loss,
# Symbolic graph inputs: two int32 vectors (T.ivector) and one float32 vector
# (T.fvector).
word_indices = T.ivector()
cnn_features = T.fvector()
true_dist = T.ivector()

# Forward pass. The word indices and the CNN features are embedded by separate
# layers, combined by row_stack_layer (CNN embedding passed first), and then
# passed through the embedding dropout layer.
# NOTE(review): presumably the CNN embedding becomes the first row of the
# stacked sequence -- confirm against row_stack_layer.
word_embedings = word_embedding_layer.get_output_expr(word_indices)
cnn_embedings = cnn_embedding_layer.get_output_expr(cnn_features)
embedings = row_stack_layer.get_output_expr(cnn_embedings, word_embedings)
masked_embedings = embedding_dropout_layer.get_output_expr(embedings)
h = lstm_layer.get_output_expr(masked_embedings)
# Only h[1:] is fed forward, i.e. the first entry of h is dropped.
# NOTE(review): presumably h[0] is the initial LSTM state rather than a
# prediction step -- confirm against lstm_layer.
masked_h = hidden_states_dropout_layer.get_output_expr(h[1:])
unnormalized_probs = pre_softmax_layer.get_output_expr(masked_h)
probs = softmax_layer.get_output_expr(unnormalized_probs)

# Training objective: mean categorical cross-entropy of probs against
# true_dist.
# NOTE(review): true_dist is an integer vector, so it is presumably a vector of
# target class indices (one per row of probs), not one-hot rows -- confirm.
loss = T.mean(T.nnet.categorical_crossentropy(probs, true_dist))

# Update rules for training. The word embedding layer's parameters are passed
# as sparse_parameters; the parameters of every other layer are concatenated
# and passed as dense_parameters.
updates = get_nesterov_momentum_updates(loss_expr=loss,
                                        dense_parameters=cnn_embedding_layer.get_parameters() + \
                                                         row_stack_layer.get_parameters() + \
                                                         embedding_dropout_layer.get_parameters() + \
                                                         lstm_layer.get_parameters() + \
                                                         hidden_states_dropout_layer.get_parameters() + \
                                                         pre_softmax_layer.get_parameters() + \
                                                         softmax_layer.get_parameters(),
                                        sparse_parameters=word_embedding_layer.get_parameters(),
                                        learning_rate=learning_rate,
                                        momentum=0.9)

# compile model training function
# The compiled function takes three int32 scalars as its inputs.
# NOTE(review): presumably these select one CNN feature vector and one caption
# span from data stored elsewhere -- confirm against the remaining arguments of
# this call.
cnn_features_idx = T.iscalar()
caption_begin = T.iscalar()
caption_end = T.iscalar()
train_model = theano.function(
    inputs=[cnn_features_idx, caption_begin, caption_end],