def baseline_histogram_density(x, parameters):
    h = layers.fc_layer(x, number_of_units=100)
    y_prediction_density = layers.softmax_layer(h, number_of_outputs=4)
    return y_prediction_density
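# A minimal usage sketch for baseline_histogram_density(), assuming the
# layers.fc_layer / layers.softmax_layer helpers operate on a flat
# TensorFlow feature tensor; the feature width and the empty parameters
# dict are illustrative assumptions, not part of the original code.
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 512])
y_hat = baseline_histogram_density(x, parameters={})  # (None, 4) density class probabilities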
def generator(source, target, sequence_length, vocab_size, decoder_fn=None, **opts):
    """
    Args:
        source: TensorFlow queue or placeholder tensor of word ids for the source
        target: TensorFlow queue or placeholder tensor of word ids for the target
        sequence_length: TensorFlow queue or placeholder tensor giving the number of word ids in each sentence
        vocab_size: max vocab size determined from the data
        decoder_fn: custom decoder function; when None, the default dynamic_rnn is used
    """
    tf.logging.info(" Setting up generator")

    embedding_layer = lay.embedding_layer(vocab_size, opts["embedding_dim"], name="embedding_matrix")

    # TODO: add batch norm?
    rnn_outputs = (
        source >>
        embedding_layer >>
        lay.word_dropout_layer(keep_prob=opts["word_dropout_keep_prob"]) >>
        lay.recurrent_layer(hidden_dims=opts["rnn_hidden_dim"],
                            keep_prob=opts["recurrent_dropout_keep_prob"],
                            sequence_length=sequence_length,
                            decoder_fn=decoder_fn,
                            name="rnn_cell")
    )

    output_projection_layer = lay.dense_layer(hidden_dims=vocab_size, name="output_projections")

    flat_logits = (
        rnn_outputs >>
        lay.reshape_layer(shape=(-1, opts["rnn_hidden_dim"])) >>
        output_projection_layer
    )

    probs = flat_logits >> lay.softmax_layer()

    embedding_matrix = embedding_layer.get_variables_in_scope()
    output_projections = output_projection_layer.get_variables_in_scope()

    if decoder_fn is not None:
        return GeneratorTuple(rnn_outputs=rnn_outputs, flat_logits=flat_logits, probs=probs,
                              loss=None, embedding_matrix=embedding_matrix[0],
                              output_projections=output_projections)

    loss = (
        flat_logits >>
        lay.cross_entropy_layer(target=target) >>
        lay.reshape_layer(shape=tf.shape(target)) >>
        lay.mean_loss_by_example_layer(sequence_length=sequence_length)
    )

    # TODO: add dropout penalty
    return GeneratorTuple(rnn_outputs=rnn_outputs, flat_logits=flat_logits, probs=probs,
                          loss=loss, embedding_matrix=embedding_matrix[0],
                          output_projections=output_projections)
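# A minimal usage sketch for generator(), assuming batched int32 id tensors
# (the docstring allows queues or placeholders) and illustrative
# hyperparameter values; the keyword names mirror the opts[...] keys read
# inside the function, and the Adam learning rate is an assumption.
import tensorflow as tf

source_ids = tf.placeholder(tf.int32, shape=(None, None))
target_ids = tf.placeholder(tf.int32, shape=(None, None))
lengths = tf.placeholder(tf.int32, shape=(None,))

g = generator(source_ids, target_ids, lengths, vocab_size=10000,
              embedding_dim=128, rnn_hidden_dim=256,
              word_dropout_keep_prob=0.9, recurrent_dropout_keep_prob=0.9)
train_op = tf.train.AdamOptimizer(1e-3).minimize(g.loss)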
def baseline(x, parameters, nodropout_probability=None, gaussian_noise_std=None):
    if gaussian_noise_std is not None:
        x = layers.all_views_gaussian_noise_layer(x, gaussian_noise_std)

    # first conv sequence
    h = layers.all_views_conv_layer(x, 'conv1', number_of_filters=32, filter_size=[3, 3], stride=[2, 2])

    # second conv sequence
    h = layers.all_views_max_pool(h, stride=[3, 3])
    h = layers.all_views_conv_layer(h, 'conv2a', number_of_filters=64, filter_size=[3, 3], stride=[2, 2])
    h = layers.all_views_conv_layer(h, 'conv2b', number_of_filters=64, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv2c', number_of_filters=64, filter_size=[3, 3], stride=[1, 1])

    # third conv sequence
    h = layers.all_views_max_pool(h, stride=[2, 2])
    h = layers.all_views_conv_layer(h, 'conv3a', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv3b', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv3c', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])

    # fourth conv sequence
    h = layers.all_views_max_pool(h, stride=[2, 2])
    h = layers.all_views_conv_layer(h, 'conv4a', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv4b', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv4c', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])

    # fifth conv sequence
    h = layers.all_views_max_pool(h, stride=[2, 2])
    h = layers.all_views_conv_layer(h, 'conv5a', number_of_filters=256, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv5b', number_of_filters=256, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv5c', number_of_filters=256, filter_size=[3, 3], stride=[1, 1])

    # pooling, flattening, and fully connected layers
    h = layers.all_views_global_avg_pool(h)
    h = layers.all_views_flattening_layer(h)
    h = layers.fc_layer(h, number_of_units=4 * 256)
    h = layers.dropout_layer(h, nodropout_probability)
    y_prediction_density = layers.softmax_layer(h, number_of_outputs=4)
    return y_prediction_density
def baseline(x, parameters, nodropout_probability=None, gaussian_noise_std=None):
    if gaussian_noise_std is not None:
        x = layers.all_views_gaussian_noise_layer(x, gaussian_noise_std)

    # first conv sequence
    h = layers.all_views_conv_layer(x, 'conv1', number_of_filters=32, filter_size=[3, 3], stride=[2, 2])

    # second conv sequence
    h = layers.all_views_max_pool(h, stride=[3, 3])
    h = layers.all_views_conv_layer(h, 'conv2a', number_of_filters=64, filter_size=[3, 3], stride=[2, 2])
    h = layers.all_views_conv_layer(h, 'conv2b', number_of_filters=64, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv2c', number_of_filters=64, filter_size=[3, 3], stride=[1, 1])

    # third conv sequence
    h = layers.all_views_max_pool(h, stride=[2, 2])
    h = layers.all_views_conv_layer(h, 'conv3a', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv3b', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv3c', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])

    # fourth conv sequence
    h = layers.all_views_max_pool(h, stride=[2, 2])
    h = layers.all_views_conv_layer(h, 'conv4a', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv4b', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv4c', number_of_filters=128, filter_size=[3, 3], stride=[1, 1])

    # fifth conv sequence
    h = layers.all_views_max_pool(h, stride=[2, 2])
    h = layers.all_views_conv_layer(h, 'conv5a', number_of_filters=256, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv5b', number_of_filters=256, filter_size=[3, 3], stride=[1, 1])
    h = layers.all_views_conv_layer(h, 'conv5c', number_of_filters=256, filter_size=[3, 3], stride=[1, 1])

    # pool, flatten, and fully connected layers
    h = layers.all_views_global_avg_pool(h)
    h = layers.all_views_flattening_layer(h)  # flattening and concatenation
    h = layers.fc_layer(h, number_of_units=1024)
    # h = layers.dropout_layer(h, nodropout_probability)
    y_prediction_birads = layers.softmax_layer(h, number_of_outputs=3)
    return y_prediction_birads
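# A minimal usage sketch applicable to either baseline() variant above; it
# assumes the all_views_* layers consume a dict keyed by the four standard
# screening views (an assumption about this codebase), and the image shape,
# keep probability, and noise level are illustrative values.
import tensorflow as tf

views = ('L-CC', 'R-CC', 'L-MLO', 'R-MLO')
x = {v: tf.placeholder(tf.float32, shape=[None, 2600, 2000, 1], name=v) for v in views}
y_hat = baseline(x, parameters={}, nodropout_probability=0.8, gaussian_noise_std=0.01)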
def gen_decoder(name, z, n_hidden, n_output, keep_prob, reuse=False,
                output_zero_one=True, output_scalar=False, output_softmax=False):
    with tf.variable_scope("%s_bernoulli_decoder" % name, reuse=reuse):
        if type(n_hidden) is int:
            n_hidden = [z.get_shape()[1], n_hidden]
        elif type(n_hidden) is list:
            n_hidden = [z.get_shape()[1]] + n_hidden
        else:
            raise TypeError("n_hidden needs to be an int or a list")
        num_layers = len(n_hidden)

        # h = layers.tanh_layer(x, x.get_shape()[0], n_hidden[0],
        #                       "enc_l0", reuse, True, keep_prob)
        h = z
        for i in range(num_layers - 1):
            h = layers.tanh_layer(h, n_hidden[i], n_hidden[i + 1],
                                  "dec_l%i" % i, reuse, True, keep_prob)
        if output_zero_one:
            h = layers.sigmoid_layer(h, n_hidden[-1], n_output,
                                     "dec_l%i" % (num_layers - 1), reuse, True, keep_prob)
        elif output_scalar:
            h = layers.linear_layer(h, n_hidden[-1], n_output,
                                    "dec_l%i" % (num_layers - 1), reuse, True, keep_prob)
        elif output_softmax:
            h = layers.softmax_layer(h, n_hidden[-1], n_output,
                                     "dec_l%i" % (num_layers - 1), reuse, True, keep_prob)
        else:
            raise ValueError("one of output_zero_one, output_scalar, or output_softmax must be True")
    return h
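# A minimal usage sketch for gen_decoder(), assuming the layers.*_layer
# helpers take (input, fan_in, fan_out, name, reuse, use_bias, keep_prob)
# as in the calls above; the latent and output sizes are illustrative
# (e.g. a 784-dim binarized-MNIST decoder).
import tensorflow as tf

z = tf.placeholder(tf.float32, shape=[None, 64])
x_recon = gen_decoder("vae", z, n_hidden=[256, 256], n_output=784,
                      keep_prob=0.9, output_zero_one=True)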
def run():
    model_name = 'alexnet'
    directory_caffe = './caffemodel'
    directory_theano = './theanomodel'
    url_prototxt = 'https://raw.githubusercontent.com/BVLC/caffe/master/models/bvlc_alexnet/deploy.prototxt'
    url_caffemodel = 'http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel'
    filename_prototxt = '%s/%s.prototxt' % (directory_caffe, model_name)
    filename_caffemodel = '%s/%s.caffemodel' % (directory_caffe, model_name)
    filename_theanomodel = '%s/%s.model' % (directory_theano, model_name)

    # download caffemodel
    print 'downloading caffemodel'
    if not os.path.exists(directory_caffe):
        os.mkdir(directory_caffe)
    if not os.path.exists(filename_prototxt):
        p = subprocess.Popen(('wget', url_prototxt, '-O', filename_prototxt))
        p.wait()
    if not os.path.exists(filename_caffemodel):
        p = subprocess.Popen(('wget', url_caffemodel, '-O', filename_caffemodel))
        p.wait()

    # load caffe model
    print 'loading caffe model'
    model_caffe = caffe.Net(filename_prototxt, filename_caffemodel, True)
    # flip convolution kernels, since caffe cross-correlates while theano convolves
    conv1_W = theano.shared(model_caffe.params['conv1'][0].data[:, :, ::-1, ::-1])
    conv2_W = theano.shared(model_caffe.params['conv2'][0].data[:, :, ::-1, ::-1])
    conv3_W = theano.shared(model_caffe.params['conv3'][0].data[:, :, ::-1, ::-1])
    conv4_W = theano.shared(model_caffe.params['conv4'][0].data[:, :, ::-1, ::-1])
    conv5_W = theano.shared(model_caffe.params['conv5'][0].data[:, :, ::-1, ::-1])
    conv1_b = theano.shared(model_caffe.params['conv1'][1].data.squeeze())
    conv2_b = theano.shared(model_caffe.params['conv2'][1].data.squeeze())
    conv3_b = theano.shared(model_caffe.params['conv3'][1].data.squeeze())
    conv4_b = theano.shared(model_caffe.params['conv4'][1].data.squeeze())
    conv5_b = theano.shared(model_caffe.params['conv5'][1].data.squeeze())
    fc6_W = theano.shared(model_caffe.params['fc6'][0].data.squeeze())
    fc7_W = theano.shared(model_caffe.params['fc7'][0].data.squeeze())
    fc8_W = theano.shared(model_caffe.params['fc8'][0].data.squeeze())
    fc6_b = theano.shared(model_caffe.params['fc6'][1].data.squeeze())
    fc7_b = theano.shared(model_caffe.params['fc7'][1].data.squeeze())
    fc8_b = theano.shared(model_caffe.params['fc8'][1].data.squeeze())

    # make theano model
    print 'building theano model'
    model_theano = collections.OrderedDict()
    model_theano['data'] = T.tensor4()
    model_theano['conv1'] = layers.convolution_layer(model_theano['data'], conv1_W, conv1_b, subsample=(4, 4))
    model_theano['relu1'] = layers.relu_layer(model_theano['conv1'])
    model_theano['norm1'] = layers.lrn_layer(model_theano['relu1'])
    model_theano['pool1'] = layers.pooling_layer(model_theano['norm1'])
    model_theano['conv2'] = layers.convolution_layer(model_theano['pool1'], conv2_W, conv2_b, border='same', group=2)
    model_theano['relu2'] = layers.relu_layer(model_theano['conv2'])
    model_theano['norm2'] = layers.lrn_layer(model_theano['relu2'])
    model_theano['pool2'] = layers.pooling_layer(model_theano['norm2'])
    model_theano['conv3'] = layers.convolution_layer(model_theano['pool2'], conv3_W, conv3_b, border='same')
    model_theano['relu3'] = layers.relu_layer(model_theano['conv3'])
    model_theano['conv4'] = layers.convolution_layer(model_theano['relu3'], conv4_W, conv4_b, border='same', group=2)
    model_theano['relu4'] = layers.relu_layer(model_theano['conv4'])
    model_theano['conv5'] = layers.convolution_layer(model_theano['relu4'], conv5_W, conv5_b, border='same', group=2)
    model_theano['relu5'] = layers.relu_layer(model_theano['conv5'])
    model_theano['pool5'] = layers.pooling_layer(model_theano['relu5'])
    model_theano['fc6'] = layers.inner_product_layer(model_theano['pool5'], fc6_W, fc6_b)
    model_theano['relu6'] = layers.relu_layer(model_theano['fc6'])
    model_theano['fc7'] = layers.inner_product_layer(model_theano['relu6'], fc7_W, fc7_b)
    model_theano['relu7'] = layers.relu_layer(model_theano['fc7'])
    model_theano['fc8'] = layers.inner_product_layer(model_theano['relu7'], fc8_W, fc8_b)
    model_theano['prob'] = layers.softmax_layer(model_theano['fc8'])

    # check that caffe and theano agree on random input
    print 'checking model'
    data = np.random.randn(*model_caffe.blobs['data'].data.shape)
    data = data.astype(np.float32) * 10
    model_caffe.blobs['data'].data[:] = data
    model_caffe.forward()
    theano_output = theano.function(
        [model_theano['data']],
        model_theano['prob'],
    )(data)
    error = (
        theano_output.squeeze() - model_caffe.blobs['prob'].data.squeeze()
    ).max()
    assert error < 1e-6

    # save
    print 'saving'
    if not os.path.exists(directory_theano):
        os.mkdir(directory_theano)
    sys.setrecursionlimit(100000)
    pickle.dump(
        model_theano,
        open(filename_theanomodel, 'wb'),
        protocol=pickle.HIGHEST_PROTOCOL,
    )
    print 'done'
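# A minimal sketch of consuming the pickled graph produced by run() above;
# unpickling assumes the same layers module is importable (the stored
# symbolic graph references it), and the 227x227 input size follows the
# bvlc_alexnet deploy.prototxt. The zero input is a stand-in for real data.
import pickle
import numpy as np
import theano

model = pickle.load(open('./theanomodel/alexnet.model', 'rb'))
predict = theano.function([model['data']], model['prob'])
probs = predict(np.zeros((1, 3, 227, 227), dtype=np.float32))
print probs.argmax(axis=1)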
def loss(self, X, y=None, hprev=None, softmax=True):
    """
    Compute training-time loss for the RNN. We input sentences and their
    ground-truth labels, and use an RNN (or LSTM) to compute the loss on
    all parameters.

    Inputs:
    - X: input sentences, shape (N, T); T is the max padded sequence length
      and N is the batch size. Each element is in the range 0 <= X[i, t] < V.
      X is trimmed down to (N, TS) below.
    - y: labels for the sentences, shape (N,)
    - hprev: initial hidden state, shape (N, H); when None, it is
      initialized to all zeros
    - softmax: bool (currently unused)

    Returns:
    - probs if y is None (test mode), otherwise the scalar mean
      cross-entropy loss (train mode)
    """
    mode = "test" if y is None else "train"
    # we need to clean out self.updates, if loss() is called again
    if len(self.updates) != 0:
        self.updates = []

    # affine projection of the last hidden state into the softmax;
    # W_proj: (input_dim, hidden_dim) = (D, H)
    W_proj, b_proj = self.params["W_proj"], self.params["b_proj"]
    # word embedding matrix
    W_embed = self.params["W_embed"]
    # input-to-hidden, hidden-to-hidden, and bias parameters for the RNN
    Wx, Wh, b = self.params["Wx"], self.params["Wh"], self.params["b"]

    H = self.hidden_dim
    N = self.batch_size
    TS = self.max_seq_length
    loss, grads = 0.0, {}

    # Forward pass for the SentimentRNN:
    # (0) trim X down to TS timesteps
    # (1) embed the word indices, giving an array of shape (N, TS, W)
    # (2) initialize the hidden state at zero, shape (N, H)
    # (3) run a vanilla RNN or LSTM (depending on self.cell_type) over the
    #     input word vectors, producing hidden states of shape (N, TS, H)
    # (4) apply an affine transformation to the last hidden state
    # (5) apply softmax to produce a label distribution for the sentence

    # step (0): X (N, T) -> (N, TS)
    X = X[:, :TS]

    # step (1): word embedding, (N, TS, W)
    out_word_embedded = word_embedding_forward(X, W_embed)

    # step (2): hprev (N, H)
    if hprev is None:
        hprev = np.zeros((N, H), dtype=self.dtype)

    # step (3): hidden states (N, TS, H)
    hs = None
    h_states_shapes = (N, TS, H)
    if self.cell_type == "rnn":
        hs = rnn_forward(out_word_embedded, hprev, Wx, Wh, b, h_states_shapes)
    elif self.cell_type == "lstm":
        hs = lstm_forward(out_word_embedded, hprev, Wx, Wh, b, h_states_shapes)
    last_hs = hs[:, -1, :]  # (N, H)

    # step (4): last_hs (N, H), W_proj (H, H), b_proj (H,);
    # the projection shares the hidden dimensionality of the RNN
    out_aff = affine_layer(last_hs, W_proj, b_proj)

    # step (5)
    probs = softmax_layer(out_aff)

    if y is None:
        out = probs
    else:
        out = -T.mean(T.log(probs)[T.arange(y.shape[0]), y])
    return out
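# A minimal sketch of calling loss(); the SentimentRNN class is named in the
# comments above, but its constructor signature here is hypothetical, and
# the vocabulary, batch, and label sizes are illustrative.
import numpy as np

model = SentimentRNN(vocab_size=5000, hidden_dim=128, batch_size=32,
                     max_seq_length=20, cell_type='lstm')  # hypothetical arguments
X = np.random.randint(0, 5000, size=(32, 30))  # padded sentences; trimmed to 20 steps inside
y = np.random.randint(0, 2, size=(32,))        # binary sentiment labels
train_loss = model.loss(X, y)  # symbolic mean cross-entropy (train mode)
test_probs = model.loss(X)     # class probabilities (test mode)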
def run(): model_name = "vggnet" directory_caffe = "./caffemodel" directory_theano = "./theanomodel" url_prototxt = "https://gist.githubusercontent.com/ksimonyan/3785162f95cd2d5fee77/raw/f02f8769e64494bcd3d7e97d5d747ac275825721/VGG_ILSVRC_19_layers_deploy.prototxt" url_caffemodel = "http://www.robots.ox.ac.uk/~vgg/software/very_deep/caffe/VGG_ILSVRC_19_layers.caffemodel" filename_prototxt = "%s/%s.prototxt" % (directory_caffe, model_name) filename_caffemodel = "%s/%s.caffemodel" % (directory_caffe, model_name) filename_theanomodel = "%s/%s.model" % (directory_theano, model_name) # download caffemodel print "downloading caffemodel" if not os.path.exists(directory_caffe): os.mkdir(directory_caffe) if not os.path.exists(filename_prototxt): p = subprocess.Popen(("wget", url_prototxt, "-O", filename_prototxt)) p.wait() if not os.path.exists(filename_caffemodel): p = subprocess.Popen(("wget", url_caffemodel, "-O", filename_caffemodel)) p.wait() # load caffe model model_caffe = caffe.Net(filename_prototxt, filename_caffemodel, True) conv1_1_W = theano.shared(model_caffe.params["conv1_1"][0].data[:, :, ::-1, ::-1]) conv1_2_W = theano.shared(model_caffe.params["conv1_2"][0].data[:, :, ::-1, ::-1]) conv2_1_W = theano.shared(model_caffe.params["conv2_1"][0].data[:, :, ::-1, ::-1]) conv2_2_W = theano.shared(model_caffe.params["conv2_2"][0].data[:, :, ::-1, ::-1]) conv3_1_W = theano.shared(model_caffe.params["conv3_1"][0].data[:, :, ::-1, ::-1]) conv3_2_W = theano.shared(model_caffe.params["conv3_2"][0].data[:, :, ::-1, ::-1]) conv3_3_W = theano.shared(model_caffe.params["conv3_3"][0].data[:, :, ::-1, ::-1]) conv3_4_W = theano.shared(model_caffe.params["conv3_4"][0].data[:, :, ::-1, ::-1]) conv4_1_W = theano.shared(model_caffe.params["conv4_1"][0].data[:, :, ::-1, ::-1]) conv4_2_W = theano.shared(model_caffe.params["conv4_2"][0].data[:, :, ::-1, ::-1]) conv4_3_W = theano.shared(model_caffe.params["conv4_3"][0].data[:, :, ::-1, ::-1]) conv4_4_W = theano.shared(model_caffe.params["conv4_4"][0].data[:, :, ::-1, ::-1]) conv5_1_W = theano.shared(model_caffe.params["conv5_1"][0].data[:, :, ::-1, ::-1]) conv5_2_W = theano.shared(model_caffe.params["conv5_2"][0].data[:, :, ::-1, ::-1]) conv5_3_W = theano.shared(model_caffe.params["conv5_3"][0].data[:, :, ::-1, ::-1]) conv5_4_W = theano.shared(model_caffe.params["conv5_4"][0].data[:, :, ::-1, ::-1]) conv1_1_b = theano.shared(model_caffe.params["conv1_1"][1].data.squeeze()) conv1_2_b = theano.shared(model_caffe.params["conv1_2"][1].data.squeeze()) conv2_1_b = theano.shared(model_caffe.params["conv2_1"][1].data.squeeze()) conv2_2_b = theano.shared(model_caffe.params["conv2_2"][1].data.squeeze()) conv3_1_b = theano.shared(model_caffe.params["conv3_1"][1].data.squeeze()) conv3_2_b = theano.shared(model_caffe.params["conv3_2"][1].data.squeeze()) conv3_3_b = theano.shared(model_caffe.params["conv3_3"][1].data.squeeze()) conv3_4_b = theano.shared(model_caffe.params["conv3_4"][1].data.squeeze()) conv4_1_b = theano.shared(model_caffe.params["conv4_1"][1].data.squeeze()) conv4_2_b = theano.shared(model_caffe.params["conv4_2"][1].data.squeeze()) conv4_3_b = theano.shared(model_caffe.params["conv4_3"][1].data.squeeze()) conv4_4_b = theano.shared(model_caffe.params["conv4_4"][1].data.squeeze()) conv5_1_b = theano.shared(model_caffe.params["conv5_1"][1].data.squeeze()) conv5_2_b = theano.shared(model_caffe.params["conv5_2"][1].data.squeeze()) conv5_3_b = theano.shared(model_caffe.params["conv5_3"][1].data.squeeze()) conv5_4_b = 
theano.shared(model_caffe.params["conv5_4"][1].data.squeeze()) fc6_W = theano.shared(model_caffe.params["fc6"][0].data.squeeze()) fc7_W = theano.shared(model_caffe.params["fc7"][0].data.squeeze()) fc8_W = theano.shared(model_caffe.params["fc8"][0].data.squeeze()) fc6_b = theano.shared(model_caffe.params["fc6"][1].data.squeeze()) fc7_b = theano.shared(model_caffe.params["fc7"][1].data.squeeze()) fc8_b = theano.shared(model_caffe.params["fc8"][1].data.squeeze()) # make theano model model_theano = collections.OrderedDict() model_theano["data"] = T.tensor4() model_theano["conv1_1"] = layers.convolution_layer(model_theano["data"], conv1_1_W, conv1_1_b, border="same") model_theano["relu1_1"] = layers.relu_layer(model_theano["conv1_1"]) model_theano["conv1_2"] = layers.convolution_layer(model_theano["relu1_1"], conv1_2_W, conv1_2_b, border="same") model_theano["relu1_2"] = layers.relu_layer(model_theano["conv1_2"]) model_theano["pool1"] = layers.pooling_layer(model_theano["relu1_2"], size=(2, 2), stride=(2, 2)) model_theano["conv2_1"] = layers.convolution_layer(model_theano["pool1"], conv2_1_W, conv2_1_b, border="same") model_theano["relu2_1"] = layers.relu_layer(model_theano["conv2_1"]) model_theano["conv2_2"] = layers.convolution_layer(model_theano["relu2_1"], conv2_2_W, conv2_2_b, border="same") model_theano["relu2_2"] = layers.relu_layer(model_theano["conv2_2"]) model_theano["pool2"] = layers.pooling_layer(model_theano["relu2_2"], size=(2, 2), stride=(2, 2)) model_theano["conv3_1"] = layers.convolution_layer(model_theano["pool2"], conv3_1_W, conv3_1_b, border="same") model_theano["relu3_1"] = layers.relu_layer(model_theano["conv3_1"]) model_theano["conv3_2"] = layers.convolution_layer(model_theano["relu3_1"], conv3_2_W, conv3_2_b, border="same") model_theano["relu3_2"] = layers.relu_layer(model_theano["conv3_2"]) model_theano["conv3_3"] = layers.convolution_layer(model_theano["relu3_2"], conv3_3_W, conv3_3_b, border="same") model_theano["relu3_3"] = layers.relu_layer(model_theano["conv3_3"]) model_theano["conv3_4"] = layers.convolution_layer(model_theano["relu3_3"], conv3_4_W, conv3_4_b, border="same") model_theano["relu3_4"] = layers.relu_layer(model_theano["conv3_4"]) model_theano["pool3"] = layers.pooling_layer(model_theano["relu3_4"], size=(2, 2), stride=(2, 2)) model_theano["conv4_1"] = layers.convolution_layer(model_theano["pool3"], conv4_1_W, conv4_1_b, border="same") model_theano["relu4_1"] = layers.relu_layer(model_theano["conv4_1"]) model_theano["conv4_2"] = layers.convolution_layer(model_theano["relu4_1"], conv4_2_W, conv4_2_b, border="same") model_theano["relu4_2"] = layers.relu_layer(model_theano["conv4_2"]) model_theano["conv4_3"] = layers.convolution_layer(model_theano["relu4_2"], conv4_3_W, conv4_3_b, border="same") model_theano["relu4_3"] = layers.relu_layer(model_theano["conv4_3"]) model_theano["conv4_4"] = layers.convolution_layer(model_theano["relu4_3"], conv4_4_W, conv4_4_b, border="same") model_theano["relu4_4"] = layers.relu_layer(model_theano["conv4_4"]) model_theano["pool4"] = layers.pooling_layer(model_theano["relu4_4"], size=(2, 2), stride=(2, 2)) model_theano["conv5_1"] = layers.convolution_layer(model_theano["pool4"], conv5_1_W, conv5_1_b, border="same") model_theano["relu5_1"] = layers.relu_layer(model_theano["conv5_1"]) model_theano["conv5_2"] = layers.convolution_layer(model_theano["relu5_1"], conv5_2_W, conv5_2_b, border="same") model_theano["relu5_2"] = layers.relu_layer(model_theano["conv5_2"]) model_theano["conv5_3"] = 
layers.convolution_layer(model_theano["relu5_2"], conv5_3_W, conv5_3_b, border="same") model_theano["relu5_3"] = layers.relu_layer(model_theano["conv5_3"]) model_theano["conv5_4"] = layers.convolution_layer(model_theano["relu5_3"], conv5_4_W, conv5_4_b, border="same") model_theano["relu5_4"] = layers.relu_layer(model_theano["conv5_4"]) model_theano["pool5"] = layers.pooling_layer(model_theano["relu5_4"], size=(2, 2), stride=(2, 2)) model_theano["fc6"] = layers.inner_product_layer(model_theano["pool5"], fc6_W, fc6_b) model_theano["relu6"] = layers.relu_layer(model_theano["fc6"]) model_theano["fc7"] = layers.inner_product_layer(model_theano["relu6"], fc7_W, fc7_b) model_theano["relu7"] = layers.relu_layer(model_theano["fc7"]) model_theano["fc8"] = layers.inner_product_layer(model_theano["relu7"], fc8_W, fc8_b) model_theano["prob"] = layers.softmax_layer(model_theano["fc8"]) # check data = np.random.randn(*model_caffe.blobs["data"].data.shape).astype(np.float32) * 10 model_caffe.blobs["data"].data[:] = data model_caffe.forward() theano_output = theano.function([model_theano["data"]], model_theano["prob"])(data) error = (theano_output.squeeze() - model_caffe.blobs["prob"].data.squeeze()).max() assert error < 1e-6 # save print "saving" if not os.path.exists(directory_theano): os.mkdir(directory_theano) sys.setrecursionlimit(100000) pickle.dump(model_theano, open(filename_theanomodel, "wb"), protocol=pickle.HIGHEST_PROTOCOL) print "done"