def build_fft_scale(x, y, size):
    W = []
    pnet = ll.InputLayer((None, 3, 101, 101), input_var=None)
    pnet = ll.Conv2DLayer(pnet, 64, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.NonlinearityLayer(ll.BatchNormLayer(pnet))
    pnet = ll.Pool2DLayer(pnet, (3, 3), (2, 2))
    pnet = ll.Conv2DLayer(pnet, 64, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.NonlinearityLayer(
        ll.BatchNormLayer(pnet),
        nonlinearity=l.nonlinearities.LeakyRectify(0.1))
    pnet = ll.Conv2DLayer(pnet, 32, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.BatchNormLayer(pnet)

    x_p, y_p = ll.get_output(pnet, x), ll.get_output(pnet, y)
    z_p = Customfftlayer(x_p, y_p)

    net = ll.InputLayer((None, 64, 50, 50), input_var=z_p)
    net = ll.BatchNormLayer(net)
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.BatchNormLayer(ll.Conv2DLayer(net, 10, (1, 1), nonlinearity=None))

    # Scale branch: returns the relative scale change, x_new / x_old - 1.
    p_scale = ll.get_output(net)
    #p_scale = theano.gradient.disconnected_grad(p_scale)
    net_scale = ll.InputLayer((None, 10, 25, 25), p_scale)
    net_scale = ll.DenseLayer(net_scale, 100, b=None,
                              nonlinearity=l.nonlinearities.tanh)
    W.append(net_scale.get_params(regularizable=True)[0])
    net_scale = ll.DenseLayer(net_scale, 2, b=None, nonlinearity=None)

    # Heatmap branch: returns a heatmap at 2x upsampling of `size`.
    net_heat = ll.DenseLayer(net, 500, b=None,
                             nonlinearity=l.nonlinearities.tanh)
    W.append(net_heat.get_params(regularizable=True)[0])
    net_heat = ll.DenseLayer(net_heat, size**2, b=None, nonlinearity=None)
    W.append(net_heat.get_params(regularizable=True)[0])
    net_heat = ll.BatchNormLayer(net_heat)
    net_heat = ll.Deconv2DLayer(
        ll.ReshapeLayer(net_heat, ([0], 1, size, size)),
        64, (5, 5), (2, 2), b=None, crop='same', nonlinearity=None)
    net_heat = ll.BatchNormLayer(net_heat)
    net_heat = ll.Conv2DLayer(net_heat, 1, (3, 3), b=None, pad='same',
                              nonlinearity=None)
    W.append(net_heat.get_params(regularizable=True)[0])
    return pnet, net_scale, net_heat, W
def build_siamese(layer):
    """Attach a siamese (left/right) verification head on top of `layer`."""
    smx = nonlinearities.softmax
    lnr = nonlinearities.linear
    layers = L.get_all_layers(layer)
    # Reuse the first non-softmax, non-linear nonlinearity found in the stack.
    nl = filter(
        lambda l: hasattr(l, 'nonlinearity') and (
            (l.nonlinearity != smx) and (l.nonlinearity != lnr)),
        layers)[0].nonlinearity

    if len(layers[0].output_shape) == 3:
        Xl = T.tensor3('left')
        Xr = T.tensor3('right')
    elif len(layers[0].output_shape) == 4:
        Xl = T.tensor4('left')
        Xr = T.tensor4('right')

    Ol = L.get_output(layer, inputs=Xl)
    # Ol_vl = L.get_output(layer, inputs=Xl, deterministic=True)
    Or = L.get_output(layer, inputs=Xr)
    O = T.concatenate([Ol, Or], axis=-1)

    layer = L.InputLayer((None, layer.output_shape[-1] * 2), input_var=O)
    layer = L.DenseLayer(layer, 128, nonlinearity=None, name='hc1')
    layer = L.BatchNormLayer(layer)
    layer = L.NonlinearityLayer(layer, nonlinearity=nl)
    layer = L.DenseLayer(layer, 2, nonlinearity=smx)
    return layer, (Xl, Xr)
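# --- Usage sketch (illustrative, not from the original source) ---
# Shows how the two symbolic inputs returned by build_siamese are compiled
# into a pair-matching function. `feature_net` and `_siamese_usage_example`
# are hypothetical names; assumes the aliases used above (L = lasagne.layers,
# nonlinearities = lasagne.nonlinearities) and a module-level `import theano`.
def _siamese_usage_example():
    # A small embedding trunk with a 4D input, so build_siamese picks tensor4.
    feature_net = L.InputLayer((None, 1, 28, 28))
    feature_net = L.DenseLayer(feature_net, 64,
                               nonlinearity=nonlinearities.rectify)
    head, (Xl, Xr) = build_siamese(feature_net)
    # 2-way softmax over (same, different) for a pair of inputs.
    match_fn = theano.function([Xl, Xr], L.get_output(head))
    return match_fn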
def build_network(self, mfcc_input_var):
    print('Building cnn with parameters:')
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(self.net_opts)

    mfcc_network = layers.InputLayer((None, 130, MC_LENGTH), mfcc_input_var)
    mfcc_network = layers.BatchNormLayer(mfcc_network)
    mfcc_network = self.set_conv_layer(mfcc_network, 'conv_1', bnorm=False)
    mfcc_network = self.set_pool_layer(mfcc_network, 'pool_1')
    mfcc_network = self.set_conv_layer(mfcc_network, 'conv_2', bnorm=False)
    mfcc_network = self.set_pool_layer(mfcc_network, 'pool_2')
    for n in self.net_opts['layer_list']:
        # mfcc_network = layers.batch_norm(layers.DenseLayer(
        #     layers.dropout(mfcc_network, p=self.net_opts['dropout_p']),
        #     n, nonlinearity=lasagne.nonlinearities.rectify))
        mfcc_network = layers.DenseLayer(
            layers.dropout(mfcc_network, p=self.net_opts['dropout_p']),
            n,
            nonlinearity=lasagne.nonlinearities.rectify)
        # mfcc_network = layers.BatchNormLayer(mfcc_network)
    mfcc_network = layers.DenseLayer(
        layers.dropout(mfcc_network, p=self.net_opts['dropout_p']),
        self.net_opts['num_class'],
        nonlinearity=lasagne.nonlinearities.softmax)
    self.network = mfcc_network
    return self.network
def __build_48_net__(self):
    model24 = self.subnet
    network = layers.InputLayer((None, 3, 48, 48),
                                input_var=self.__input_var__)
    network = layers.Conv2DLayer(network, num_filters=64, filter_size=(5, 5),
                                 stride=1, nonlinearity=relu)
    network = layers.batch_norm(
        layers.MaxPool2DLayer(network, pool_size=(3, 3), stride=2))
    network = layers.Conv2DLayer(network, num_filters=64, filter_size=(5, 5),
                                 stride=1, nonlinearity=relu)
    network = layers.BatchNormLayer(network)
    network = layers.MaxPool2DLayer(network, pool_size=(3, 3), stride=2)
    network = layers.DenseLayer(network, num_units=256, nonlinearity=relu)
    #network = layers.Conv2DLayer(network, num_filters=256,
    #                             filter_size=(1, 1), stride=1,
    #                             nonlinearity=relu)
    denselayer24 = model24.net.input_layer
    network = layers.ConcatLayer([network, denselayer24])
    network = layers.DenseLayer(network, num_units=2, nonlinearity=softmax)
    return network
def build_network(self, ra_input_var, mc_input_var):
    print('Building raw dnn with parameters:')
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(self.net_opts)

    ra_network_1 = layers.InputLayer((None, 1, 3969), ra_input_var)
    ra_network_1 = self.set_conv_layer(ra_network_1, 'ra_conv_1',
                                       dropout=False, pad='same')
    ra_network_1 = self.set_pool_layer(ra_network_1, 'ra_pool_1')
    ra_network_1 = self.set_conv_layer(ra_network_1, 'ra_conv_2', pad='same')
    ra_network_1 = self.set_pool_layer(ra_network_1, 'ra_pool_2')
    ra_network_1 = self.set_conv_layer(ra_network_1, 'ra_conv_3', pad='same')
    ra_network_1 = self.set_pool_layer(ra_network_1, 'ra_pool_3')
    ra_network_1 = self.set_conv_layer(ra_network_1, 'ra_conv_4', pad='same')
    ra_network_1 = self.set_pool_layer(ra_network_1, 'ra_pool_4')
    concat_list = [ra_network_1]

    mc_input = layers.InputLayer((None, 2, MC_LENGTH), mc_input_var)
    concat_list.append(mc_input)
    network = layers.ConcatLayer(concat_list, axis=1,
                                 cropping=[None, None, 'center'])
    network = layers.BatchNormLayer(network)
    for n in self.net_opts['layer_list']:
        network = layers.DenseLayer(
            layers.dropout(network, p=self.net_opts['dropout_p']),
            n,
            nonlinearity=lasagne.nonlinearities.rectify)
    network = layers.DenseLayer(
        layers.dropout(network, p=self.net_opts['dropout_p']),
        self.net_opts['num_class'],
        nonlinearity=lasagne.nonlinearities.softmax)
    # print(layers.get_output_shape(network))
    self.network = network
    return self.network
def build_TOY(x, y):
    z_p = T.concatenate((x, y), axis=1)
    net = ll.InputLayer((None, 2, 100, 100), input_var=z_p)
    net = ll.BatchNormLayer(net)
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
    net = ll.BatchNormLayer(ll.Conv2DLayer(net, 10, (1, 1), nonlinearity=None))
    net = ll.DenseLayer(net, 625, b=None, nonlinearity=None)
    net = ll.ReshapeLayer(net, ([0], 1, 25, 25))
    return net
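# --- Usage sketch (illustrative, not from the original source) ---
# build_TOY consumes two symbolic 4D tensors directly (they are concatenated
# into the InputLayer's input_var), so the compiled function takes both.
# `_toy_usage_example` and the shapes are assumptions; requires a module-level
# `import theano` alongside the `ll`/`T` aliases used above.
def _toy_usage_example():
    x = T.tensor4('x')  # assumed (batch, 1, 100, 100)
    y = T.tensor4('y')  # assumed (batch, 1, 100, 100)
    net = build_TOY(x, y)
    # Compile a function mapping the two inputs to the 25x25 output map.
    heatmap_fn = theano.function([x, y], ll.get_output(net))
    return heatmap_fn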
def output_block(net, config, non_lin, verbose=True):
    """Attach global pooling, a feature layer, and per-target output layers."""
    # output setting
    out_acts = []
    for out_act in config.hyper_parameters.out_act:
        exec('from lasagne.nonlinearities import {}'.format(out_act))
        out_acts.append(eval(out_act))
    n_outs = config.hyper_parameters.n_out

    # Global Average Pooling
    last_conv_block_name = next(reversed(net))
    net['gap'] = L.GlobalPoolLayer(net[last_conv_block_name], name='gap')
    net['gap.bn'] = L.BatchNormLayer(net['gap'], name='gap.bn')
    n_features = net['gap.bn'].output_shape[-1]

    # feature layer
    net['fc'] = L.dropout(
        L.batch_norm(
            L.DenseLayer(net['gap.bn'], num_units=n_features,
                         nonlinearity=non_lin, name='fc')),
        name='fc.bn.do')

    # output (prediction)
    # Check whether the model is for MTL or STL; the target is passed as a
    # list either way (the configuration checker enforces this).
    targets = config.target
    out_layer_names = []
    for target, n_out, out_act in zip(targets, n_outs, out_acts):
        out_layer_names.append('out.{}'.format(target))
        if target == 'self':
            net[out_layer_names[-1]], inputs = build_siamese(net['fc'])
        else:
            net[out_layer_names[-1]] = L.DenseLayer(
                net['fc'], num_units=n_out, nonlinearity=out_act,
                name=out_layer_names[-1])
            inputs = [net['input'].input_var]

    # make a concatenation layer just for save/load purposes
    net['IO'] = L.ConcatLayer([
        L.FlattenLayer(net[target_layer_name])
        if target == 'self' else net[target_layer_name]
        for target_layer_name in out_layer_names
    ], name='IO')

    if verbose:
        print(net['gap.bn'].output_shape)
        print(net['fc'].output_shape)
        for target in targets:
            print(net['out.{}'.format(target)].output_shape)

    return net, inputs
def BatchNormRecurrentLayer(incoming, num_units, nonlinearity=None,
                            gradient_steps=-1, grad_clipping=0,
                            layer_type=layers.CustomRecurrentLayer, name='',
                            **kwargs):
    """
    Helper method to define a Vanilla Recurrent Layer with batch normalization
    """
    input_shape = incoming.output_shape

    # Define input to hidden connections
    in_to_hid_rf = layers.InputLayer((None, ) + input_shape[2:])
    in_to_hid_rf = layers.DenseLayer(in_to_hid_rf, num_units, b=None,
                                     nonlinearity=None,
                                     name='ith_{0}'.format(name))
    in_to_hid_rf_W = in_to_hid_rf.W
    # Use batch normalization in the input to hidden connections
    in_to_hid_rf = layers.BatchNormLayer(in_to_hid_rf,
                                         name='ith_bn_{0}'.format(name))

    # Define hidden to hidden connections
    hid_to_hid_rf = layers.InputLayer((None, num_units))
    hid_to_hid_rf = layers.DenseLayer(hid_to_hid_rf, num_units, b=None,
                                      nonlinearity=None,
                                      name='hth_{0}'.format(name))

    l_r_f = layer_type(incoming,
                       input_to_hidden=in_to_hid_rf,
                       hidden_to_hidden=hid_to_hid_rf,
                       gradient_steps=gradient_steps,
                       grad_clipping=grad_clipping,
                       nonlinearity=nonlinearity,
                       name='l_r_{0}'.format(name),
                       **kwargs)

    # Make layer parameters intuitively accessible
    l_r_f.W_in_to_hid = in_to_hid_rf_W
    l_r_f.W_hid_to_hid = hid_to_hid_rf.W
    l_r_f.beta = in_to_hid_rf.beta
    l_r_f.gamma = in_to_hid_rf.gamma
    l_r_f.mean = in_to_hid_rf.mean
    l_r_f.inv_std = in_to_hid_rf.inv_std
    l_r_f.hid_init = l_r_f.hid_init
    return l_r_f
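# --- Usage sketch (illustrative, not from the original source) ---
# Builds a batch-normalized recurrent layer over (batch, time, features)
# sequences. `_bn_recurrent_usage_example` and the sizes/clipping values are
# assumptions; assumes `layers` = lasagne.layers and `import lasagne` as above.
def _bn_recurrent_usage_example():
    l_in = layers.InputLayer((None, 20, 40))  # 20 steps of 40-dim features
    l_rec = BatchNormRecurrentLayer(l_in, num_units=64,
                                    nonlinearity=lasagne.nonlinearities.tanh,
                                    grad_clipping=5.0, name='rnn0')
    return l_rec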
def build_correlation_fft(x, y, size):
    pnet = ll.InputLayer((None, 3, 101, 101), input_var=None)
    pnet = ll.BatchNormLayer(pnet)
    pnet = ll.Conv2DLayer(pnet, 64, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.NonlinearityLayer(
        ll.BatchNormLayer(pnet),
        nonlinearity=l.nonlinearities.LeakyRectify(0.1))
    pnet = ll.Pool2DLayer(pnet, (3, 3), stride=(2, 2))
    pnet = ll.Conv2DLayer(pnet, 64, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.NonlinearityLayer(
        ll.BatchNormLayer(pnet),
        nonlinearity=l.nonlinearities.LeakyRectify(0.1))
    pnet = ll.Conv2DLayer(pnet, 32, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.BatchNormLayer(pnet)

    x_p, y_p = ll.get_output(pnet, x), ll.get_output(pnet, y)
    x_p, y_p = fft.rfft(x_p, 'ortho'), fft.rfft(y_p, 'ortho')

    # Auto- and cross-power spectra, computed on the (real, imag) pairs that
    # rfft returns in the trailing axis.
    XX, XY = T.zeros_like(x_p), T.zeros_like(y_p)
    XX = T.set_subtensor(
        XX[:, :, :, :, 0],
        x_p[:, :, :, :, 0] * x_p[:, :, :, :, 0] +
        x_p[:, :, :, :, 1] * x_p[:, :, :, :, 1])
    XY = T.set_subtensor(
        XY[:, :, :, :, 0],
        x_p[:, :, :, :, 0] * y_p[:, :, :, :, 0] +
        x_p[:, :, :, :, 1] * y_p[:, :, :, :, 1])
    XY = T.set_subtensor(
        XY[:, :, :, :, 1],
        x_p[:, :, :, :, 0] * y_p[:, :, :, :, 1] -
        x_p[:, :, :, :, 1] * y_p[:, :, :, :, 0])
    xx = fft.irfft(XX, 'ortho')
    xy = fft.irfft(XY, 'ortho')
    z_p = T.concatenate((xx, xy), axis=1)
    z_p *= T.constant(hanningwindow(50))

    net = ll.InputLayer((None, 64, 50, 50), input_var=z_p)
    net = ll.BatchNormLayer(net)
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.BatchNormLayer(ll.Conv2DLayer(net, 10, (1, 1), nonlinearity=None))
    net = ll.DenseLayer(net, size**2, b=None, nonlinearity=None)
    net = ll.ReshapeLayer(net, ([0], 1, size, size))
    return pnet, net
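# --- Usage sketch (illustrative, not from the original source) ---
# The shared trunk (pnet) embeds both crops; the returned `net` maps the
# windowed FFT cross-correlation to a size x size response map.
# `_correlation_usage_example` and the tensor names are assumptions; requires
# `import theano` and the module-level `hanningwindow` used above.
def _correlation_usage_example(size=25):
    x = T.tensor4('template')  # assumed (batch, 3, 101, 101)
    y = T.tensor4('search')    # assumed (batch, 3, 101, 101)
    pnet, net = build_correlation_fft(x, y, size)
    response_fn = theano.function([x, y], ll.get_output(net))
    return response_fn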
def build_network(self, mspec_input_var):
    print('Building spec dnn with parameters:')
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(self.net_opts)

    mspec_network = layers.InputLayer((None, 130, MC_LENGTH), mspec_input_var)
    mspec_network = layers.BatchNormLayer(mspec_network)
    for n in self.net_opts['layer_list']:
        mspec_network = layers.DenseLayer(
            layers.dropout(mspec_network, p=self.net_opts['dropout_p']),
            n,
            nonlinearity=lasagne.nonlinearities.rectify)
    mspec_network = layers.DenseLayer(
        layers.dropout(mspec_network, p=self.net_opts['dropout_p']),
        self.net_opts['num_class'],
        nonlinearity=lasagne.nonlinearities.softmax)
    self.network = mspec_network
    return self.network
def build_transition_down(incoming, reduction, p=0.1,
                          W_init=lasagne.init.GlorotUniform(), b_init=None):
    """Builds a transition-down in the DenseNet model.

    Transitions consist of the sequence: batch normalization, 1x1
    convolution, and 2x2 pooling (max pooling in this implementation).
    The channels are compressed by specifying 0 < reduction <= 1, so that
    num_filters = num_input_channels * reduction.
    """
    num_filters = int(incoming.output_shape[1] * reduction)
    network = nn.BatchNormLayer(incoming)
    network = nn.NonlinearityLayer(network, lasagne.nonlinearities.rectify)
    network = nn.Conv2DLayer(network, num_filters, 1, W=W_init, b=b_init)
    if p > 0:
        network = nn.DropoutLayer(network, p=p)
    return nn.Pool2DLayer(network, 2, 2, mode='max')
def conv2d(incoming, n_filters, filter_size, stride, pool_size, nonlinearity,
           batch_norm, name, verbose, *args, **kwargs):
    """2D convolution block: convolution, optional batch norm, nonlinearity."""
    if stride is None:
        stride = (1, 1)
    layer = L.Conv2DLayer(incoming, num_filters=n_filters,
                          filter_size=filter_size, stride=stride, pad='same',
                          nonlinearity=None, name=name)
    if batch_norm:
        name += '.bn'
        layer = L.BatchNormLayer(layer, name=name)
    name += '.nonlin'
    layer = L.NonlinearityLayer(layer, nonlinearity=nonlinearity, name=name)
    return layer
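# --- Usage sketch (illustrative, not from the original source) ---
# `pool_size` is accepted (for a uniform block signature) but unused in the
# body. `_conv2d_usage_example` is a hypothetical name; assumes `rectify` is
# imported from lasagne.nonlinearities and `L` = lasagne.layers as above.
def _conv2d_usage_example():
    l = L.InputLayer((None, 3, 32, 32))
    l = conv2d(l, n_filters=32, filter_size=(3, 3), stride=None,
               pool_size=None, nonlinearity=rectify, batch_norm=True,
               name='conv1', verbose=False)
    return l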
def build_block(
        incoming,
        num_layers,
        num_filters,
        use_linear_skip=True,
        filter_size=3,
        p=0.1,
        W_init=lasagne.init.GlorotUniform(),
        b_init=None,
        nonlinearity=lasagne.nonlinearities.rectify,
):
    """Builds a block in the DenseNet model."""
    feature_maps = [incoming]
    for i in xrange(num_layers):
        if len(feature_maps) == 1:
            network = incoming
        else:
            network = nn.ConcatLayer(feature_maps, axis=1)
        network = nn.BatchNormLayer(network)
        network = nn.NonlinearityLayer(network, nonlinearity)
        network = nn.Conv2DLayer(network, num_filters, filter_size,
                                 pad='same', W=W_init, b=b_init)
        if p > 0:
            network = nn.DropoutLayer(network, p=p)
        feature_maps.append(network)

    # Whether to return all connections (vanilla DenseNet), or to return only
    # those feature maps created in the current block used in upscale path for
    # semantic segmentation (100 layer tiramisu)
    if use_linear_skip:
        return nn.ConcatLayer(feature_maps, axis=1)
    return nn.ConcatLayer(feature_maps[1:], axis=1)
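# --- Assembly sketch (illustrative, not from the original source) ---
# Shows how build_block and build_transition_down are typically chained into
# a DenseNet-style trunk. `_densenet_trunk_example` and the depths/widths are
# placeholder assumptions; `nn` is the lasagne.layers alias used above.
def _densenet_trunk_example(input_var=None):
    network = nn.InputLayer((None, 3, 32, 32), input_var)
    network = nn.Conv2DLayer(network, 16, 3, pad='same', nonlinearity=None)
    for _ in range(2):
        # Each block grows the channel count by num_layers * num_filters;
        # each transition halves it (reduction=0.5) and downsamples 2x.
        network = build_block(network, num_layers=4, num_filters=12)
        network = build_transition_down(network, reduction=0.5)
    network = build_block(network, num_layers=4, num_filters=12)
    return network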
def build_segmenter_simple_absurd_res():
    sys.setrecursionlimit(1500)
    inp = ll.InputLayer(shape=(None, 1, None, None), name='input')
    n_layers = 64  # should get a 128 x 128 receptive field
    layers = [inp]
    for i in range(n_layers):
        layers.append(
            ll.Conv2DLayer(layers[-1], num_filters=8, filter_size=(3, 3),
                           pad='same', W=Orthogonal(), nonlinearity=linear,
                           name='conv%d' % (i + 1)))
        layers.append(ll.BatchNormLayer(layers[-1], name='bn%i' % (i + 1)))
        # every 2 layers, add a skip connection
        if (i % 2 == 0) and (i != 0):
            layers.append(
                ll.ElemwiseSumLayer([
                    layers[-1],   # prev layer
                    layers[-6],   # 3 actual layers per block, skip the previous block
                ]))
        layers.append(ll.NonlinearityLayer(layers[-1], nonlinearity=rectify))

    # Our output layer is also convolutional; remember that our Y is going to
    # be the same exact size as the input.
    conv_final = ll.Conv2DLayer(layers[-1], num_filters=2, filter_size=(3, 3),
                                pad='same', W=Orthogonal(), name='conv_final',
                                nonlinearity=linear)
    # We need to reshape it to be a (batch*n*m x 3), i.e. unroll s.t. the
    # feature dimension is preserved.
    softmax = Softmax4D(conv_final, name='4dsoftmax')
    return [softmax]
# NOTE: the left-hand side of the first definition was truncated in the
# source; `maximum` below is an assumed (hypothetical) name for the
# element-wise max merge it evidently defines.
maximum = lambda incomings: layers.ElemwiseMergeLayer(
    incomings, merge_function=T.maximum)
concat = lambda axis=1: lambda incomings: \
    layers.ConcatLayer(incomings, axis=axis)
noise = lambda sigma=0.1: lambda incoming: \
    layers.GaussianNoiseLayer(incoming, sigma=sigma) \
    if sigma is not None and sigma > 0 else incoming
nothing = lambda incoming: incoming
dense = lambda num_units, f=None: lambda incoming: \
    layers.DenseLayer(
        incoming, num_units=num_units,
        nonlinearity=(nonlinearities.LeakyRectify(0.05) if f is None else f))
dropout = lambda p=0.1, rescale=True: lambda incoming: \
    layers.DropoutLayer(incoming, p=p, rescale=rescale) \
    if p is not None else incoming
batch_norm = lambda axes='auto': lambda incoming: \
    layers.BatchNormLayer(incoming, axes=axes)


class Select(object):
    def __getitem__(self, item):
        return lambda incomings: incomings[item]


select = Select()
take = select

nonlinearity = lambda f=None: lambda incoming: layers.NonlinearityLayer(
    incoming, (nonlinearities.LeakyRectify(0.05) if f is None else f))
elementwise = lambda f=T.add: lambda incomings: layers.ElemwiseMergeLayer(
    incomings, f)
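# --- Composition sketch (illustrative, not from the original source) ---
# These curried helpers let a network be written as a chain of unary calls.
# `_combinator_usage_example` and the widths/rates are placeholder
# assumptions; aliases as above (layers, nonlinearities).
def _combinator_usage_example():
    net = layers.InputLayer((None, 64))
    net = dense(128)(net)        # LeakyRectify(0.05) by default
    net = batch_norm()(net)
    net = dropout(0.2)(net)
    net = dense(10, f=nonlinearities.softmax)(net)
    return net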
def __init__(self, data_dir, word2vec, word_vector_size, dim, cnn_dim,
             story_len, patches, cnn_dim_fc, truncate_gradient, learning_rate,
             mode, answer_module, memory_hops, batch_size, l2,
             normalize_attention, batch_norm, dropout, **kwargs):
    print "==> not used params in DMN class:", kwargs.keys()
    self.data_dir = data_dir
    self.truncate_gradient = truncate_gradient
    self.learning_rate = learning_rate
    self.trng = RandomStreams(1234)
    self.word2vec = word2vec
    self.word_vector_size = word_vector_size
    self.dim = dim
    self.cnn_dim = cnn_dim
    self.cnn_dim_fc = cnn_dim_fc
    self.story_len = story_len
    self.patches = patches
    self.mode = mode
    self.answer_module = answer_module
    self.memory_hops = memory_hops
    self.batch_size = batch_size
    self.l2 = l2
    self.normalize_attention = normalize_attention
    self.batch_norm = batch_norm
    self.dropout = dropout
    self.vocab, self.ivocab = self._load_vocab(self.data_dir)

    self.train_story = None
    self.test_story = None
    self.train_dict_story, self.train_lmdb_env_fc, self.train_lmdb_env_conv = \
        self._process_input_sind_lmdb(self.data_dir, 'train')
    self.test_dict_story, self.test_lmdb_env_fc, self.test_lmdb_env_conv = \
        self._process_input_sind_lmdb(self.data_dir, 'val')
    self.train_story = self.train_dict_story.keys()
    self.test_story = self.test_dict_story.keys()
    self.vocab_size = len(self.vocab)

    # This is the local patch of each image.
    self.input_var = T.tensor4('input_var')  # (batch_size, seq_len, patches, cnn_dim)
    self.q_var = T.tensor3('q_var')  # batch x story_len x image_size
    self.answer_var = T.ivector('answer_var')  # answer of example in minibatch
    self.answer_mask = T.matrix('answer_mask')
    self.answer_idx = T.imatrix('answer_idx')  # batch x seq
    self.answer_inp_var = T.tensor3('answer_inp_var')  # answer of example in minibatch

    print "==> building input module"
    # It's very simple now: the input module just needs to map from cnn_dim to dim.
    logging.info('self.cnn_dim = %d', self.cnn_dim)
    logging.info('self.cnn_dim_fc = %d', self.cnn_dim_fc)
    logging.info('self.dim = %d', self.dim)
    self.W_q_emb_in = nn_utils.normal_param(std=0.1,
                                            shape=(self.dim, self.cnn_dim_fc))
    self.b_q_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    q_var_shuffled = self.q_var.dimshuffle(1, 2, 0)  # seq x cnn x batch
    def _dot(x, W, b):
        return T.tanh(T.dot(W, x) + b.dimshuffle(0, 'x'))

    q_var_shuffled_emb, _ = theano.scan(
        fn=_dot,
        sequences=q_var_shuffled,
        non_sequences=[self.W_q_emb_in, self.b_q_emb_in])
    #print 'q_var_shuffled_emb', q_var_shuffled_emb.shape.eval(
    #    {self.q_var: np.random.rand(2, 5, 4096).astype('float32')})
    q_var_emb = q_var_shuffled_emb.dimshuffle(2, 0, 1)  # batch x seq x emb_size
    q_var_emb_ext = q_var_emb.dimshuffle(0, 'x', 1, 2)
    q_var_emb_ext = T.repeat(q_var_emb_ext, q_var_emb.shape[1], 1)  # batch x seq x seq x emb_size
    q_var_emb_rhp = T.reshape(
        q_var_emb,
        (q_var_emb.shape[0] * q_var_emb.shape[1], q_var_emb.shape[2]))
    q_var_emb_ext_rhp = T.reshape(
        q_var_emb_ext,
        (q_var_emb_ext.shape[0] * q_var_emb_ext.shape[1],
         q_var_emb_ext.shape[2], q_var_emb_ext.shape[3]))
    q_var_emb_ext_rhp = q_var_emb_ext_rhp.dimshuffle(0, 2, 1)
    q_idx = T.arange(self.story_len).dimshuffle('x', 0)
    q_idx = T.repeat(q_idx, self.batch_size, axis=0)
    q_idx = T.reshape(q_idx, (q_idx.shape[0] * q_idx.shape[1], ))

    self.W_inp_emb_in = nn_utils.normal_param(std=0.1,
                                              shape=(self.dim, self.cnn_dim))
    self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    inp_rhp = T.reshape(
        self.input_var,
        (self.batch_size * self.story_len * self.patches, self.cnn_dim))
    inp_rhp_dimshuffled = inp_rhp.dimshuffle(1, 0)
    inp_rhp_emb = T.dot(self.W_inp_emb_in, inp_rhp_dimshuffled) + \
        self.b_inp_emb_in.dimshuffle(0, 'x')
    inp_rhp_emb_dimshuffled = inp_rhp_emb.dimshuffle(1, 0)
    inp_emb_raw = T.reshape(
        inp_rhp_emb_dimshuffled,
        (self.batch_size * self.story_len, self.patches, self.cnn_dim))
    inp_emb = T.tanh(inp_emb_raw)  # Just follow the DMN paper for visual and textual QA.
    self.inp_c = inp_emb.dimshuffle(1, 2, 0)

    logging.info('building question module')
    self.W_qf_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_qf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_qf_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_qf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_qf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_qf_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_qf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_qf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_qf_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    inp_dummy = theano.shared(np.zeros((self.dim, self.batch_size),
                                       dtype=floatX))
    q_var_shuffled_emb_reversed = q_var_shuffled_emb[::-1, :, :]  # seq x emb_size x batch
    q_glb, _ = theano.scan(fn=self.q_gru_step_forward,
                           sequences=q_var_shuffled_emb_reversed,
                           outputs_info=[T.zeros_like(inp_dummy)])
    q_glb_shuffled = q_glb.dimshuffle(2, 0, 1)  # batch_size x seq_len x dim
    q_glb_last = q_glb_shuffled[:, -1, :]  # batch_size x dim

    q_net = layers.InputLayer(shape=(self.batch_size * self.story_len,
                                     self.dim),
                              input_var=q_var_emb_rhp)
    if self.batch_norm:
        q_net = layers.BatchNormLayer(incoming=q_net)
    if self.dropout > 0 and self.mode == 'train':
        q_net = layers.DropoutLayer(q_net, p=self.dropout)
    self.q_q = layers.get_output(q_net).dimshuffle(1, 0)

    #print "==> creating parameters for memory module"
    logging.info('creating parameters for memory module')
    self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    self.W_mem_update1 = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim, self.dim * 2))
    self.b_mem_upd1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_mem_update2 = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim, self.dim * 2))
    self.b_mem_upd2 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_mem_update3 = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim, self.dim * 2))
    self.b_mem_upd3 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_mem_update = [self.W_mem_update1, self.W_mem_update2,
                         self.W_mem_update3]
    self.b_mem_update = [self.b_mem_upd1, self.b_mem_upd2, self.b_mem_upd3]

    self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0))
    self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
    self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

    logging.info('==> building episodic memory module (fixed number of steps: %d)',
                 self.memory_hops)
    memory = [self.q_q.copy()]
    for iter in range(1, self.memory_hops + 1):
        #m = printing.Print('mem')(memory[iter - 1])
        current_episode = self.new_episode(memory[iter - 1])
        # Replace GRU with ReLU activation + MLP.
        c = T.concatenate([memory[iter - 1], current_episode], axis=0)
        cur_mem = T.dot(self.W_mem_update[iter - 1], c) + \
            self.b_mem_update[iter - 1].dimshuffle(0, 'x')
        memory.append(T.nnet.relu(cur_mem))

    last_mem_raw = memory[-1].dimshuffle((1, 0))
    net = layers.InputLayer(shape=(self.batch_size * self.story_len,
                                   self.dim),
                            input_var=last_mem_raw)
    if self.batch_norm:
        net = layers.BatchNormLayer(incoming=net)
    if self.dropout > 0 and self.mode == 'train':
        net = layers.DropoutLayer(net, p=self.dropout)
    last_mem = layers.get_output(net).dimshuffle((1, 0))

    print "==> building answer module"
    answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0)
    # Now we need to map last_mem to a new space.
    self.W_mem_emb = nn_utils.normal_param(std=0.1,
                                           shape=(self.dim, self.dim * 2))
    self.b_mem_emb = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_inp_emb = nn_utils.normal_param(std=0.1,
                                           shape=(self.dim, self.vocab_size + 1))
    self.b_inp_emb = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    def _dot2(x, W, b):
        #return T.tanh(T.dot(W, x) + b.dimshuffle(0, 'x'))
        return T.dot(W, x) + b.dimshuffle(0, 'x')

    answer_inp_var_shuffled_emb, _ = theano.scan(
        fn=_dot2,
        sequences=answer_inp_var_shuffled,
        non_sequences=[self.W_inp_emb, self.b_inp_emb])  # seq x dim x batch

    init_ans = T.concatenate([self.q_q, last_mem],
                             axis=0)  # dim x (batch_size x story_len)
    mem_ans = T.dot(self.W_mem_emb, init_ans) + \
        self.b_mem_emb.dimshuffle(0, 'x')  # dim x (batch_size x story_len)
    #mem_ans_dim = mem_ans.dimshuffle('x', 0, 1)
    mem_ans_rhp = T.reshape(mem_ans.dimshuffle(1, 0),
                            (self.batch_size, self.story_len,
                             mem_ans.shape[0]))
    mem_ans_dim = mem_ans_rhp.dimshuffle(1, 2, 0)
    answer_inp = answer_inp_var_shuffled_emb
    #answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb],
    #                           axis=0)  # seq+1 x dim x (batch_size x story_len)

    # Now each answer has its input; the next step is to obtain the sequences.
    answer_inp_shu = answer_inp.dimshuffle(2, 0, 1)
    answer_inp_shu_rhp = T.reshape(answer_inp_shu,
                                   (self.batch_size, self.story_len,
                                    answer_inp_shu.shape[1],
                                    answer_inp_shu.shape[2]))
    answer_inp = answer_inp_shu_rhp.dimshuffle(1, 2, 3, 0)  # story_len x seq+1 x dim x batch_size

    self.W_a = nn_utils.normal_param(std=0.1,
                                     shape=(self.vocab_size + 1, self.dim))
    self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_ans_map = nn_utils.normal_param(std=0.1,
                                           shape=(self.dim, self.dim * 2))
    self.b_ans_map = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    results = None
    r = None
    dummy = theano.shared(np.zeros((self.dim, self.batch_size), dtype=floatX))
    for i in range(self.story_len):
        answer_inp_i = answer_inp[i, :]  # seq+1 x dim x batch_size
        mem_ans_dim_i = mem_ans_dim[i, :]  # dim x batch_size
        if i == 0:
            q_glb_inp = q_glb_last.dimshuffle('x', 1, 0)  # 1 x dim x batch_size
            answer_inp_i = T.concatenate([q_glb_inp, answer_inp_i], axis=0)
            init_h = T.concatenate([dummy, mem_ans_dim_i], axis=0)
            init_h = T.dot(self.W_ans_map, init_h) + \
                self.b_ans_map.dimshuffle(0, 'x')
            init_h = T.tanh(init_h)
            r, _ = theano.scan(fn=self.answer_gru_step,
                               sequences=answer_inp_i,
                               truncate_gradient=self.truncate_gradient,
                               outputs_info=[init_h])
            r = r[1:, :]  # get rid of the first global one
            results = r.dimshuffle('x', 0, 1, 2)
        else:
            prev_h = r[self.answer_idx[:, i], :, T.arange(self.batch_size)]
            h_ = T.concatenate([prev_h.dimshuffle(1, 0), mem_ans_dim_i],
                               axis=0)
            h_ = T.dot(self.W_ans_map, h_) + self.b_ans_map.dimshuffle(0, 'x')
            h_ = T.tanh(h_)
            r, _ = theano.scan(fn=self.answer_gru_step,
                               sequences=answer_inp_i,
                               truncate_gradient=self.truncate_gradient,
                               outputs_info=[h_])
            results = T.concatenate([results, r.dimshuffle('x', 0, 1, 2)])

    # results: story_len x seq+1 x dim x batch_size
    results = results.dimshuffle(3, 0, 1, 2)
    results = T.reshape(results, (self.batch_size * self.story_len,
                                  results.shape[2], results.shape[3]))
    results = results.dimshuffle(1, 2, 0)  # seq_len x dim x (batch x seq)

    # Assume there is a start token.
    #print 'results', results.shape.eval({
    #    self.input_var: np.random.rand(2, 5, 196, 512).astype('float32'),
    #    self.q_var: np.random.rand(2, 5, 4096).astype('float32'),
    #    self.answer_idx: np.asarray([[1, 1, 1, 1, 1],
    #                                 [2, 2, 2, 2, 2]]).astype('int32'),
    #    self.answer_inp_var: np.random.rand(5, 18, 8001).astype('float32')})
    #results = results[1:-1, :, :]  # drop the last token as well as the first one (image)
    #print results.shape.eval({
    #    self.input_var: np.random.rand(3, 4, 4096).astype('float32'),
    #    self.q_var: np.random.rand(3, 4096).astype('float32'),
    #    self.answer_inp_var: np.random.rand(3, 18, 8001).astype('float32')},
    #    on_unused_input='ignore')

    # Now, we need to transform it to the probabilities.
    prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x),
                          sequences=results,
                          non_sequences=self.W_a)
    #print 'prob', prob.shape.eval({
    #    self.input_var: np.random.rand(2, 5, 196, 512).astype('float32'),
    #    self.q_var: np.random.rand(2, 5, 4096).astype('float32'),
    #    self.answer_idx: np.asarray([[1, 1, 1, 1, 1],
    #                                 [2, 2, 2, 2, 2]]).astype('int32'),
    #    self.answer_inp_var: np.random.rand(5, 18, 8001).astype('float32')})
    #preds = prob[1:, :, :]
    #prob = prob[1:-1, :, :]
    preds = prob
    prob = prob[:-1, :, :]
    prob_shuffled = prob.dimshuffle(2, 0, 1)  # b * len * vocab
    preds_shuffled = preds.dimshuffle(2, 0, 1)

    logging.info("prob shape.")
    n = prob_shuffled.shape[0] * prob_shuffled.shape[1]
    n_preds = preds_shuffled.shape[0] * preds_shuffled.shape[1]
    prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2]))
    preds_rhp = T.reshape(preds_shuffled, (n_preds, preds_shuffled.shape[2]))
    prob_sm = nn_utils.softmax_(prob_rhp)
    preds_sm = nn_utils.softmax_(preds_rhp)
    self.prediction = prob_sm  # this one is for the training
    #print 'prob_sm', prob_sm.shape.eval({prob: np.random.rand(19, 8897, 3).astype('float32')})
    #print 'lbl', loss_vec.shape.eval({prob: np.random.rand(19, 8897, 3).astype('float32')})

    # This one is for the beam search.
    self.pred = T.reshape(preds_sm,
                          (preds_shuffled.shape[0], preds_shuffled.shape[1],
                           preds_shuffled.shape[2]))

    mask = T.reshape(self.answer_mask, (n, ))
    lbl = T.reshape(self.answer_var, (n, ))

    self.params = [
        self.W_inp_emb_in, self.b_inp_emb_in,
        self.W_q_emb_in, self.b_q_emb_in,
        #self.W_glb_att_1, self.W_glb_att_2, self.b_glb_att_1, self.b_glb_att_2,
        self.W_qf_res_in, self.W_qf_res_hid, self.b_qf_res,
        self.W_qf_upd_in, self.W_qf_upd_hid, self.b_qf_upd,
        self.W_qf_hid_in, self.W_qf_hid_hid, self.b_qf_hid,
        self.W_mem_emb, self.W_inp_emb, self.b_mem_emb, self.b_inp_emb,
        self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
        self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
        self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid,
        #self.W_b
        #self.W_mem_emb, self.W_inp_emb, self.b_mem_emb, self.b_inp_emb,
        self.W_1, self.W_2, self.b_1, self.b_2,
        self.W_a,
        self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
        self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
        self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid,
        self.W_ans_map, self.b_ans_map,
    ]
    self.params += self.W_mem_update
    self.params += self.b_mem_update

    print "==> building loss layer and computing updates"
    reward_prob = prob_sm[T.arange(n), lbl]
    reward_prob = T.reshape(reward_prob,
                            (prob_shuffled.shape[0], prob_shuffled.shape[1]))
    #reward_prob = printing.Print('mean_r')(reward_prob)
    loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl)
    #loss_vec = T.nnet.categorical_crossentropy(prob_sm, T.flatten(self.answer_var))
    self.loss_ce = (mask * loss_vec).sum() / mask.sum()
    # Debug: evaluate the loss on random data to sanity-check shapes.
    print 'loss_ce', self.loss_ce.eval({
        prob_sm: np.random.rand(39, 8900).astype('float32'),
        lbl: np.random.rand(39, ).astype('int32'),
        mask: np.random.rand(39, ).astype('float32')
    })

    if self.l2 > 0:
        self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
    else:
        self.loss_l2 = 0
    self.loss = self.loss_ce + self.loss_l2

    grads = T.grad(self.loss, wrt=self.params, disconnected_inputs='raise')
    updates = lasagne.updates.adadelta(grads, self.params,
                                       learning_rate=self.learning_rate)

    if self.mode == 'train':
        logging.info("compiling train_fn")
        self.train_fn = theano.function(
            inputs=[self.input_var, self.q_var, self.answer_var,
                    self.answer_mask, self.answer_inp_var, self.answer_idx],
            outputs=[self.prediction, self.loss],
            updates=updates)

    logging.info("compiling test_fn")
    self.test_fn = theano.function(
        inputs=[self.input_var, self.q_var, self.answer_var,
                self.answer_mask, self.answer_inp_var, self.answer_idx],
        outputs=[self.prediction, self.loss])

    logging.info("compiling pred_fn")
    self.pred_fn = theano.function(
        inputs=[self.input_var, self.q_var, self.answer_inp_var,
                self.answer_idx],
        outputs=[self.pred])
def __init__(self, image_shape, filter_shape, num_class, conv_type,
             kernel_size, kernel_pool_size, dropout_rate):
    """Stacked convolutional classifier with a selectable convolution type."""
    self.filter_shape = filter_shape
    self.n_visible = numpy.prod(image_shape)
    self.n_layers = len(filter_shape)
    self.rng = RandomStreams(123)
    self.x = T.matrix()
    self.y = T.ivector()
    self.conv_layers = []

    NoiseLayer = layers.DropoutLayer
    dropout_rate = float(dropout_rate)

    self.l_input = layers.InputLayer((None, self.n_visible), self.x)
    this_layer = layers.ReshapeLayer(self.l_input, ([0], ) + image_shape)

    for l in range(self.n_layers):
        activation = lasagne.nonlinearities.rectify
        if len(filter_shape[l]) == 3:
            if conv_type == 'double' and filter_shape[l][1] > kernel_size:
                this_layer = DoubleConvLayer(
                    this_layer, filter_shape[l][0], filter_shape[l][1:],
                    pad='same', nonlinearity=activation,
                    kernel_size=kernel_size,
                    kernel_pool_size=kernel_pool_size)
                this_layer = layers.batch_norm(this_layer)
            elif conv_type == 'maxout':
                this_layer = layers.Conv2DLayer(this_layer,
                                                filter_shape[l][0],
                                                filter_shape[l][1:],
                                                b=None, pad='same',
                                                nonlinearity=None)
                this_layer = layers.FeaturePoolLayer(
                    this_layer, pool_size=kernel_pool_size**2)
                this_layer = layers.BatchNormLayer(this_layer)
                this_layer = layers.NonlinearityLayer(this_layer, activation)
            elif conv_type == 'cyclic':
                this_layers = []
                this_layers.append(
                    layers.Conv2DLayer(this_layer, filter_shape[l][0],
                                       filter_shape[l][1:], b=None,
                                       pad='same', nonlinearity=None))
                for _ in range(3):
                    # Rotate the previous kernels by 90 degrees.
                    W = this_layers[-1].W.dimshuffle(0, 1, 3, 2)[:, :, :, ::-1]
                    this_layers.append(
                        layers.Conv2DLayer(this_layer, filter_shape[l][0],
                                           filter_shape[l][1:], W=W, b=None,
                                           pad='same', nonlinearity=None))
                this_layer = layers.ElemwiseMergeLayer(this_layers, T.maximum)
                this_layer = layers.BatchNormLayer(this_layer)
                this_layer = layers.NonlinearityLayer(this_layer, activation)
            elif conv_type == 'standard' \
                    or (conv_type == 'double'
                        and filter_shape[l][1] <= kernel_size):
                this_layer = layers.Conv2DLayer(this_layer,
                                                filter_shape[l][0],
                                                filter_shape[l][1:],
                                                pad='same',
                                                nonlinearity=activation)
                this_layer = layers.batch_norm(this_layer)
            else:
                raise NotImplementedError
            self.conv_layers.append(this_layer)
        elif len(filter_shape[l]) == 2:
            this_layer = layers.MaxPool2DLayer(this_layer, filter_shape[l])
            this_layer = NoiseLayer(this_layer, dropout_rate)
        elif len(filter_shape[l]) == 1:
            raise NotImplementedError

    self.top_conv_layer = this_layer
    this_layer = layers.GlobalPoolLayer(this_layer, T.mean)
    self.clf_layer = layers.DenseLayer(this_layer, num_class,
                                       W=lasagne.init.Constant(0.),
                                       nonlinearity=T.nnet.softmax)

    self.params = layers.get_all_params(self.clf_layer, trainable=True)
    self.params_all = layers.get_all_params(self.clf_layer)
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size,
             dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
    print "==> not used params in DMN class:", kwargs.keys()
    self.train_list_raw = train_list_raw
    self.test_list_raw = test_list_raw
    self.png_folder = png_folder
    self.batch_size = batch_size
    self.dropout = dropout
    self.l2 = l2
    self.mode = mode
    self.batch_norm = batch_norm
    self.num_units = rnn_num_units

    self.input_var = T.tensor4('input_var')
    self.answer_var = T.ivector('answer_var')

    print "==> building network"
    example = np.random.uniform(size=(self.batch_size, 1, 128, 858),
                                low=0.0, high=1.0).astype(np.float32)  #########
    answer = np.random.randint(low=0, high=176, size=(self.batch_size, ))  #########

    network = layers.InputLayer(shape=(None, 1, 128, 858),
                                input_var=self.input_var)
    print layers.get_output(network).eval({self.input_var: example}).shape

    # CONV-RELU-POOL 1
    network = layers.Conv2DLayer(incoming=network, num_filters=16,
                                 filter_size=(7, 7), stride=1,
                                 nonlinearity=rectify)
    print layers.get_output(network).eval({self.input_var: example}).shape
    network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3),
                                    stride=(2, 1), pad=2)
    print layers.get_output(network).eval({self.input_var: example}).shape
    if (self.batch_norm):
        network = layers.BatchNormLayer(incoming=network)

    # CONV-RELU-POOL 2
    network = layers.Conv2DLayer(incoming=network, num_filters=32,
                                 filter_size=(5, 5), stride=1,
                                 nonlinearity=rectify)
    print layers.get_output(network).eval({self.input_var: example}).shape
    network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3),
                                    stride=(2, 1), pad=2)
    print layers.get_output(network).eval({self.input_var: example}).shape
    if (self.batch_norm):
        network = layers.BatchNormLayer(incoming=network)

    # CONV-RELU-POOL 3
    network = layers.Conv2DLayer(incoming=network, num_filters=32,
                                 filter_size=(3, 3), stride=1,
                                 nonlinearity=rectify)
    print layers.get_output(network).eval({self.input_var: example}).shape
    network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3),
                                    stride=(2, 1), pad=2)
    print layers.get_output(network).eval({self.input_var: example}).shape
    if (self.batch_norm):
        network = layers.BatchNormLayer(incoming=network)

    # CONV-RELU-POOL 4
    network = layers.Conv2DLayer(incoming=network, num_filters=32,
                                 filter_size=(3, 3), stride=1,
                                 nonlinearity=rectify)
    print layers.get_output(network).eval({self.input_var: example}).shape
    network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3),
                                    stride=(2, 1), pad=2)
    print layers.get_output(network).eval({self.input_var: example}).shape
    if (self.batch_norm):
        network = layers.BatchNormLayer(incoming=network)

    self.params = layers.get_all_params(network, trainable=True)

    output = layers.get_output(network)
    output = output.transpose((0, 3, 1, 2))
    output = output.flatten(ndim=3)

    # NOTE: these constants are the shapes of the last pool layer; they could
    # be kept symbolic, but explicit values are better for optimizations.
    num_channels = 32
    filter_W = 852
    filter_H = 8

    # InputLayer
    network = layers.InputLayer(shape=(None, filter_W,
                                       num_channels * filter_H),
                                input_var=output)
    print layers.get_output(network).eval({self.input_var: example}).shape

    # GRULayer
    network = layers.GRULayer(incoming=network, num_units=self.num_units,
                              only_return_final=True)
    print layers.get_output(network).eval({self.input_var: example}).shape
    if (self.batch_norm):
        network = layers.BatchNormLayer(incoming=network)
    if (self.dropout > 0):
        network = layers.dropout(network, self.dropout)

    # Last layer: classification
    network = layers.DenseLayer(incoming=network, num_units=176,
                                nonlinearity=softmax)
    print layers.get_output(network).eval({self.input_var: example}).shape

    self.params += layers.get_all_params(network, trainable=True)
    self.prediction = layers.get_output(network)
    #print "==> param shapes", [x.eval().shape for x in self.params]

    self.loss_ce = lasagne.objectives.categorical_crossentropy(
        self.prediction, self.answer_var).mean()
    if (self.l2 > 0):
        self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(
            self.params, lasagne.regularization.l2)
    else:
        self.loss_l2 = 0
    self.loss = self.loss_ce + self.loss_l2

    #updates = lasagne.updates.adadelta(self.loss, self.params)
    #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)  # good one
    updates = lasagne.updates.momentum(self.loss, self.params,
                                       learning_rate=0.001)

    if self.mode == 'train':
        print "==> compiling train_fn"
        self.train_fn = theano.function(
            inputs=[self.input_var, self.answer_var],
            outputs=[self.prediction, self.loss],
            updates=updates)

    print "==> compiling test_fn"
    self.test_fn = theano.function(
        inputs=[self.input_var, self.answer_var],
        outputs=[self.prediction, self.loss])
def __init__(self, data_dir, word2vec, word_vector_size, dim, cnn_dim,
             story_len, mode, answer_module, memory_hops, batch_size, l2,
             normalize_attention, batch_norm, dropout, **kwargs):
    print "==> not used params in DMN class:", kwargs.keys()
    self.data_dir = data_dir
    self.word2vec = word2vec
    self.word_vector_size = word_vector_size
    self.dim = dim
    self.cnn_dim = cnn_dim
    self.story_len = story_len
    self.mode = mode
    self.answer_module = answer_module
    self.memory_hops = memory_hops
    self.batch_size = batch_size
    self.l2 = l2
    self.normalize_attention = normalize_attention
    self.batch_norm = batch_norm
    self.dropout = dropout
    self.vocab, self.ivocab = self._load_vocab(self.data_dir)

    self.train_story = None
    self.test_story = None
    self.train_dict_story, self.train_lmdb_env_fc = \
        self._process_input_sind_lmdb(self.data_dir, 'train')
    self.val_dict_story, self.val_lmdb_env_fc = \
        self._process_input_sind_lmdb(self.data_dir, 'val')
    self.test_dict_story, self.test_lmdb_env_fc = \
        self._process_input_sind_lmdb(self.data_dir, 'test')
    self.train_story = self.train_dict_story.keys()
    self.val_story = self.val_dict_story.keys()
    self.test_story = self.test_dict_story.keys()
    self.vocab_size = len(self.vocab)

    self.q_var = T.tensor3('q_var')  # batch x story_len x image_size
    self.answer_var = T.imatrix('answer_var')  # answer of example in minibatch
    self.answer_mask = T.matrix('answer_mask')
    self.answer_inp_var = T.tensor3('answer_inp_var')  # answer of example in minibatch

    print "==> building input module"
    # It's very simple now: the input module just needs to map from cnn_dim to dim.
    logging.info('self.cnn_dim = %d', self.cnn_dim)
    self.W_inp_emb_in = nn_utils.normal_param(std=0.1,
                                              shape=(self.dim, self.cnn_dim))
    self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    q_seq = self.q_var.dimshuffle(0, 'x', 1, 2)
    q_seq_rpt = T.repeat(q_seq, self.story_len, 1)
    q_seq_rhp = T.reshape(q_seq_rpt,
                          (q_seq_rpt.shape[0] * q_seq_rpt.shape[1],
                           q_seq_rpt.shape[2], q_seq_rpt.shape[3]))
    inp_var_shuffled = q_seq_rhp.dimshuffle(1, 2, 0)  # seq x cnn x batch

    def _dot(x, W, b):
        return T.dot(W, x) + b.dimshuffle(0, 'x')

    inp_c_hist, _ = theano.scan(
        fn=_dot,
        sequences=inp_var_shuffled,
        non_sequences=[self.W_inp_emb_in, self.b_inp_emb_in])
    #inp_c_hist, _ = theano.scan(fn=_dot, sequences=self.input_var,
    #                            non_sequences=[self.W_inp_emb_in, self.b_inp_emb_in])
    self.inp_c = inp_c_hist  # seq x emb x batch

    print "==> building question module"
    # Now, share the parameters with the input module.
    q_var_shuffled = self.q_var.dimshuffle(1, 2, 0)  # story_len x image_size x batch_size

    # This is the RNN used to produce the global glimpse.
    self.W_inpf_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim))
    self.W_inpf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inpf_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_inpf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim))
    self.W_inpf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inpf_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_inpf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim))
    self.W_inpf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inpf_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    inp_dummy = theano.shared(np.zeros((self.dim, self.batch_size),
                                       dtype=floatX))
    q_glb, _ = theano.scan(fn=self.input_gru_step_forward,
                           sequences=q_var_shuffled,
                           outputs_info=[T.zeros_like(inp_dummy)])
    q_glb_shuffled = q_glb.dimshuffle(2, 0, 1)  # batch_size x seq_len x dim
    q_glb_last = q_glb_shuffled[:, -1, :]  # batch_size x dim

    # Now we also need to build the individual model.
    #q_var_shuffled = self.q_var.dimshuffle(1, 0)
    q_single = T.reshape(self.q_var,
                         (self.q_var.shape[0] * self.q_var.shape[1],
                          self.q_var.shape[2]))
    q_single_shuffled = q_single.dimshuffle(1, 0)  # cnn_dim x (batch_size * 5)
    q_hist = T.dot(self.W_inp_emb_in, q_single_shuffled) + \
        self.b_inp_emb_in.dimshuffle(0, 'x')
    q_hist_shuffled = q_hist.dimshuffle(1, 0)  # (batch_size * 5) x dim

    if self.batch_norm:
        logging.info("Using batch normalization.")
    q_net = layers.InputLayer(shape=(self.batch_size * self.story_len,
                                     self.dim),
                              input_var=q_hist_shuffled)
    if self.batch_norm:
        q_net = layers.BatchNormLayer(incoming=q_net)
    if self.dropout > 0 and self.mode == 'train':
        q_net = layers.DropoutLayer(q_net, p=self.dropout)
    #last_mem = layers.get_output(q_net).dimshuffle((1, 0))
    self.q_q = layers.get_output(q_net).dimshuffle(1, 0)

    print "==> creating parameters for memory module"
    self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0))
    self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
    self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

    print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
    memory = [self.q_q.copy()]
    for iter in range(1, self.memory_hops + 1):
        #m = printing.Print('mem')(memory[iter - 1])
        current_episode = self.new_episode(memory[iter - 1])
        #current_episode = self.new_episode(m)
        #current_episode = printing.Print('current_episode')(current_episode)
        memory.append(
            self.GRU_update(memory[iter - 1], current_episode,
                            self.W_mem_res_in, self.W_mem_res_hid,
                            self.b_mem_res, self.W_mem_upd_in,
                            self.W_mem_upd_hid, self.b_mem_upd,
                            self.W_mem_hid_in, self.W_mem_hid_hid,
                            self.b_mem_hid))

    last_mem_raw = memory[-1].dimshuffle((1, 0))
    net = layers.InputLayer(shape=(self.batch_size * self.story_len,
                                   self.dim),
                            input_var=last_mem_raw)
    if self.batch_norm:
        net = layers.BatchNormLayer(incoming=net)
    if self.dropout > 0 and self.mode == 'train':
        net = layers.DropoutLayer(net, p=self.dropout)
    last_mem = layers.get_output(net).dimshuffle((1, 0))

    print "==> building answer module"
    answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0)
    # Now we need to map last_mem to a new space.
    self.W_mem_emb = nn_utils.normal_param(std=0.1,
                                           shape=(self.dim, self.dim * 3))
    self.W_inp_emb = nn_utils.normal_param(std=0.1,
                                           shape=(self.dim, self.vocab_size + 1))

    def _dot2(x, W):
        return T.dot(W, x)

    answer_inp_var_shuffled_emb, _ = theano.scan(
        fn=_dot2,
        sequences=answer_inp_var_shuffled,
        non_sequences=self.W_inp_emb)  # seq x dim x batch

    q_glb_dim = q_glb_last.dimshuffle(0, 'x', 1)  # batch_size x 1 x dim
    q_glb_repmat = T.repeat(q_glb_dim, self.story_len, 1)  # batch_size x len x dim
    q_glb_rhp = T.reshape(q_glb_repmat,
                          (q_glb_repmat.shape[0] * q_glb_repmat.shape[1],
                           q_glb_repmat.shape[2]))
    init_ans = T.concatenate([self.q_q, last_mem, q_glb_rhp.dimshuffle(1, 0)],
                             axis=0)
    mem_ans = T.dot(self.W_mem_emb, init_ans)  # dim x batch_size
    mem_ans_dim = mem_ans.dimshuffle('x', 0, 1)
    answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb],
                               axis=0)

    dummy = theano.shared(np.zeros((self.dim,
                                    self.batch_size * self.story_len),
                                   dtype=floatX))
    self.W_a = nn_utils.normal_param(std=0.1,
                                     shape=(self.vocab_size + 1, self.dim))
    self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
    self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    results, _ = theano.scan(fn=self.answer_gru_step,
                             sequences=answer_inp,
                             outputs_info=[dummy])
    # Assume there is a start token.
    #print results.shape.eval({
    #    self.input_var: np.random.rand(10, 4, 4096).astype('float32'),
    #    self.q_var: np.random.rand(10, 4096).astype('float32'),
    #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')},
    #    on_unused_input='ignore')
    #results = results[1:-1, :, :]  # drop the last token as well as the first one (image)

    # Now, we need to transform it to the probabilities.
    prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x),
                          sequences=results,
                          non_sequences=self.W_a)
    preds = prob[1:, :, :]
    prob = prob[1:-1, :, :]
    prob_shuffled = prob.dimshuffle(2, 0, 1)  # b * len * vocab
    preds_shuffled = preds.dimshuffle(2, 0, 1)

    logging.info("prob shape.")
    #print prob.shape.eval({
    #    self.input_var: np.random.rand(10, 4, 4096).astype('float32'),
    #    self.q_var: np.random.rand(10, 4096).astype('float32'),
    #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')})
    n = prob_shuffled.shape[0] * prob_shuffled.shape[1]
    n_preds = preds_shuffled.shape[0] * preds_shuffled.shape[1]
    prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2]))
    preds_rhp = T.reshape(preds_shuffled, (n_preds, preds_shuffled.shape[2]))
    prob_sm = nn_utils.softmax_(prob_rhp)
    preds_sm = nn_utils.softmax_(preds_rhp)
    self.prediction = prob_sm  # this one is for the training

    # This one is for the beam search.
    self.pred = T.reshape(preds_sm,
                          (preds_shuffled.shape[0], preds_shuffled.shape[1],
                           preds_shuffled.shape[2]))

    mask = T.reshape(self.answer_mask, (n, ))
    lbl = T.reshape(self.answer_var, (n, ))

    self.params = [
        self.W_inp_emb_in, self.b_inp_emb_in,
        self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
        self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
        self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid,
        #self.W_b
        self.W_1, self.W_2, self.b_1, self.b_2, self.W_a
    ]
    self.params = self.params + [
        self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
        self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
        self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid,
        self.W_mem_emb, self.W_inp_emb
    ]

    print "==> building loss layer and computing updates"
    loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl)
    self.loss_ce = (mask * loss_vec).sum() / mask.sum()
    #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl)
    if self.l2 > 0:
        self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
    else:
        self.loss_l2 = 0
    self.loss = self.loss_ce + self.loss_l2

    updates = lasagne.updates.adadelta(self.loss, self.params)
    #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001)

    if self.mode == 'train':
        print "==> compiling train_fn"
        self.train_fn = theano.function(
            inputs=[self.q_var, self.answer_var, self.answer_mask,
                    self.answer_inp_var],
            outputs=[self.prediction, self.loss],
            updates=updates)

    print "==> compiling test_fn"
    self.test_fn = theano.function(
        inputs=[self.q_var, self.answer_var, self.answer_mask,
                self.answer_inp_var],
        outputs=[self.prediction, self.loss])

    print "==> compiling pred_fn"
    self.pred_fn = theano.function(
        inputs=[self.q_var, self.answer_inp_var],
        outputs=[self.pred])
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.type = "batch" self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.train_input, self.train_q, self.train_answer, self.train_fact_count, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_fact_count, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.tensor3( 'input_var') # (batch_size, seq_len, glove_dim) self.q_var = T.tensor3('question_var') # as self.input_var self.answer_var = T.ivector( 'answer_var') # answer of example in minibatch self.fact_count_var = T.ivector( 'fact_count_var') # number of facts in the example of minibatch self.input_mask_var = T.imatrix( 'input_mask_var') # (batch_size, indices) print "==> building input module" self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) input_var_shuffled = self.input_var.dimshuffle(1, 2, 0) inp_dummy = theano.shared( np.zeros((self.dim, self.batch_size), dtype=floatX)) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=input_var_shuffled, outputs_info=T.zeros_like(inp_dummy)) inp_c_history_shuffled = inp_c_history.dimshuffle(2, 0, 1) inp_c_list = [] inp_c_mask_list = [] for batch_index in range(self.batch_size): taken = inp_c_history_shuffled[batch_index].take( self.input_mask_var[ batch_index, :self.fact_count_var[batch_index]], axis=0) inp_c_list.append( T.concatenate([ taken, T.zeros((self.input_mask_var.shape[1] - taken.shape[0], self.dim), floatX) ])) inp_c_mask_list.append( T.concatenate([ T.ones((taken.shape[0], ), np.int32), T.zeros((self.input_mask_var.shape[1] - taken.shape[0], ), np.int32) ])) self.inp_c = T.stack(inp_c_list).dimshuffle(1, 2, 0) inp_c_mask = T.stack(inp_c_mask_list).dimshuffle(1, 0) q_var_shuffled = self.q_var.dimshuffle(1, 2, 0) q_dummy = theano.shared( np.zeros((self.dim, self.batch_size), dtype=floatX)) q_q_history, _ = theano.scan(fn=self.input_gru_step, sequences=q_var_shuffled, outputs_info=T.zeros_like(q_dummy)) self.q_q = q_q_history[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, 
shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle((1, 0)) net = layers.InputLayer(shape=(self.batch_size, self.dim), input_var=last_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # TODO: add conditional ending dummy = theano.shared( np.zeros((self.vocab_size, self.batch_size), dtype=floatX)) results, updates = theano.scan( fn=self.answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], #(last_mem, y) n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") self.prediction = self.prediction.dimshuffle(1, 0) self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, 
self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy(self.prediction, self.answer_var).mean() if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.fact_count_var, self.input_mask_var ], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.fact_count_var, self.input_mask_var ], outputs=[self.prediction, self.loss])
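# --- Usage sketch (not part of the original source): a hypothetical driver for
# the train_fn/test_fn compiled above. `dmn` stands for an instance of this
# class; the sequence lengths below are illustrative, and the first dimension
# must equal the batch size the graph was compiled with.
import numpy as np

bs = dmn.batch_size                    # inputs must match the compiled batch size
glove_dim = dmn.word_vector_size
seq_len, q_len, max_facts = 20, 5, 7   # illustrative values
inp = np.random.rand(bs, seq_len, glove_dim).astype('float32')
q = np.random.rand(bs, q_len, glove_dim).astype('float32')
ans = np.random.randint(0, dmn.vocab_size, size=bs).astype('int32')
fact_count = np.full(bs, max_facts, dtype='int32')
input_mask = np.tile(np.arange(max_facts, dtype='int32'), (bs, 1))
prediction, loss = dmn.train_fn(inp, q, ans, fact_count, input_mask)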
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, l2, mode, rnn_num_units, batch_norm, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.train_list_raw = train_list_raw self.test_list_raw = test_list_raw self.png_folder = png_folder self.batch_size = batch_size self.l2 = l2 self.mode = mode self.num_units = rnn_num_units self.batch_norm = batch_norm self.input_var = T.tensor3('input_var') self.answer_var = T.ivector('answer_var') # scale inputs to be in [-1, 1] input_var_norm = 2 * self.input_var - 1 print "==> building network" example = np.random.uniform(size=(self.batch_size, 858, 256), low=0.0, high=1.0).astype(np.float32) ######### answer = np.random.randint(low=0, high=176, size=(self.batch_size, )) ######### # InputLayer network = layers.InputLayer(shape=(None, 858, 256), input_var=input_var_norm) print layers.get_output(network).eval({self.input_var: example}).shape # GRULayer network = layers.GRULayer(incoming=network, num_units=self.num_units) print layers.get_output(network).eval({self.input_var: example}).shape # BatchNormalization Layer if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) print layers.get_output(network).eval({ self.input_var: example }).shape # GRULayer network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True) print layers.get_output(network).eval({self.input_var: example}).shape # Last layer: classification network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax) print layers.get_output(network).eval({self.input_var: example}).shape self.params = layers.get_all_params(network, trainable=True) self.prediction = layers.get_output(network) self.loss_ce = lasagne.objectives.categorical_crossentropy( self.prediction, self.answer_var).mean() if (self.l2 > 0): self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params( network, lasagne.regularization.l2) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss])
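# --- Usage sketch (assumption, not from the source): a minimal minibatch loop
# for the GRU classifier above. `model` is an instance of this class, and
# X_train / Y_train are assumed numpy arrays of shape (N, 858, 256) float32 and
# (N,) int32 respectively.
for epoch in range(10):
    for start in range(0, len(X_train), model.batch_size):
        xb = X_train[start:start + model.batch_size]
        yb = Y_train[start:start + model.batch_size]
        prediction, loss = model.train_fn(xb, yb)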
def build_kpextractor128(): inp = ll.InputLayer(shape=(None, 1, 128, 128), name='input') # alternate pooling and conv layers to minimize parameters filter_pad = lambda x, y: (x // 2, y // 2) filter3 = (3, 3) same_pad3 = filter_pad(*filter3) conv1 = ll.Conv2DLayer(inp, num_filters=16, filter_size=filter3, pad=same_pad3, W=Orthogonal(), nonlinearity=rectify, name='conv1') mp1 = ll.MaxPool2DLayer(conv1, 2, stride=2) # now down to 64 x 64 bn1 = ll.BatchNormLayer(mp1) conv2 = ll.Conv2DLayer(bn1, num_filters=32, filter_size=filter3, pad=same_pad3, W=Orthogonal(), nonlinearity=rectify, name='conv2') mp2 = ll.MaxPool2DLayer(conv2, 2, stride=2) # now down to 32 x 32 bn2 = ll.BatchNormLayer(mp2) conv3 = ll.Conv2DLayer(bn2, num_filters=64, filter_size=filter3, pad=same_pad3, W=Orthogonal(), nonlinearity=rectify, name='conv3') mp3 = ll.MaxPool2DLayer(conv3, 2, stride=2) # now down to 16 x 16 bn3 = ll.BatchNormLayer(mp3) conv4 = ll.Conv2DLayer(bn3, num_filters=128, filter_size=filter3, pad=same_pad3, W=Orthogonal(), nonlinearity=rectify, name='conv4') mp4 = ll.MaxPool2DLayer(conv4, 2, stride=2) # now down to 8 x 8 bn4 = ll.BatchNormLayer(mp4) conv5 = ll.Conv2DLayer(bn4, num_filters=256, filter_size=filter3, pad=same_pad3, W=Orthogonal(), nonlinearity=rectify, name='conv5') mp5 = ll.MaxPool2DLayer(conv5, 2, stride=2) # down to 4 x 4 bn5 = ll.BatchNormLayer(mp5) conv6 = ll.Conv2DLayer(bn5, num_filters=512, filter_size=filter3, pad=same_pad3, W=Orthogonal(), nonlinearity=rectify, name='conv6') mp6 = ll.MaxPool2DLayer(conv6, 2, stride=2) # down to 2 x 2 bn6 = ll.BatchNormLayer(mp6) # now let's bring it down to a FC layer that takes in the 2x2x512 bn6 output fc1 = ll.DenseLayer(bn6, num_units=256, nonlinearity=rectify) bn1_fc = ll.BatchNormLayer(fc1) #dp1 = ll.DropoutLayer(bn1, p=0.5) fc2 = ll.DenseLayer(bn1_fc, num_units=64, nonlinearity=rectify) #dp2 = ll.DropoutLayer(fc2, p=0.5) bn2_fc = ll.BatchNormLayer(fc2) out = ll.DenseLayer(bn2_fc, num_units=6, nonlinearity=linear) out_rs = ll.ReshapeLayer(out, ([0], 3, 2)) return out_rs
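# --- Training sketch (assumption, not from the source): one plausible way to
# fit the keypoint extractor above with a mean-squared-error objective on
# (batch, 3, 2) keypoint targets. The optimizer and learning rate are guesses.
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as ll

kp_net = build_kpextractor128()
X = T.tensor4('X')   # (batch, 1, 128, 128) input images
Y = T.tensor3('Y')   # (batch, 3, 2) keypoint targets
pred = ll.get_output(kp_net, inputs=X)
loss = T.mean((pred - Y) ** 2)
params = ll.get_all_params(kp_net, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)
train_kp = theano.function([X, Y], loss, updates=updates)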
def __init__(self, data_dir, word2vec, word_vector_size, truncate_gradient, learning_rate, dim, cnn_dim, cnn_dim_fc, story_len, patches, mode, answer_module, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.data_dir = data_dir self.learning_rate = learning_rate self.truncate_gradient = truncate_gradient self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.cnn_dim = cnn_dim self.cnn_dim_fc = cnn_dim_fc self.story_len = story_len self.mode = mode self.patches = patches self.answer_module = answer_module self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout #self.vocab, self.ivocab = self._load_vocab(self.data_dir) self.vocab, self.ivocab = self._ext_vocab_from_word2vec() self.train_story = None self.test_story = None self.train_dict_story, self.train_lmdb_env_fc, self.train_lmdb_env_conv = self._process_input_sind( self.data_dir, 'train') self.test_dict_story, self.test_lmdb_env_fc, self.test_lmdb_env_conv = self._process_input_sind( self.data_dir, 'val') self.train_story = self.train_dict_story.keys() self.test_story = self.test_dict_story.keys() self.vocab_size = len(self.vocab) # Since this is pretty expensive, we will pass a story each time. # We assume that the input has been processed such that the sequences of patches # are snake like path. self.input_var = T.tensor4( 'input_var') # (batch_size, seq_len, patches, cnn_dim) self.q_var = T.matrix('q_var') # Now, it's a batch * image_sieze. self.answer_var = T.imatrix( 'answer_var') # answer of example in minibatch self.answer_mask = T.matrix('answer_mask') self.answer_inp_var = T.tensor3( 'answer_inp_var') # answer of example in minibatch print "==> building input module" self.W_inp_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim)) #self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim,)) # First, we embed the visual features before sending it to the bi-GRUs. inp_rhp = T.reshape( self.input_var, (self.batch_size * self.story_len * self.patches, self.cnn_dim)) inp_rhp_dimshuffled = inp_rhp.dimshuffle(1, 0) inp_rhp_emb = T.dot(self.W_inp_emb_in, inp_rhp_dimshuffled) inp_rhp_emb_dimshuffled = inp_rhp_emb.dimshuffle(1, 0) inp_emb_raw = T.reshape( inp_rhp_emb_dimshuffled, (self.batch_size, self.story_len, self.patches, self.cnn_dim)) inp_emb = T.tanh( inp_emb_raw ) # Just follow the paper DMN for visual and textual QA. # Now, we use a bi-directional GRU to produce the input. # Forward GRU. self.inp_dim = self.dim / 2 # since we have forward and backward self.W_inpf_res_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_res = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_upd = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_hid = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) # Backward GRU. 
self.W_inpb_res_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_res_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_res = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpb_upd_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_upd = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpb_hid_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_hid = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) # Now, we use the GRU to build the inputs. # Two-level of nested scan is unnecessary. It will become too complicated. Just use this one. inp_dummy = theano.shared( np.zeros((self.inp_dim, self.story_len), dtype=floatX)) for i in range(self.batch_size): if i == 0: inp_1st_f, _ = theano.scan( fn=self.input_gru_step_forward, sequences=inp_emb[i, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) inp_1st_b, _ = theano.scan( fn=self.input_gru_step_backward, sequences=inp_emb[i, :, ::-1, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) # Now, combine them. inp_1st = T.concatenate([ inp_1st_f.dimshuffle(2, 0, 1), inp_1st_b.dimshuffle(2, 0, 1) ], axis=-1) self.inp_c = inp_1st.dimshuffle('x', 0, 1, 2) else: inp_f, _ = theano.scan( fn=self.input_gru_step_forward, sequences=inp_emb[i, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) inp_b, _ = theano.scan( fn=self.input_gru_step_backward, sequences=inp_emb[i, :, ::-1, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) # Now, combine them. inp_fb = T.concatenate( [inp_f.dimshuffle(2, 0, 1), inp_b.dimshuffle(2, 0, 1)], axis=-1) self.inp_c = T.concatenate( [self.inp_c, inp_fb.dimshuffle('x', 0, 1, 2)], axis=0) # Done, now self.inp_c should be batch_size x story_len x patches x cnn_dim # Eventually, we can flattern them. # Now, the input dimension is 1024 because we have forward and backward. inp_c_t = T.reshape( self.inp_c, (self.batch_size, self.story_len * self.patches, self.dim)) inp_c_t_dimshuffled = inp_c_t.dimshuffle(0, 'x', 1, 2) inp_batch = T.repeat(inp_c_t_dimshuffled, self.story_len, axis=1) # Now, its ready for all the 5 images in the same story. # 50 * 980 * 512 self.inp_batch = T.reshape(inp_batch, (inp_batch.shape[0] * inp_batch.shape[1], inp_batch.shape[2], inp_batch.shape[3])) self.inp_batch_dimshuffled = self.inp_batch.dimshuffle( 1, 2, 0) # 980 x 512 x 50 # It's very simple now, the input module just need to map from cnn_dim to dim. logging.info('self.cnn_dim = %d', self.cnn_dim) print "==> building question module" # Now, share the parameter with the input module. self.W_inp_emb_q = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim_fc)) self.b_inp_emb_q = nn_utils.normal_param(std=0.1, shape=(self.dim, )) q_var_shuffled = self.q_var.dimshuffle(1, 0) inp_q = T.dot( self.W_inp_emb_q, q_var_shuffled) + self.b_inp_emb_q.dimshuffle( 0, 'x') # 512 x 50 self.q_q = T.tanh( inp_q ) # Since this is used to initialize the memory, we need to make it tanh. 
print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): #m = printing.Print('mem')(memory[iter-1]) current_episode = self.new_episode(memory[iter - 1]) #current_episode = self.new_episode(m) #current_episode = printing.Print('current_episode')(current_episode) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle((1, 0)) net = layers.InputLayer(shape=(self.batch_size * self.story_len, self.dim), input_var=last_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) logging.info('last_mem size') print last_mem.shape.eval({ self.input_var: np.random.rand(10, 5, 196, 512).astype('float32'), self.q_var: np.random.rand(50, 4096).astype('float32') }) print "==> building answer module" answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0) # Sounds good. Now, we need to map last_mem to a new space. self.W_mem_emb = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 2)) self.W_inp_emb = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) def _dot2(x, W): return T.dot(W, x) answer_inp_var_shuffled_emb, _ = theano.scan( fn=_dot2, sequences=answer_inp_var_shuffled, non_sequences=self.W_inp_emb) # seq x dim x batch # Now, we also need to embed the image and use it to do the memory. #q_q_shuffled = self.q_q.dimshuffle(1,0) # dim * batch. init_ans = T.concatenate([self.q_q, last_mem], axis=0) mem_ans = T.dot(self.W_mem_emb, init_ans) # dim x batchsize. mem_ans_dim = mem_ans.dimshuffle('x', 0, 1) answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb], axis=0) # Now, we have both embedding. We can let them go to the rnn. # We also need to map the input layer as well. 
dummy = theano.shared( np.zeros((self.dim, self.batch_size * self.story_len), dtype=floatX)) self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim)) self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) logging.info('answer_inp size') #print answer_inp.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32')}) #last_mem = printing.Print('prob_sm')(last_mem) results, _ = theano.scan(fn=self.answer_gru_step, sequences=answer_inp, outputs_info=[dummy]) # Assume there is a start token #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore') results = results[ 1: -1, :, :] # get rid of the last token as well as the first one (image) #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore') # Now, we need to transform it to the probabilities. 
prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x), sequences=results, non_sequences=self.W_a) prob_shuffled = prob.dimshuffle(2, 0, 1) # b * len * vocab logging.info("prob shape.") #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}) n = prob_shuffled.shape[0] * prob_shuffled.shape[1] prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2])) prob_sm = nn_utils.softmax_(prob_rhp) self.prediction = prob_sm mask = T.reshape(self.answer_mask, (n, )) lbl = T.reshape(self.answer_var, (n, )) self.params = [ self.W_inp_emb_in, #self.b_inp_emb_in, self.W_inpf_res_in, self.W_inpf_res_hid, self.b_inpf_res, self.W_inpf_upd_in, self.W_inpf_upd_hid, self.b_inpf_upd, self.W_inpf_hid_in, self.W_inpf_hid_hid, self.b_inpf_hid, self.W_inpb_res_in, self.W_inpb_res_hid, self.b_inpb_res, self.W_inpb_upd_in, self.W_inpb_upd_hid, self.b_inpb_upd, self.W_inpb_hid_in, self.W_inpb_hid_hid, self.b_inpb_hid, self.W_inp_emb_q, self.b_inp_emb_q, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a, self.W_mem_emb, self.W_inp_emb, self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid, ] print "==> building loss layer and computing updates" loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl) self.loss_ce = (mask * loss_vec).sum() / mask.sum() #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.adam(self.loss, self.params, learning_rate = self.learning_rate) updates = lasagne.updates.rmsprop(self.loss, self.params, learning_rate=self.learning_rate) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var ], outputs=[self.prediction, self.loss], updates=updates) #profile = True) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var ], outputs=[self.prediction, self.loss])
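# --- Illustration (not from the source): the masked loss above averages the
# per-token cross-entropy over real (unpadded) positions only. The same
# formula applied to made-up numbers:
import numpy as np

prob = np.array([[0.7, 0.3], [0.5, 0.5], [0.9, 0.1]])  # (n, vocab) softmax rows
lbl = np.array([0, 1, 0])                              # flattened labels
mask = np.array([1.0, 1.0, 0.0])                       # last step is padding
loss_vec = -np.log(prob[np.arange(3), lbl])            # per-token cross-entropy
loss_ce = (mask * loss_vec).sum() / mask.sum()         # padded step is ignored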
def build_segmenter_jet_preconv(): # downsample down to a small region, then upsample all the way back up, using jet architecture # recreate basic FCN-8s structure (though more aptly 1s here since we upsample back to the original input size) # this jet will have another conv layer in the final upsample # difference here is that instead of combining softmax layers in the jet, we'll upsample before the conv_f* layer # this will certainly make the model slower, but should give us better predictions... # The awkward part here is combining the intermediate conv layers when they have different filter shapes # We could: # concat them # have intermediate conv layers that bring them to the shape needed then merge them # in the interests of speed we'll just concat them, though we'll have a ton of filters at the end inp = ll.InputLayer(shape=(None, 1, None, None), name='input') conv1 = ll.Conv2DLayer(inp, num_filters=32, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv1_1') bn1 = ll.BatchNormLayer(conv1, name='bn1') conv2 = ll.Conv2DLayer(conv1, num_filters=64, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv1_2') bn2 = ll.BatchNormLayer(conv2, name='bn2') mp1 = ll.MaxPool2DLayer(conv2, 2, stride=2, name='mp1') # 2x downsample conv3 = ll.Conv2DLayer(mp1, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv2_1') bn3 = ll.BatchNormLayer(conv3, name='bn3') conv4 = ll.Conv2DLayer(conv3, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv2_2') bn4 = ll.BatchNormLayer(conv4, name='bn4') mp2 = ll.MaxPool2DLayer(conv4, 2, stride=2, name='mp2') # 4x downsample conv5 = ll.Conv2DLayer(mp2, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv3_1') bn5 = ll.BatchNormLayer(conv5, name='bn5') conv6 = ll.Conv2DLayer(conv5, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv3_2') bn6 = ll.BatchNormLayer(conv6, name='bn6') mp3 = ll.MaxPool2DLayer(conv6, 2, stride=2, name='mp3') # 8x downsample conv7 = ll.Conv2DLayer(mp3, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv4_1') bn7 = ll.BatchNormLayer(conv7, name='bn7') conv8 = ll.Conv2DLayer(conv7, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv4_2') bn8 = ll.BatchNormLayer(conv8, name='bn8') # f 68 s 8 # now start the upsample ## FIRST UPSAMPLE PREDICTION (akin to FCN-32s) up8 = ll.Upscale2DLayer( bn8, 8, name='upsample_8x') # take loss here, 8x upsample from 8x downsample conv_f8 = ll.Conv2DLayer(up8, num_filters=2, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=linear, name='conv_8xpred') softmax_8 = Softmax4D(conv_f8, name='4dsoftmax_8x') ## COMBINE BY UPSAMPLING CONV 8 AND CONV 6 conv_8_up2 = ll.Upscale2DLayer(bn8, 2, name='upsample_c8_2') # 4x downsample concat_c8_c6 = ll.ConcatLayer([conv_8_up2, bn6], axis=1, name='concat_c8_c6') up4 = ll.Upscale2DLayer( concat_c8_c6, 4, name='upsample_4x') # take loss here, 4x upsample from 4x downsample conv_f4 = ll.Conv2DLayer(up4, num_filters=2, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=linear, name='conv_4xpred') softmax_4 = Softmax4D(conv_f4, name='4dsoftmax_4x') # 4x downsample ## COMBINE BY UPSAMPLING CONCAT_86 AND CONV 4 concat_86_up2 = ll.Upscale2DLayer( concat_c8_c6, 2, name='upsample_concat_86_2') # 2x downsample concat_ct86_c4 = 
ll.ConcatLayer([concat_86_up2, bn4], axis=1, name='concat_ct86_c4') up2 = ll.Upscale2DLayer( concat_ct86_c4, 2, name='upsample_2x' ) # final loss here, 2x upsample from a 2x downsample conv_f2 = ll.Conv2DLayer(up2, num_filters=2, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=linear, name='conv_2xpred') softmax_2 = Softmax4D(conv_f2, name='4dsoftmax_2x') ## COMBINE BY UPSAMPLING CONCAT_864 AND CONV 2 concat_864_up2 = ll.Upscale2DLayer( concat_ct86_c4, 2, name='upsample_concat_864_2') # no downsample concat_864_c2 = ll.ConcatLayer([concat_864_up2, bn2], axis=1, name='concat_ct864_c2') conv_f1 = ll.Conv2DLayer(concat_864_c2, num_filters=2, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=linear, name='conv_1xpred') softmax_1 = Softmax4D(conv_f1, name='4dsoftmax_1x') # this is where up1 would go but that doesn't make any sense return [softmax_8, softmax_4, softmax_2, softmax_1]
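# --- Training sketch (assumption, not from the source): all four heads
# returned above sit at the input resolution, so one plausible objective is an
# equally weighted sum of per-head cross-entropies. The equal weighting, the
# optimizer, and the (batch, 2, H, W) Softmax4D output shape are assumptions.
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as ll

heads = build_segmenter_jet_preconv()
X = T.tensor4('X')     # (batch, 1, H, W) images
Y = T.itensor3('Y')    # (batch, H, W) integer class labels
outs = ll.get_output(heads, inputs=X)
losses = []
for o in outs:
    o2 = o.dimshuffle(0, 2, 3, 1).reshape((-1, 2))  # (batch*H*W, 2)
    losses.append(T.mean(T.nnet.categorical_crossentropy(o2, Y.flatten())))
loss = sum(losses)
params = ll.get_all_params(heads, trainable=True)
updates = lasagne.updates.adam(loss, params)
train_seg = theano.function([X, Y], loss, updates=updates)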
def buildModel(self): print(' -- Building...') x_init = sparse.csr_matrix('x', dtype='float32') y_init = T.imatrix('y') gx_init = sparse.csr_matrix('gx', dtype='float32') gy_init = T.ivector('gy') gz_init = T.vector('gz') mask_init = T.fmatrix('subMask') # step train x_input = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=x_init) x_to_label = layers.SparseLayer(x_input, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) x_to_emd = layers.SparseLayer(x_input, self.embedding_size) W = x_to_emd.W x_to_emd = layers.DenseLayer(x_to_emd, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) x_concat = lgl.ConcatLayer([x_to_label, x_to_emd], axis=1) x_concat = layers.DenseLayer(x_concat, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) pred = lgl.get_output(x_concat) step_loss = lgo.categorical_crossentropy(pred, y_init).mean() hid_loss = lgl.get_output(x_to_label) step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean() emd_loss = lgl.get_output(x_to_emd) step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean() step_params = lgl.get_all_params(x_concat) step_updates = lg.updates.sgd(step_loss, step_params, learning_rate=self.step_learning_rate) self.step_train = theano.function([x_init, y_init], step_loss, updates=step_updates) self.test_fn = theano.function([x_init], pred) # supervised train gx_input = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=gx_init) gx_to_emd = layers.SparseLayer(gx_input, self.embedding_size, W=W) gx_to_emd = lgl.DenseLayer(gx_to_emd, self.num_ver, nonlinearity=lg.nonlinearities.softmax) gx_pred = lgl.get_output(gx_to_emd) g_loss = lgo.categorical_crossentropy(gx_pred, gy_init).sum() sup_params = lgl.get_all_params(gx_to_emd) sup_updates = lg.updates.sgd(g_loss, sup_params, learning_rate=self.sup_learning_rate) self.sup_train = theano.function([gx_init, gy_init, gz_init], g_loss, updates=sup_updates, on_unused_input='ignore') # handle lstm input cross_entropy = lgo.categorical_crossentropy(gx_pred, gy_init) cross_entropy = T.reshape(cross_entropy, (1, self.subpath_num), ndim=None) mask_input = lgl.InputLayer(shape=(None, self.window_size + 1), input_var=mask_init) sub_path_batch1 = sparse.csr_matrix('x', dtype='float32') sub_path_input1 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch1) sub_path_batch2 = sparse.csr_matrix('x', dtype='float32') sub_path_input2 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch2) sub_path_batch3 = sparse.csr_matrix('x', dtype='float32') sub_path_input3 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch3) sub_path_batch4 = sparse.csr_matrix('x', dtype='float32') sub_path_input4 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch4) sub_path_emd1 = layers.SparseLayer(sub_path_input1, self.embedding_size, W=W) sub_path_emd1 = T.reshape(lgl.get_output(sub_path_emd1), (self.subpath_num, 1, self.embedding_size)) sub_path_emd2 = layers.SparseLayer(sub_path_input2, self.embedding_size, W=W) sub_path_emd2 = T.reshape(lgl.get_output(sub_path_emd2), (self.subpath_num, 1, self.embedding_size)) sub_path_emd3 = layers.SparseLayer(sub_path_input3, self.embedding_size, W=W) sub_path_emd3 = T.reshape(lgl.get_output(sub_path_emd3), (self.subpath_num, 1, self.embedding_size)) sub_path_emd4 = layers.SparseLayer(sub_path_input4, self.embedding_size, W=W) sub_path_emd4 = T.reshape(lgl.get_output(sub_path_emd4), (self.subpath_num, 1, self.embedding_size)) sub_path_concat = T.concatenate([sub_path_emd1, sub_path_emd2, 
sub_path_emd3, sub_path_emd4], axis=1) sub_path_concat_layer = lgl.InputLayer(shape=(None, self.window_size + 1, self.embedding_size), input_var=sub_path_concat) # lstm layer lstm_layer = lgl.LSTMLayer(sub_path_concat_layer, self.lstm_hidden_units, grad_clipping=3, mask_input=mask_input) # handle path weight max1 = T.mean(lgl.get_output(lstm_layer), axis=1) max2 = T.mean(max1, axis=1) max2_init = T.reshape(max2, ((self.subpath_num, 1))) max2_input = lgl.InputLayer(shape=(self.subpath_num, 1), input_var=max2_init) max2_input = lgl.BatchNormLayer(max2_input) path_weight = lgl.get_output(max2_input) path_weight = lg.nonlinearities.sigmoid(path_weight) path_weight = 1 + 0.3 * path_weight # unsupervised train reweight_loss = T.dot(cross_entropy, path_weight)[0][0] lstm_params = lgl.get_all_params(lstm_layer, trainable=True) lstm_updates = lg.updates.sgd(reweight_loss, lstm_params, learning_rate=0.01) self.lstm_fn = theano.function([gx_init, gy_init, gz_init, sub_path_batch1, sub_path_batch2, sub_path_batch3, sub_path_batch4, mask_init], reweight_loss, updates=lstm_updates, on_unused_input='ignore') alpha_updates = lg.updates.sgd(reweight_loss, sup_params, learning_rate=0.001) self.alpha_fn = theano.function([gx_init, gy_init, gz_init, sub_path_batch1, sub_path_batch2, sub_path_batch3, sub_path_batch4, mask_init], reweight_loss, updates=alpha_updates, on_unused_input='ignore') print(' -- Done!')
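# --- Illustration (not from the source): ignoring the batch-norm step, the
# reweighting above maps each subpath's mean LSTM activation through a sigmoid
# into a weight in (1.0, 1.3), then takes a weighted sum of the per-subpath
# cross-entropies. Numpy equivalent on made-up numbers:
import numpy as np

score = np.array([[-1.2], [0.4], [2.0]])        # (subpath_num, 1) path scores
path_weight = 1 + 0.3 / (1 + np.exp(-score))    # each weight in (1.0, 1.3)
cross_entropy = np.array([[0.9, 0.2, 0.5]])     # (1, subpath_num)
reweight_loss = cross_entropy.dot(path_weight)[0][0]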
def build_network(): batch_norm = False num_units = 500 # number of RNN hidden units l2 = 0.0 # l2 regularization dropout = 0.5 input_var = T.tensor4('input_var') answer_var = T.ivector('answer_var') print('==> building network') # batch_size is assumed to be defined at module scope example = np.random.uniform(size=(batch_size, 1, 128, 858), low=0.0, high=1.0).astype(np.float32) answer = np.random.randint(low=0, high=176, size=(batch_size, )) network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=input_var) print(layers.get_output(network).eval({input_var: example}).shape) # conv-relu-pool 1 network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({input_var: example}).shape) if batch_norm: network = layers.BatchNormLayer(incoming=network) # conv-relu-pool 2 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({input_var: example}).shape) if batch_norm: network = layers.BatchNormLayer(incoming=network) # conv-relu-pool 3 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({input_var: example}).shape) if batch_norm: network = layers.BatchNormLayer(incoming=network) # conv-relu-pool 4 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({input_var: example}).shape) if batch_norm: network = layers.BatchNormLayer(incoming=network) params = layers.get_all_params(network, trainable=True) output = layers.get_output(network) output = output.transpose((0, 3, 1, 2)) output = output.flatten(ndim=3) # these shape constants are important: they must match the final pool output num_channels = 32 filter_w = 54 filter_h = 8 network = layers.InputLayer(shape=(None, filter_w, num_channels * filter_h), input_var=output) print(layers.get_output(network).eval({input_var: example}).shape) network = layers.GRULayer(incoming=network, num_units=num_units, only_return_final=True) print(layers.get_output(network).eval({input_var: example}).shape) if batch_norm: network = layers.BatchNormLayer(incoming=network) if dropout > 0: network = layers.dropout(network, dropout) # last layer: classification network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax) print(layers.get_output(network).eval({input_var: example}).shape) params += layers.get_all_params(network, trainable=True) prediction = layers.get_output(network) print('==> param shapes', [x.eval().shape for x in params]) loss_ce = lasagne.objectives.categorical_crossentropy( prediction, answer_var).mean() if l2 > 0: loss_l2 = l2 * lasagne.regularization.apply_penalty( params, lasagne.regularization.l2) else: loss_l2 = 0 loss = loss_ce + loss_l2 # updates = lasagne.updates.adadelta(loss, params) updates = lasagne.updates.momentum(loss, params,
learning_rate=0.003) # good one # updates = lasagne.updates.momentum(loss, params, learning_rate=0.0003) # good one print('==> compiling train_fn') train_fn = theano.function(inputs=[input_var, answer_var], outputs=[prediction, loss], updates=updates) test_fn = theano.function(inputs=[input_var, answer_var], outputs=[prediction, loss]) return train_fn, test_fn
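# --- Usage sketch (assumption, not from the source): build_network() reads
# batch_size from module scope, so a definition like the one below is assumed
# to exist before calling it. Data values are illustrative.
import numpy as np

batch_size = 32
train_fn, test_fn = build_network()
xb = np.random.rand(batch_size, 1, 128, 858).astype('float32')
yb = np.random.randint(0, 176, size=batch_size).astype('int32')
prediction, loss = train_fn(xb, yb)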
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs): print ("==> not used params in DMN class:", kwargs.keys()) self.train_list_raw = train_list_raw self.test_list_raw = test_list_raw self.png_folder = png_folder self.batch_size = batch_size self.dropout = dropout self.l2 = l2 self.mode = mode self.batch_norm = batch_norm self.num_units = rnn_num_units self.input_var = T.tensor4('input_var') self.answer_var = T.ivector('answer_var') print ("==> building network") example = np.random.uniform(size=(self.batch_size, 1, 128, 858), low=0.0, high=1.0).astype(np.float32) ######### answer = np.random.randint(low=0, high=176, size=(self.batch_size,)) ######### network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=self.input_var) print (layers.get_output(network).eval({self.input_var:example}).shape) # CONV-RELU-POOL 1 network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), stride=1, nonlinearity=rectify) print (layers.get_output(network).eval({self.input_var:example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print (layers.get_output(network).eval({self.input_var:example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 2 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), stride=1, nonlinearity=rectify) print (layers.get_output(network).eval({self.input_var:example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print (layers.get_output(network).eval({self.input_var:example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 3 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), stride=1, nonlinearity=rectify) print (layers.get_output(network).eval({self.input_var:example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print (layers.get_output(network).eval({self.input_var:example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 4 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), stride=1, nonlinearity=rectify) print (layers.get_output(network).eval({self.input_var:example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print (layers.get_output(network).eval({self.input_var:example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) self.params = layers.get_all_params(network, trainable=True) output = layers.get_output(network) num_channels = 32 filter_W = 54 filter_H = 8 # NOTE: these constants are shapes of last pool layer, it can be symbolic # explicit values are better for optimizations channels = [] for channel_index in range(num_channels): channels.append(output[:, channel_index, :, :].transpose((0, 2, 1))) rnn_network_outputs = [] W_in_to_updategate = None W_hid_to_updategate = None b_updategate = None W_in_to_resetgate = None W_hid_to_resetgate = None b_resetgate = None W_in_to_hidden_update = None W_hid_to_hidden_update = None b_hidden_update = None W_in_to_updategate1 = None W_hid_to_updategate1 = None b_updategate1 = None W_in_to_resetgate1 = None W_hid_to_resetgate1 = None b_resetgate1 = None W_in_to_hidden_update1 = None W_hid_to_hidden_update1 = None b_hidden_update1 = None for channel_index in range(num_channels): 
rnn_input_var = channels[channel_index] # InputLayer network = layers.InputLayer(shape=(None, filter_W, filter_H), input_var=rnn_input_var) if (channel_index == 0): # GRULayer network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=False) W_in_to_updategate = network.W_in_to_updategate W_hid_to_updategate = network.W_hid_to_updategate b_updategate = network.b_updategate W_in_to_resetgate = network.W_in_to_resetgate W_hid_to_resetgate = network.W_hid_to_resetgate b_resetgate = network.b_resetgate W_in_to_hidden_update = network.W_in_to_hidden_update W_hid_to_hidden_update = network.W_hid_to_hidden_update b_hidden_update = network.b_hidden_update # BatchNormalization Layer if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # GRULayer network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True) W_in_to_updategate1 = network.W_in_to_updategate W_hid_to_updategate1 = network.W_hid_to_updategate b_updategate1 = network.b_updategate W_in_to_resetgate1 = network.W_in_to_resetgate W_hid_to_resetgate1 = network.W_hid_to_resetgate b_resetgate1 = network.b_resetgate W_in_to_hidden_update1 = network.W_in_to_hidden_update W_hid_to_hidden_update1 = network.W_hid_to_hidden_update b_hidden_update1 = network.b_hidden_update # add params self.params += layers.get_all_params(network, trainable=True) else: # GRULayer, but shared network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=False, resetgate=layers.Gate(W_in=W_in_to_resetgate, W_hid=W_hid_to_resetgate, b=b_resetgate), updategate=layers.Gate(W_in=W_in_to_updategate, W_hid=W_hid_to_updategate, b=b_updategate), hidden_update=layers.Gate(W_in=W_in_to_hidden_update, W_hid=W_hid_to_hidden_update, b=b_hidden_update)) # BatchNormalization Layer if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # GRULayer, but shared network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True, resetgate=layers.Gate(W_in=W_in_to_resetgate1, W_hid=W_hid_to_resetgate1, b=b_resetgate1), updategate=layers.Gate(W_in=W_in_to_updategate1, W_hid=W_hid_to_updategate1, b=b_updategate1), hidden_update=layers.Gate(W_in=W_in_to_hidden_update1, W_hid=W_hid_to_hidden_update1, b=b_hidden_update1)) rnn_network_outputs.append(layers.get_output(network)) all_output_var = T.concatenate(rnn_network_outputs, axis=1) print (all_output_var.eval({self.input_var:example}).shape) # InputLayer network = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=all_output_var) # Dropout Layer if (self.dropout > 0): network = layers.dropout(network, self.dropout) # BatchNormalization Layer if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # Last layer: classification network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax) print (layers.get_output(network).eval({self.input_var:example}).shape) self.params += layers.get_all_params(network, trainable=True) self.prediction = layers.get_output(network) #print "==> param shapes", [x.eval().shape for x in self.params] self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean() if (self.l2 > 0): self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params, lasagne.regularization.l2) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) updates = lasagne.updates.momentum(self.loss, self.params, 
learning_rate=0.003) if self.mode == 'train': print ("==> compiling train_fn") self.train_fn = theano.function(inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss], updates=updates) print ("==> compiling test_fn") self.test_fn = theano.function(inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss])
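# --- Standalone sketch (not from the source) of the weight-sharing trick used
# above: a second GRULayer reuses the first one's gate parameters by passing
# them through lasagne.layers.Gate, exactly as the per-channel loop does.
# Sizes are illustrative.
import theano.tensor as T
import lasagne.layers as layers

x1, x2 = T.tensor3('x1'), T.tensor3('x2')
l_in1 = layers.InputLayer((None, 54, 8), input_var=x1)
l_in2 = layers.InputLayer((None, 54, 8), input_var=x2)
gru1 = layers.GRULayer(l_in1, num_units=16)
gru2 = layers.GRULayer(
    l_in2, num_units=16,
    resetgate=layers.Gate(W_in=gru1.W_in_to_resetgate,
                          W_hid=gru1.W_hid_to_resetgate,
                          b=gru1.b_resetgate),
    updategate=layers.Gate(W_in=gru1.W_in_to_updategate,
                           W_hid=gru1.W_hid_to_updategate,
                           b=gru1.b_updategate),
    hidden_update=layers.Gate(W_in=gru1.W_in_to_hidden_update,
                              W_hid=gru1.W_hid_to_hidden_update,
                              b=gru1.b_hidden_update))
# gru2 now reads and updates the exact same shared variables as gru1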
def conv_nonl(data, num_filters, name, pad, use_bn=True): res = conv(data, num_filters, name, pad=pad) if (use_bn): res = L.BatchNormLayer(res, name='bn_' + name) res = L.NonlinearityLayer(res, rectify, name='relu_' + name) return res
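# --- Assumption (not from the source): `conv` used by conv_nonl above is a
# project-local helper. A minimal definition compatible with how conv_nonl
# calls it might look like this; the 3x3 filter size and orthogonal init are
# guesses, and the nonlinearity is linear because rectify is applied after BN.
import lasagne.layers as L
from lasagne.init import Orthogonal
from lasagne.nonlinearities import linear

def conv(data, num_filters, name, pad='same'):
    return L.Conv2DLayer(data, num_filters, (3, 3), pad=pad,
                         W=Orthogonal(), nonlinearity=linear,
                         name='conv_' + name)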
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.train_list_raw = train_list_raw self.test_list_raw = test_list_raw self.png_folder = png_folder self.batch_size = batch_size self.dropout = dropout self.l2 = l2 self.mode = mode self.batch_norm = batch_norm self.input_var = T.tensor4('input_var') self.answer_var = T.ivector('answer_var') print "==> building network" example = np.random.uniform(size=(self.batch_size, 1, 256, 858), low=0.0, high=1.0).astype(np.float32) ######### answer = np.random.randint(low=0, high=176, size=(self.batch_size,)) ######### network = layers.InputLayer(shape=(None, 1, 256, 858), input_var=self.input_var) print layers.get_output(network).eval({self.input_var:example}).shape # CONV-RELU-POOL 1 network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), stride=1, nonlinearity=rectify) print layers.get_output(network).eval({self.input_var:example}).shape network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) print layers.get_output(network).eval({self.input_var:example}).shape if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 2 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), stride=1, nonlinearity=rectify) print layers.get_output(network).eval({self.input_var:example}).shape network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) print layers.get_output(network).eval({self.input_var:example}).shape if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 3 network = layers.Conv2DLayer(incoming=network, num_filters=64, filter_size=(3, 3), stride=1, nonlinearity=rectify) print layers.get_output(network).eval({self.input_var:example}).shape network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) print layers.get_output(network).eval({self.input_var:example}).shape if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 4 network = layers.Conv2DLayer(incoming=network, num_filters=128, filter_size=(3, 3), stride=1, nonlinearity=rectify) print layers.get_output(network).eval({self.input_var:example}).shape network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) print layers.get_output(network).eval({self.input_var:example}).shape if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 5 network = layers.Conv2DLayer(incoming=network, num_filters=128, filter_size=(3, 3), stride=1, nonlinearity=rectify) print layers.get_output(network).eval({self.input_var:example}).shape network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) print layers.get_output(network).eval({self.input_var:example}).shape if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 6 network = layers.Conv2DLayer(incoming=network, num_filters=256, filter_size=(3, 3), stride=1, nonlinearity=rectify) print layers.get_output(network).eval({self.input_var:example}).shape network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=(3, 2), ignore_border=False) print layers.get_output(network).eval({self.input_var:example}).shape if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # DENSE 1 network = 
layers.DenseLayer(incoming=network, num_units=1024, nonlinearity=rectify) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) if (self.dropout > 0): network = layers.dropout(network, self.dropout) print layers.get_output(network).eval({self.input_var:example}).shape """ # DENSE 2 network = layers.DenseLayer(incoming=network, num_units=1024, nonlinearity=rectify) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) if (self.dropout > 0): network = layers.dropout(network, self.dropout) print layers.get_output(network).eval({self.input_var:example}).shape """ # Last layer: classification network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax) print layers.get_output(network).eval({self.input_var:example}).shape self.params = layers.get_all_params(network, trainable=True) self.prediction = layers.get_output(network) self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean() if (self.l2 > 0): self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(network, lasagne.regularization.l2) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function(inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss])
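# --- Helper sketch (not from the source) for checking the shapes printed by
# the conv/pool stack above: stride-1 'valid' convs shrink a dimension by
# (filter - 1), and pools with ignore_border=False round partial windows up.
def conv_out(n, f):
    return n - f + 1

def pool_out(n, f, s):
    return (max(n - f, 0) + s - 1) // s + 1   # ceil((n - f) / s) + 1

h, w = 256, 858
for f_conv, (sh, sw) in [(7, (2, 2)), (5, (2, 2)), (3, (2, 2)),
                         (3, (2, 2)), (3, (2, 2)), (3, (3, 2))]:
    h = pool_out(conv_out(h, f_conv), 3, sh)
    w = pool_out(conv_out(w, f_conv), 3, sw)
print(h, w)   # final spatial size fed into DENSE 1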
def buildModel(self): print(' -- Building...') x_init = sparse.csr_matrix('x', dtype='float32') y_init = T.imatrix('y') g_init = T.imatrix('g') ind_init = T.ivector('ind') sub_path_init = T.imatrix('subPathsBatch') mask_init = T.fmatrix('subMask') # step train x_input = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=x_init) g_input = lgl.InputLayer(shape=(None, 2), input_var=g_init) ind_input = lgl.InputLayer(shape=(None, ), input_var=ind_init) pair_second = lgl.SliceLayer(g_input, indices=1, axis=1) pair_first = lgl.SliceLayer(g_input, indices=0, axis=1) pair_first_emd = lgl.EmbeddingLayer(pair_first, input_size=self.num_ver, output_size=self.embedding_size) emd_to_numver = layers.DenseLayer( pair_first_emd, self.num_ver, nonlinearity=lg.nonlinearities.softmax) index_emd = lgl.EmbeddingLayer(ind_input, input_size=self.num_ver, output_size=self.embedding_size, W=pair_first_emd.W) x_to_ydim = layers.SparseLayer(x_input, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) index_emd = layers.DenseLayer(index_emd, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) concat_two = lgl.ConcatLayer([x_to_ydim, index_emd], axis=1) concat_two = layers.DenseLayer(concat_two, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) concat_two_output = lgl.get_output(concat_two) step_loss = lgo.categorical_crossentropy(concat_two_output, y_init).mean() hid_loss = lgl.get_output(x_to_ydim) step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean() emd_loss = lgl.get_output(index_emd) step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean() step_params = [ index_emd.W, index_emd.b, x_to_ydim.W, x_to_ydim.b, concat_two.W, concat_two.b ] step_updates = lg.updates.sgd(step_loss, step_params, learning_rate=self.step_learning_rate) self.step_train = theano.function([x_init, y_init, ind_init], step_loss, updates=step_updates, on_unused_input='ignore') self.test_fn = theano.function([x_init, ind_init], concat_two_output, on_unused_input='ignore') # supervised train fc_output = lgl.get_output(emd_to_numver) pair_second_output = lgl.get_output(pair_second) sup_loss = lgo.categorical_crossentropy(fc_output, pair_second_output).sum() sup_params = lgl.get_all_params(emd_to_numver, trainable=True) sup_updates = lg.updates.sgd(sup_loss, sup_params, learning_rate=self.sup_learning_rate) self.sup_train = theano.function([g_init], sup_loss, updates=sup_updates, on_unused_input='ignore') cross_entropy = lgo.categorical_crossentropy(fc_output, pair_second_output) cross_entropy = T.reshape(cross_entropy, (1, self.unsup_batch_size), ndim=None) mask_input = lgl.InputLayer(shape=(None, self.window_size + 1), input_var=mask_init) subPath_in = lgl.InputLayer(shape=(None, self.window_size + 1), input_var=sub_path_init) sub_path_emd = lgl.EmbeddingLayer(subPath_in, input_size=self.num_ver, output_size=self.embedding_size, W=pair_first_emd.W) lstm_layer = lgl.LSTMLayer(sub_path_emd, self.lstm_hidden_units, grad_clipping=3, mask_input=mask_input) # handle path weight max1 = T.mean(lgl.get_output(lstm_layer), axis=1) max2 = T.mean(max1, axis=1) max2_init = T.reshape(max2, ((self.subpath_num, 1))) max2_input = lgl.InputLayer(shape=(self.subpath_num, 1), input_var=max2_init) max2_input = lgl.BatchNormLayer(max2_input) path_weight = lgl.get_output(max2_input) path_weight = lg.nonlinearities.sigmoid(path_weight) path_weight = 1 + 0.3 * path_weight # unsupervised train reweight_loss = T.dot(cross_entropy, path_weight)[0][0] lstm_params_all = lgl.get_all_params(lstm_layer,
trainable=True) lstm_params = list(set(lstm_params_all).difference(set(sup_params))) lstm_updates = lg.updates.sgd(reweight_loss, lstm_params, learning_rate=0.01) self.lstm_fn = theano.function([sub_path_init, g_init, mask_init], reweight_loss, updates=lstm_updates, on_unused_input='ignore') alpha_updates = lg.updates.sgd(reweight_loss, sup_params, learning_rate=0.001) self.alpha_fn = theano.function([sub_path_init, g_init, mask_init], reweight_loss, updates=alpha_updates, on_unused_input='ignore') print(' -- Done!')
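# --- Hypothetical driver (not from the source): the four compiled functions
# above suggest an alternating training schedule. `model` is an instance of
# this class; num_iterations and the batch-producing helpers are assumptions.
for it in range(num_iterations):
    x_b, y_b, ind_b = next_labeled_batch()    # assumed helper
    model.step_train(x_b, y_b, ind_b)         # supervised prediction step
    g_b = next_pair_batch()                   # assumed helper
    model.sup_train(g_b)                      # embedding step
    paths_b, mask_b = next_subpath_batch()    # assumed helper
    model.lstm_fn(paths_b, g_b, mask_b)       # LSTM path-weight step
    model.alpha_fn(paths_b, g_b, mask_b)      # reweighted embedding step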