def build_network(self, vocab_size, input_var, mask_var, W_init):

    l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var)
    l_mask = L.InputLayer(shape=(None, None), input_var=mask_var)
    l_embed = L.EmbeddingLayer(l_in, input_size=vocab_size, output_size=EMBED_DIM, W=W_init)

    # first bidirectional LSTM pass over the (masked) sequence
    l_fwd_1 = L.LSTMLayer(l_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True)
    l_bkd_1 = L.LSTMLayer(l_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True)

    l_all_1 = L.concat([l_fwd_1, l_bkd_1], axis=2)

    # second bidirectional LSTM pass over the concatenated states
    l_fwd_2 = L.LSTMLayer(l_all_1, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True)
    l_bkd_2 = L.LSTMLayer(l_all_1, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True)

    # sum the last forward state and the first backward state of each pass
    l_fwd_1_slice = L.SliceLayer(l_fwd_1, -1, 1)
    l_bkd_1_slice = L.SliceLayer(l_bkd_1, 0, 1)
    y_1 = L.ElemwiseSumLayer([l_fwd_1_slice, l_bkd_1_slice])

    l_fwd_2_slice = L.SliceLayer(l_fwd_2, -1, 1)
    l_bkd_2_slice = L.SliceLayer(l_bkd_2, 0, 1)
    y_2 = L.ElemwiseSumLayer([l_fwd_2_slice, l_bkd_2_slice])

    y = L.concat([y_1, y_2], axis=1)
    g = L.DenseLayer(y, num_units=EMBED_DIM, nonlinearity=lasagne.nonlinearities.tanh)

    # softmax over the vocabulary, with weights tied to the embedding matrix
    l_out = L.DenseLayer(g, num_units=vocab_size, W=l_embed.W.T,
                         nonlinearity=lasagne.nonlinearities.softmax)
    return l_out
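# --- Hypothetical usage sketch (not part of the original code) -------------------------
# Shows how the layer returned by build_network() above might be compiled into training
# and prediction functions. `model` (the object owning build_network) and `vocab_size`
# are assumed to exist; the cross-entropy loss and Adam optimizer are illustrative choices.
def example_compile_reader(model, vocab_size):
    import theano
    import theano.tensor as T
    import lasagne
    import lasagne.layers as L

    input_var = T.itensor3('input')     # batch x time x 1 word indices
    mask_var = T.matrix('mask')         # batch x time, 1.0 for real tokens
    target_var = T.ivector('targets')   # one answer-word index per example

    l_out = model.build_network(vocab_size, input_var, mask_var, W_init=lasagne.init.Normal())

    probs = L.get_output(l_out)
    loss = lasagne.objectives.categorical_crossentropy(probs, target_var).mean()
    params = L.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)

    train_fn = theano.function([input_var, mask_var, target_var], loss, updates=updates)
    predict_fn = theano.function([input_var, mask_var], probs.argmax(axis=-1))
    return train_fn, predict_fn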
def build_autoencoder_network():
    input_var = T.tensor4('input_var')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(5,5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 120, filter_size=(5,5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 120, filter_size=(1,1), stride=1, pad='same', nonlinearity=leaky_rectify))
    pool1 = layers.MaxPool2DLayer(layer, (2, 2), 2)
    layer = batch_norm(layers.Conv2DLayer(pool1, 240, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 320, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 320, filter_size=(1,1), stride=1, pad='same', nonlinearity=leaky_rectify))
    pool2 = layers.MaxPool2DLayer(layer, (2, 2), 2)
    layer = batch_norm(layers.Conv2DLayer(pool2, 640, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    prely = batch_norm(layers.Conv2DLayer(layer, 1024, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))

    featm = batch_norm(layers.Conv2DLayer(prely, 640, filter_size=(1,1), nonlinearity=leaky_rectify))
    feat_map = batch_norm(layers.Conv2DLayer(featm, 100, filter_size=(1,1), nonlinearity=rectify, name="feat_map"))
    maskm = batch_norm(layers.Conv2DLayer(prely, 100, filter_size=(1,1), nonlinearity=leaky_rectify))
    mask_rep = batch_norm(layers.Conv2DLayer(maskm, 1, filter_size=(1,1), nonlinearity=None), beta=None, gamma=None)
    mask_map = SoftThresPerc(mask_rep, perc=90.0, alpha=0.1, beta=init.Constant(0.5), tight=100.0, name="mask_map")
    layer = ChInnerProdMerge(feat_map, mask_map, name="encoder")

    layer = batch_norm(layers.Deconv2DLayer(layer, 1024, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(1,1), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = layers.InverseLayer(layer, pool2)
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 120, filter_size=(1,1), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = layers.InverseLayer(layer, pool1)
    layer = batch_norm(layers.Deconv2DLayer(layer, 120, filter_size=(5,5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(5,5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = layers.Deconv2DLayer(layer, 3, filter_size=(1,1), stride=1, crop='same', nonlinearity=identity)

    glblf = batch_norm(layers.Conv2DLayer(prely, 128, filter_size=(1,1), nonlinearity=leaky_rectify))
    glblf = layers.Pool2DLayer(glblf, pool_size=(5,5), stride=5, mode='average_inc_pad')
    glblf = batch_norm(layers.Conv2DLayer(glblf, 64, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Conv2DLayer(glblf, 5, filter_size=(1,1), nonlinearity=rectify), name="global_feature")
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 256, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(9,9), stride=5, crop=(2,2), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = layers.Deconv2DLayer(glblf, 3, filter_size=(1,1), stride=1, crop='same', nonlinearity=identity)

    layer = layers.ElemwiseSumLayer([layer, glblf])
    network = ReshapeLayer(layer, ([0], -1))

    mask_var = lasagne.layers.get_output(mask_map)
    output_var = lasagne.layers.get_output(network)

    return network, input_var, mask_var, output_var
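# --- Hypothetical usage sketch (not part of the original code) -------------------------
# Wires the autoencoder above into a mean-squared reconstruction loss; the flattened input
# is used as the target to match ReshapeLayer(layer, ([0], -1)). The loss and Adam settings
# are illustrative assumptions, not taken from the original script.
def example_compile_autoencoder():
    import theano
    import lasagne

    network, input_var, mask_var, output_var = build_autoencoder_network()

    target_flat = input_var.flatten(2)   # (batch, 3*PS*PS), same layout as the network output
    recon_loss = lasagne.objectives.squared_error(output_var, target_flat).mean()

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adam(recon_loss, params, learning_rate=5e-4)

    train_fn = theano.function([input_var], recon_loss, updates=updates)
    mask_fn = theano.function([input_var], mask_var)   # inspect the learned soft mask
    return train_fn, mask_fn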
def build_network(self):

    l_char1_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[0])
    l_char2_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[1])
    l_mask1_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[2])
    l_mask2_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[3])

    l_char_in = L.ConcatLayer([l_char1_in, l_char2_in], axis=1)      # B x (ND+NQ) x L
    l_char_mask = L.ConcatLayer([l_mask1_in, l_mask2_in], axis=1)

    shp = (self.inps[0].shape[0], self.inps[0].shape[1] + self.inps[1].shape[1], self.inps[1].shape[2])
    l_index_reshaped = L.ReshapeLayer(l_char_in, (shp[0] * shp[1], shp[2]))    # BN x L
    l_mask_reshaped = L.ReshapeLayer(l_char_mask, (shp[0] * shp[1], shp[2]))   # BN x L

    l_lookup = L.EmbeddingLayer(l_index_reshaped, self.num_chars, self.char_dim)   # BN x L x D
    l_fgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=10, gradient_steps=-1,
                        precompute_input=True, only_return_final=True, mask_input=l_mask_reshaped)
    l_bgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=10, gradient_steps=-1,
                        precompute_input=True, backwards=True, only_return_final=True,
                        mask_input=l_mask_reshaped)   # BN x 2D

    l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim / 2, nonlinearity=None)   # BN x DE
    l_bckembed = L.DenseLayer(l_bgru, self.embed_dim / 2, nonlinearity=None)   # BN x DE
    l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
    l_char_embed = L.ReshapeLayer(l_embed, (shp[0], shp[1], self.embed_dim / 2))

    l_embed1 = L.SliceLayer(l_char_embed, slice(0, self.inps[0].shape[1]), axis=1)
    l_embed2 = L.SliceLayer(l_char_embed, slice(-self.inps[1].shape[1], None), axis=1)

    return l_embed1, l_embed2
def build_autoencoder_network():
    input_var = T.tensor4('input_var')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(5,5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 120, filter_size=(5,5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2,2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 240, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 320, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2,2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 640, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    prely = batch_norm(layers.Conv2DLayer(layer, 1024, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))

    featm = batch_norm(layers.Conv2DLayer(prely, 640, filter_size=(1,1), nonlinearity=leaky_rectify))
    feat_map = batch_norm(layers.Conv2DLayer(featm, 100, filter_size=(1,1), nonlinearity=rectify, name="feat_map"))
    layer = feat_map

    layer = batch_norm(layers.Deconv2DLayer(layer, 1024, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 240, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 120, filter_size=(5,5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(5,5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = layers.Deconv2DLayer(layer, 3, filter_size=(1,1), stride=1, crop='same', nonlinearity=identity)

    glblf = batch_norm(layers.Conv2DLayer(prely, 128, filter_size=(1,1), nonlinearity=leaky_rectify))
    glblf = layers.Pool2DLayer(glblf, pool_size=(5,5), stride=5, mode='average_inc_pad')
    glblf = batch_norm(layers.Conv2DLayer(glblf, 64, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Conv2DLayer(glblf, 5, filter_size=(1,1), nonlinearity=rectify), name="global_feature")
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 256, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(9,9), stride=5, crop=(2,2), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = layers.Deconv2DLayer(glblf, 3, filter_size=(1,1), stride=1, crop='same', nonlinearity=identity)

    layer = layers.ElemwiseSumLayer([layer, glblf])
    network = ReshapeLayer(layer, ([0], -1))

    output_var = lasagne.layers.get_output(network)

    return network, input_var, output_var
def get_conv_input(self, sidx, tidx, avg=False):
    suf = '_avg' if avg else ''

    feat_embs = [self.manager.feats[name].get_emb_layer(sidx, tidx, avg=avg)
                 for name in self.args.source_feats]

    # TODO: change the meaning
    if self.args.lex == 'mix':
        concat_emb = L.ElemwiseSumLayer(feat_embs)   # (100, 15, 256)
    else:
        concat_emb = L.concat(feat_embs, axis=2)     # (100, 15, 256+100)

    pos = np.array([0] * (self.args.window_size / 2) + [1] + [0] * (self.args.window_size / 2)).astype(theano.config.floatX)
    post = theano.shared(pos[np.newaxis, :, np.newaxis], borrow=True)   # (1, 15, 1)
    posl = L.InputLayer((None, self.args.window_size, 1),
                        input_var=T.extra_ops.repeat(post, sidx.shape[0], axis=0))   # (100, 15, 1)
    conv_in = L.concat([concat_emb, posl], axis=2)   # (100, 15, 256+1)

    if self.args.pos_emb:
        posint = L.flatten(L.ExpressionLayer(posl, lambda x: T.cast(x, 'int64')))   # (100, 15)
        pos_emb = L.EmbeddingLayer(posint, self.args.window_size, 8, name='epos' + suf,
                                   W=Normal(0.01) if not avg else Constant())   # (100, 15, 8)
        pos_emb.params[pos_emb.W].remove('regularizable')
        conv_in = L.concat([concat_emb, posl, pos_emb], axis=2)   # (100, 15, 256+1+8)

    # # squeeze
    # if self.args.squeeze:
    #     conv_in = L.DenseLayer(conv_in, num_units=self.args.squeeze, name='squeeze'+suf,
    #                            num_leading_axes=2, W=HeNormal('relu'))   # (100, 15, 256)

    conv_in = L.dimshuffle(conv_in, (0, 2, 1))   # (100, 256+1, 15)

    return conv_in
def build_model(n_input, n_hidden, optimizer=adagrad, l2_weight=1e-4, l1_weight=1e-2):
    '''Build an NN model for estimating the model function.'''
    global LR

    input_A = L.InputLayer((None, n_input), name='A')
    layer_A = L.DenseLayer(input_A, n_hidden, b=None, nonlinearity=identity)
    input_B = L.InputLayer((None, n_input), name='B')
    layer_B = L.DenseLayer(input_B, n_hidden, b=None, nonlinearity=identity)
    merge_layer = L.ElemwiseSumLayer((layer_A, layer_B))
    output_layer = L.DenseLayer(merge_layer, 1, b=None, nonlinearity=identity)   # output is scalar

    x1 = T.matrix('x1')
    x2 = T.matrix('x2')
    y = T.matrix('y')

    out = L.get_output(output_layer, {input_A: x1, input_B: x2})
    params = L.get_all_params(output_layer)
    loss = T.mean(squared_error(out, y))

    # add l1 penalty
    l1_penalty = regularize_layer_params([layer_A, layer_B, output_layer], l1)
    # add l2 penalty
    l2_penalty = regularize_layer_params([layer_A, layer_B, output_layer], l2)
    # get loss + penalties
    loss = loss + l1_penalty * l1_weight + l2_penalty * l2_weight

    updates_sgd = optimizer(loss, params, learning_rate=LR)
    updates = apply_momentum(updates_sgd, params, momentum=0.9)
    # updates = optimizer(loss, params, learning_rate=LR)

    f_train = theano.function([x1, x2, y], loss, updates=updates)
    f_test = theano.function([x1, x2, y], loss)
    f_out = theano.function([x1, x2], out)

    return f_train, f_test, f_out, output_layer
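# --- Hypothetical usage sketch (not part of the original code) -------------------------
# Calls the compiled functions returned by build_model() on random data; shapes follow the
# (None, n_input) input layers above, and LR is assumed to be set before build_model is called.
def example_train_additive_model(n_input=10, n_hidden=5, n_samples=32, n_epochs=100):
    import numpy as np
    import theano

    f_train, f_test, f_out, output_layer = build_model(n_input, n_hidden)

    X1 = np.random.rand(n_samples, n_input).astype(theano.config.floatX)
    X2 = np.random.rand(n_samples, n_input).astype(theano.config.floatX)
    Y = np.random.rand(n_samples, 1).astype(theano.config.floatX)

    for _ in range(n_epochs):
        f_train(X1, X2, Y)

    return f_test(X1, X2, Y), f_out(X1, X2)   # final loss and (n_samples, 1) predictions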
def _build_network(self, state_var, action_var):
    """Builds the critic network; inputs: (state, action), output: Q-value."""
    # States -> Hidden
    state_in = nn.InputLayer((None, ) + self.state_shape, state_var)
    states = nn.DenseLayer(state_in, 30, W_init, b_init, relu)
    states = nn.DenseLayer(states, 30, W_init, b_init, nonlinearity=None)
    # Actions -> Hidden
    action_in = nn.InputLayer((None, self.num_actions), action_var)
    actions = nn.DenseLayer(action_in, 30, W_init, b=None, nonlinearity=None)
    # States_h + Actions_h -> Output
    net = nn.ElemwiseSumLayer([states, actions])
    net = nn.NonlinearityLayer(net, relu)
    return nn.DenseLayer(net, 1, W_out, b_out, nonlinearity=None)
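# --- Hypothetical usage sketch (not part of the original code) -------------------------
# Compiles the critic into Q(s, a) and a squared-error training step against externally
# computed targets; assumes a flat state vector and that `agent` owns _build_network and
# the W_init/b_init/W_out/b_out initializers. The TD-style loss is an illustrative choice.
def example_compile_critic(agent):
    import theano
    import theano.tensor as T
    import lasagne
    import lasagne.layers as nn

    state_var = T.matrix('states')
    action_var = T.matrix('actions')
    target_var = T.matrix('q_targets')   # (batch, 1) bootstrapped targets

    q_layer = agent._build_network(state_var, action_var)
    q_values = nn.get_output(q_layer)

    loss = lasagne.objectives.squared_error(q_values, target_var).mean()
    params = nn.get_all_params(q_layer, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)

    train_fn = theano.function([state_var, action_var, target_var], loss, updates=updates)
    q_fn = theano.function([state_var, action_var], q_values)
    return train_fn, q_fn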
def model(self, query_input, batch_size, query_vocab_size, context_vocab_size, emb_dim_size):
    l_input = L.InputLayer(shape=(batch_size, ), input_var=query_input)
    l_embed_continuous = L.EmbeddingLayer(l_input, input_size=query_vocab_size, output_size=emb_dim_size)
    l_values_discrete = L.EmbeddingLayer(l_input, input_size=query_vocab_size, output_size=emb_dim_size)
    l_probabilities_discrete = L.NonlinearityLayer(l_values_discrete, nonlinearity=lasagne.nonlinearities.softmax)
    l_embed_discrete = StochasticLayer(l_probabilities_discrete, estimator='MF')
    l_merge = L.ElemwiseSumLayer([l_embed_continuous, l_embed_discrete])
    l_out = L.DenseLayer(l_merge, num_units=emb_dim_size, nonlinearity=lasagne.nonlinearities.softmax)
    l_merge_2 = L.ElemwiseMergeLayer([l_out, l_embed_discrete], merge_function=T.mul)
    l_final_out = L.DenseLayer(l_merge_2, num_units=context_vocab_size)
    return l_values_discrete, l_final_out
def build_segmenter_simple_absurd_res():
    sys.setrecursionlimit(1500)
    inp = ll.InputLayer(shape=(None, 1, None, None), name='input')
    n_layers = 64   # should get a 128 x 128 receptive field
    layers = [inp]
    for i in range(n_layers):
        # every 2 layers, add a skip connection
        layers.append(ll.Conv2DLayer(layers[-1], num_filters=8, filter_size=(3, 3), pad='same',
                                     W=Orthogonal(), nonlinearity=linear, name='conv%d' % (i + 1)))
        layers.append(ll.BatchNormLayer(layers[-1], name='bn%i' % (i + 1)))
        if (i % 2 == 0) and (i != 0):
            layers.append(ll.ElemwiseSumLayer([
                layers[-1],   # prev layer
                layers[-6],   # 3 actual layers per block, skip the previous block
            ]))
        layers.append(ll.NonlinearityLayer(layers[-1], nonlinearity=rectify))

    # our output layer is also convolutional; remember that our Y is going to be
    # the same exact size as the input
    conv_final = ll.Conv2DLayer(layers[-1], num_filters=2, filter_size=(3, 3), pad='same',
                                W=Orthogonal(), name='conv_final', nonlinearity=linear)
    # we need to reshape it to be a (batch*n*m x 3), i.e. unroll s.t. the feature dimension is preserved
    softmax = Softmax4D(conv_final, name='4dsoftmax')

    return [softmax]
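# --- Hypothetical usage sketch (not part of the original code) -------------------------
# Compiles the segmenter for inference, assuming Softmax4D yields per-pixel class
# probabilities of shape (batch, 2, H, W) at the input resolution.
def example_compile_segmenter():
    import theano
    import lasagne.layers as ll

    net = build_segmenter_simple_absurd_res()[0]
    input_layer = [l for l in ll.get_all_layers(net) if isinstance(l, ll.InputLayer)][0]

    probs = ll.get_output(net, deterministic=True)
    predict_fn = theano.function([input_layer.input_var], probs.argmax(axis=1))   # (batch, H, W) labels
    return predict_fn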
def build_autoencoder_network():
    input_var = T.tensor4('input_var')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(layers.Conv2DLayer(layer, 80, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 80, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 80, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 80, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    prely = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))

    featm = batch_norm(layers.Conv2DLayer(prely, 180, filter_size=(1, 1), nonlinearity=leaky_rectify))
    feat_map = batch_norm(layers.Conv2DLayer(featm, 120, filter_size=(1, 1), nonlinearity=rectify, name="feat_map"))
    maskm = batch_norm(layers.Conv2DLayer(prely, 120, filter_size=(1, 1), nonlinearity=leaky_rectify))
    mask_rep = batch_norm(layers.Conv2DLayer(maskm, 1, filter_size=(1, 1), nonlinearity=None), beta=None, gamma=None)
    mask_map = SoftThresPerc(mask_rep, perc=99.9, alpha=0.5, beta=init.Constant(0.5), tight=100.0, name="mask_map")
    layer = ChInnerProdMerge(feat_map, mask_map, name="encoder")

    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 80, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 80, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 80, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 80, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = layers.Deconv2DLayer(layer, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity)

    glblf = batch_norm(layers.Conv2DLayer(prely, 100, filter_size=(1, 1), nonlinearity=leaky_rectify))
    glblf = layers.Pool2DLayer(glblf, pool_size=(5, 5), stride=5, mode='average_inc_pad')
    glblf = batch_norm(layers.Conv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Conv2DLayer(glblf, 3, filter_size=(1, 1), nonlinearity=rectify), name="global_feature")
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(9, 9), stride=5, crop=(2, 2), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 48, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 48, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 48, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = layers.Deconv2DLayer(glblf, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity)

    layer = layers.ElemwiseSumLayer([layer, glblf])
    network = ReshapeLayer(layer, ([0], -1))
    layers.set_all_param_values(network, pickle.load(open(filename_model_ae, 'rb')))

    feat_var = lasagne.layers.get_output(feat_map, deterministic=True)
    mask_var = lasagne.layers.get_output(mask_map, deterministic=True)
    outp_var = lasagne.layers.get_output(network, deterministic=True)

    return network, input_var, feat_var, mask_var, outp_var
def build_autoencoder_network():
    input_var = T.tensor4('input_var')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 120, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 240, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 640, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    prely = batch_norm(layers.Conv2DLayer(layer, 1024, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))

    featm = batch_norm(layers.Conv2DLayer(prely, 640, filter_size=(1, 1), nonlinearity=leaky_rectify))
    feat_map = batch_norm(layers.Conv2DLayer(featm, 100, filter_size=(1, 1), nonlinearity=rectify, name="feat_map"))
    mask_map = SoftThresPerc(feat_map, perc=98.4, alpha=0.1, beta=init.Constant(0.5), tight=20.0, name="mask_map")
    layer = mask_map

    layer = batch_norm(layers.Deconv2DLayer(layer, 1024, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 240, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 120, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = layers.Deconv2DLayer(layer, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity)

    glblf = batch_norm(layers.Conv2DLayer(prely, 128, filter_size=(1, 1), nonlinearity=leaky_rectify))
    glblf = layers.Pool2DLayer(glblf, pool_size=(5, 5), stride=5, mode='average_inc_pad')
    glblf = batch_norm(layers.Conv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Conv2DLayer(glblf, 5, filter_size=(1, 1), nonlinearity=rectify), name="global_feature")
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 256, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(9, 9), stride=5, crop=(2, 2), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = layers.Deconv2DLayer(glblf, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity)

    layer = layers.ElemwiseSumLayer([layer, glblf])
    network = ReshapeLayer(layer, ([0], -1))

    mask_var = lasagne.layers.get_output(mask_map)
    output_var = lasagne.layers.get_output(network)

    return network, input_var, mask_var, output_var
def build_autoencoder_network():
    input_var = T.tensor4('input_var')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(layers.Conv2DLayer(layer, 80, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 80, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 80, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 80, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    prely = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))

    featm = batch_norm(layers.Conv2DLayer(prely, 180, filter_size=(1, 1), nonlinearity=leaky_rectify))
    feat_map = batch_norm(layers.Conv2DLayer(featm, 120, filter_size=(1, 1), nonlinearity=rectify, name="feat_map"))
    maskm = batch_norm(layers.Conv2DLayer(prely, 100, filter_size=(1, 1), nonlinearity=leaky_rectify))
    mask_rep = batch_norm(layers.Conv2DLayer(maskm, 1, filter_size=(1, 1), nonlinearity=None), beta=None, gamma=None)
    mask_map = SoftThresPerc(mask_rep, perc=90.0, alpha=0.5, beta=init.Constant(0.1), tight=100.0, name="mask_map")
    layer = ChInnerProdMerge(feat_map, mask_map, name="encoder")

    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 80, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 80, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 80, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 80, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = layers.Deconv2DLayer(layer, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity)

    glblf = batch_norm(layers.Conv2DLayer(prely, 100, filter_size=(1, 1), nonlinearity=leaky_rectify))
    glblf = layers.Pool2DLayer(glblf, pool_size=(20, 20), stride=20, mode='average_inc_pad')
    glblf = batch_norm(layers.Conv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Conv2DLayer(glblf, 3, filter_size=(1, 1), nonlinearity=rectify), name="global_feature")
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = layers.Upscale2DLayer(glblf, scale_factor=20)
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 48, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 48, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 48, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = layers.Deconv2DLayer(glblf, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity)

    layer = layers.ElemwiseSumLayer([layer, glblf])
    network = ReshapeLayer(layer, ([0], -1))

    mask_var = lasagne.layers.get_output(mask_map)
    output_var = lasagne.layers.get_output(network)

    return network, input_var, mask_var, output_var
def build_model(vocab_size, doc_var, qry_var, doc_mask_var, qry_mask_var, W_init=lasagne.init.Normal()):

    l_doc_in = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    l_qry_in = L.InputLayer(shape=(None, None, 1), input_var=qry_var)

    l_doc_embed = L.EmbeddingLayer(l_doc_in, vocab_size, EMBED_DIM, W=W_init)
    l_qry_embed = L.EmbeddingLayer(l_qry_in, vocab_size, EMBED_DIM, W=l_doc_embed.W)

    l_doc_mask = L.InputLayer(shape=(None, None), input_var=doc_mask_var)
    l_qry_mask = L.InputLayer(shape=(None, None), input_var=qry_mask_var)

    l_doc_fwd = L.LSTMLayer(l_doc_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_doc_mask,
                            gradient_steps=GRAD_STEPS, precompute_input=True)
    l_doc_bkd = L.LSTMLayer(l_doc_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_doc_mask,
                            gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True)
    l_qry_fwd = L.LSTMLayer(l_qry_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qry_mask,
                            gradient_steps=GRAD_STEPS, precompute_input=True)
    l_qry_bkd = L.LSTMLayer(l_qry_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qry_mask,
                            gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True)

    l_doc_fwd_slice = L.SliceLayer(l_doc_fwd, -1, 1)
    l_doc_bkd_slice = L.SliceLayer(l_doc_bkd, 0, 1)
    l_qry_fwd_slice = L.SliceLayer(l_qry_fwd, -1, 1)
    l_qry_bkd_slice = L.SliceLayer(l_qry_bkd, 0, 1)

    r = L.DenseLayer(L.ElemwiseSumLayer([l_doc_fwd_slice, l_doc_bkd_slice]),
                     num_units=NUM_HIDDEN, nonlinearity=lasagne.nonlinearities.tanh)
    u = L.DenseLayer(L.ElemwiseSumLayer([l_qry_fwd_slice, l_qry_bkd_slice]),
                     num_units=NUM_HIDDEN, nonlinearity=lasagne.nonlinearities.tanh)

    g = L.DenseLayer(L.concat([r, u], axis=1), num_units=EMBED_DIM,
                     W=lasagne.init.GlorotNormal(), nonlinearity=lasagne.nonlinearities.tanh)
    l_out = L.DenseLayer(g, num_units=vocab_size, W=l_doc_embed.W.T,
                         nonlinearity=lasagne.nonlinearities.softmax, b=None)

    return l_out
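# --- Hypothetical usage sketch (not part of the original code) -------------------------
# Compiles the reader above into a training step that also reports answer accuracy;
# `vocab_size` is assumed, and the loss/optimizer choices are illustrative.
def example_compile_doc_query_reader(vocab_size):
    import theano
    import theano.tensor as T
    import lasagne
    import lasagne.layers as L

    doc_var, qry_var = T.itensor3('doc'), T.itensor3('qry')
    doc_mask_var, qry_mask_var = T.matrix('doc_mask'), T.matrix('qry_mask')
    target_var = T.ivector('answer')   # index of the answer word

    l_out = build_model(vocab_size, doc_var, qry_var, doc_mask_var, qry_mask_var)

    probs = L.get_output(l_out)
    loss = lasagne.objectives.categorical_crossentropy(probs, target_var).mean()
    acc = T.mean(T.eq(probs.argmax(axis=-1), target_var))
    params = L.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.sgd(loss, params, learning_rate=0.1)

    train_fn = theano.function([doc_var, qry_var, doc_mask_var, qry_mask_var, target_var],
                               [loss, acc], updates=updates)
    return train_fn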
def build_network_from_ae(classn):
    input_var = T.tensor4('input_var')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(5,5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 120, filter_size=(5,5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2,2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 240, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 320, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2,2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 640, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    prely = batch_norm(layers.Conv2DLayer(layer, 1024, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))

    featm = batch_norm(layers.Conv2DLayer(prely, 640, filter_size=(1,1), nonlinearity=leaky_rectify))
    feat_map = batch_norm(layers.Conv2DLayer(featm, 100, filter_size=(1,1), nonlinearity=rectify, name="feat_map"))
    maskm = batch_norm(layers.Conv2DLayer(prely, 100, filter_size=(1,1), nonlinearity=leaky_rectify))
    mask_rep = batch_norm(layers.Conv2DLayer(maskm, 1, filter_size=(1,1), nonlinearity=None), beta=None, gamma=None)
    mask_map = SoftThresPerc(mask_rep, perc=0.0, alpha=0.1, beta=init.Constant(0.5), tight=100.0, bias=-10, name="mask_map")
    enlyr = ChInnerProdMerge(feat_map, mask_map, name="encoder")

    layer = batch_norm(layers.Deconv2DLayer(enlyr, 1024, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 240, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 120, filter_size=(5,5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(5,5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = layers.Deconv2DLayer(layer, 3, filter_size=(1,1), stride=1, crop='same', nonlinearity=identity)

    glblf = batch_norm(layers.Conv2DLayer(prely, 128, filter_size=(1,1), nonlinearity=leaky_rectify))
    glblf = layers.Pool2DLayer(glblf, pool_size=(5,5), stride=5, mode='average_inc_pad')
    glblf = batch_norm(layers.Conv2DLayer(glblf, 64, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    gllyr = batch_norm(layers.Conv2DLayer(glblf, 5, filter_size=(1,1), nonlinearity=rectify), name="global_feature")
    glblf = batch_norm(layers.Deconv2DLayer(gllyr, 256, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(9,9), stride=5, crop=(2,2), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = layers.Deconv2DLayer(glblf, 3, filter_size=(1,1), stride=1, crop='same', nonlinearity=identity)

    layer = layers.ElemwiseSumLayer([layer, glblf])
    network = ReshapeLayer(layer, ([0], -1))
    layers.set_all_param_values(network, pickle.load(open(filename_model_ae, 'rb')))
    mask_map.beta.set_value(np.float32(-10.0 * mask_map.beta.get_value()))

    # Adding more layers
    aug_var = T.matrix('aug_var')
    target_var = T.imatrix('targets')

    add_a = layers.Conv2DLayer(enlyr, 320, filter_size=(1,1), nonlinearity=leaky_rectify)
    add_b = layers.Conv2DLayer(add_a, 320, filter_size=(1,1), nonlinearity=leaky_rectify)
    add_c = layers.Conv2DLayer(add_b, 320, filter_size=(1,1), nonlinearity=leaky_rectify)
    add_d = layers.Conv2DLayer(add_c, 320, filter_size=(1,1), nonlinearity=leaky_rectify)
    add_0 = layers.Pool2DLayer(add_d, pool_size=(15,15), stride=15, mode='average_inc_pad')
    add_1 = layers.DenseLayer(add_0, 100, nonlinearity=leaky_rectify)

    add_2 = layers.DenseLayer(gllyr, 320, nonlinearity=leaky_rectify)
    add_3 = layers.DenseLayer(add_2, 320, nonlinearity=leaky_rectify)
    add_4 = layers.DenseLayer(add_3, 100, nonlinearity=leaky_rectify)

    aug_layer = layers.InputLayer(shape=(None, aug_fea_n), input_var=aug_var)

    cat_layer = lasagne.layers.ConcatLayer([add_1, add_4, aug_layer], axis=1)

    hidden_layer = layers.DenseLayer(cat_layer, 80, nonlinearity=leaky_rectify)
    network = layers.DenseLayer(hidden_layer, classn, nonlinearity=sigmoid)

    new_params = [add_a.W, add_a.b, add_b.W, add_b.b, add_c.W, add_c.b, add_d.W, add_d.b,
                  add_1.W, add_1.b, add_2.W, add_2.b, add_3.W, add_3.b, add_4.W, add_4.b,
                  hidden_layer.W, hidden_layer.b, network.W, network.b]

    return network, new_params, input_var, aug_var, target_var
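# --- Hypothetical usage sketch (not part of the original code) -------------------------
# Fine-tunes only the newly added classification head while the pretrained autoencoder
# weights stay fixed; the binary cross-entropy loss and optimizer settings are illustrative.
def example_compile_classifier(classn):
    import theano
    import lasagne
    from lasagne import layers

    network, new_params, input_var, aug_var, target_var = build_network_from_ae(classn)

    output = layers.get_output(network)
    loss = lasagne.objectives.binary_crossentropy(output, target_var).mean()
    updates = lasagne.updates.nesterov_momentum(loss, new_params, learning_rate=1e-3, momentum=0.9)

    train_fn = theano.function([input_var, aug_var, target_var], loss, updates=updates)
    test_output = layers.get_output(network, deterministic=True)
    pred_fn = theano.function([input_var, aug_var], test_output)
    return train_fn, pred_fn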
def build_network_from_ae(classn):
    input_var = T.tensor4('input_var')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(5,5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 120, filter_size=(5,5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2,2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 240, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 320, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2,2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 640, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    prely = batch_norm(layers.Conv2DLayer(layer, 1024, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))

    featm = batch_norm(layers.Conv2DLayer(prely, 640, filter_size=(1,1), nonlinearity=leaky_rectify))
    feat_map = batch_norm(layers.Conv2DLayer(featm, 100, filter_size=(1,1), nonlinearity=rectify, name="feat_map"))
    maskm = batch_norm(layers.Conv2DLayer(prely, 100, filter_size=(1,1), nonlinearity=leaky_rectify))
    mask_rep = batch_norm(layers.Conv2DLayer(maskm, 1, filter_size=(1,1), nonlinearity=None), beta=None, gamma=None)
    mask_map = SoftThresPerc(mask_rep, perc=98.4, alpha=0.1, beta=init.Constant(0.5), tight=100.0, name="mask_map")
    enlyr = ChInnerProdMerge(feat_map, mask_map, name="encoder")

    layer = batch_norm(layers.Deconv2DLayer(enlyr, 1024, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 240, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 120, filter_size=(5,5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(5,5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = layers.Deconv2DLayer(layer, 3, filter_size=(1,1), stride=1, crop='same', nonlinearity=identity)

    glblf = batch_norm(layers.Conv2DLayer(prely, 128, filter_size=(1,1), nonlinearity=leaky_rectify))
    glblf = layers.Pool2DLayer(glblf, pool_size=(5,5), stride=5, mode='average_inc_pad')
    glblf = batch_norm(layers.Conv2DLayer(glblf, 64, filter_size=(3,3), stride=1, pad='same', nonlinearity=leaky_rectify))
    gllyr = batch_norm(layers.Conv2DLayer(glblf, 5, filter_size=(1,1), nonlinearity=rectify), name="global_feature")
    glblf = batch_norm(layers.Deconv2DLayer(gllyr, 256, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(9,9), stride=5, crop=(2,2), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(4,4), stride=2, crop=(1,1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3,3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = layers.Deconv2DLayer(glblf, 3, filter_size=(1,1), stride=1, crop='same', nonlinearity=identity)

    layer = layers.ElemwiseSumLayer([layer, glblf])
    network = ReshapeLayer(layer, ([0], -1))
    layers.set_all_param_values(network, pickle.load(open(filename_model_ae, 'rb')))
    old_params = layers.get_all_params(network, trainable=True)

    # Adding more layers
    aug_var = T.matrix('aug_var')
    target_var = T.imatrix('targets')

    add_a = batch_norm(layers.Conv2DLayer(enlyr, 320, filter_size=(1,1), nonlinearity=leaky_rectify))
    add_b = batch_norm(layers.Conv2DLayer(add_a, 320, filter_size=(1,1), nonlinearity=leaky_rectify))
    add_c = batch_norm(layers.Conv2DLayer(add_b, 320, filter_size=(1,1), nonlinearity=leaky_rectify))
    add_d = batch_norm(layers.Conv2DLayer(add_c, 320, filter_size=(1,1), nonlinearity=leaky_rectify))
    add_0 = layers.Pool2DLayer(add_d, pool_size=(25,25), stride=25, mode='average_inc_pad')
    add_1 = batch_norm(layers.DenseLayer(add_0, 100, nonlinearity=leaky_rectify))

    add_2 = batch_norm(layers.DenseLayer(gllyr, 320, nonlinearity=leaky_rectify))
    add_3 = batch_norm(layers.DenseLayer(add_2, 320, nonlinearity=leaky_rectify))
    add_4 = batch_norm(layers.DenseLayer(add_3, 100, nonlinearity=leaky_rectify))

    aug_layer = layers.InputLayer(shape=(None, aug_fea_n), input_var=aug_var)

    cat_layer = lasagne.layers.ConcatLayer([add_1, add_4, aug_layer], axis=1)

    hidden_layer = layers.DenseLayer(cat_layer, 80, nonlinearity=leaky_rectify)
    network = layers.DenseLayer(hidden_layer, classn, nonlinearity=sigmoid)
    layers.set_all_param_values(network, pickle.load(open('model_vals/deep_conv_classification_alt48_luad10_skcm10_lr0.py_e32_cv0.pkl', 'rb')))

    all_params = layers.get_all_params(network, trainable=True)
    new_params = [x for x in all_params if x not in old_params]

    return network, new_params, input_var, aug_var, target_var
def build_1Dregression_v1(input_var=None, input_width=None, nin_units=12,
                          h_num_units=[64, 64], h_grad_clip=1.0, output_width=1):
    """
    A stacked bidirectional RNN network for regression, alternating
    with dense layers and merging of the two directions, followed by
    a feature mean pooling in the time direction, with a linear
    dim-reduction layer at the start.

    Args:
        input_var (theano 3-tensor): minibatch of input sequence vectors
        input_width (int): length of input sequences
        nin_units (int): number of NIN features
        h_num_units (int list): no. of units in the hidden layer of each stack, from bottom to top
        h_grad_clip (float): gradient clipping maximum value
        output_width (int): size of output layer (e.g. =1 for 1D regression)

    Returns:
        output layer (Lasagne layer object)
    """
    # Non-linearity hyperparameter
    nonlin = lasagne.nonlinearities.LeakyRectify(leakiness=0.15)

    # Input layer
    l_in = LL.InputLayer(shape=(None, 22, input_width), input_var=input_var)
    batchsize = l_in.input_var.shape[0]

    # NIN-layer
    l_in = LL.NINLayer(l_in, num_units=nin_units, nonlinearity=lasagne.nonlinearities.linear)

    l_in_1 = LL.DimshuffleLayer(l_in, (0, 2, 1))

    # RNN layers
    for h in h_num_units:
        # Forward layers
        l_forward_0 = LL.RecurrentLayer(l_in_1, nonlinearity=nonlin, num_units=h,
                                        backwards=False, learn_init=True,
                                        grad_clipping=h_grad_clip, unroll_scan=True,
                                        precompute_input=True)
        l_forward_0a = LL.ReshapeLayer(l_forward_0, (-1, h))
        l_forward_0b = LL.DenseLayer(l_forward_0a, num_units=h, nonlinearity=nonlin)
        l_forward_0c = LL.ReshapeLayer(l_forward_0b, (batchsize, input_width, h))

        # Backward layers
        l_backward_0 = LL.RecurrentLayer(l_in_1, nonlinearity=nonlin, num_units=h,
                                         backwards=True, learn_init=True,
                                         grad_clipping=h_grad_clip, unroll_scan=True,
                                         precompute_input=True)
        l_backward_0a = LL.ReshapeLayer(l_backward_0, (-1, h))
        l_backward_0b = LL.DenseLayer(l_backward_0a, num_units=h, nonlinearity=nonlin)
        l_backward_0c = LL.ReshapeLayer(l_backward_0b, (batchsize, input_width, h))

        l_in_1 = LL.ElemwiseSumLayer([l_forward_0c, l_backward_0c])

    # Output layers
    network_0a = LL.ReshapeLayer(l_in_1, (-1, h_num_units[-1]))
    network_0b = LL.DenseLayer(network_0a, num_units=output_width, nonlinearity=nonlin)
    network_0c = LL.ReshapeLayer(network_0b, (batchsize, input_width, output_width))

    output_net_1 = LL.FlattenLayer(network_0c, outdim=2)
    output_net_2 = LL.FeaturePoolLayer(output_net_1, pool_size=input_width, pool_function=T.mean)

    return output_net_2
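# --- Hypothetical usage sketch (not part of the original code) -------------------------
# Compiles the regression network; input_width must be a concrete value because the
# recurrent layers use unroll_scan=True. The squared-error loss and adadelta optimizer
# are illustrative assumptions.
def example_compile_1d_regression(input_width=128):
    import theano
    import theano.tensor as T
    import lasagne
    import lasagne.layers as LL

    input_var = T.tensor3('input')    # (batch, 22, input_width)
    target_var = T.matrix('target')   # (batch, 1)

    network = build_1Dregression_v1(input_var=input_var, input_width=input_width)

    prediction = LL.get_output(network)
    loss = lasagne.objectives.squared_error(prediction, target_var).mean()
    params = LL.get_all_params(network, trainable=True)
    updates = lasagne.updates.adadelta(loss, params)

    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    return train_fn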
def run_experiment(args):
    import os
    # set environment variables for theano
    os.environ['THEANO_FLAGS'] = "lib.cnmem=" + str(args.mem) + ",device=gpu" + str(args.gpu)

    import threading
    import Queue
    import inspect
    import shutil
    import time
    import logging
    import six
    import collections
    import itertools
    import random

    import numpy as np
    import scipy
    import theano
    import theano.tensor as T
    import lasagne
    import lasagne.layers as ll
    import lasagne.nonlinearities as ln
    import parmesan

    import layers
    import utils
    import cfdataset

    #----------------------------------------------------------------
    # Arguments and Settings
    floatX = theano.config.floatX
    logger = logging.getLogger()
    np.random.seed(args.seed)

    # copy file for reproducibility
    dirname = utils.setup_logging(args.message, args.loglv)
    script_src = os.path.abspath(inspect.getfile(inspect.currentframe()))
    script_dst = os.path.join(dirname, os.path.split(script_src)[1])
    shutil.copyfile(script_src, script_dst)

    # print arguments
    args_dict = collections.OrderedDict(sorted(vars(args).items()))
    for k, v in six.iteritems(args_dict):
        logger.info(" %20s: %s" % (k, v))

    # get arguments
    D_u, D_v = args.D_u, args.D_v
    lr = args.lr
    weight_decay = args.weight_decay
    lookahead = args.lookahead
    max_epoch = args.max_epoch
    batch_size_u, batch_size_v = args.batch_size_u, args.batch_size_v
    nonlin_enc = layers.get_nonlin(args.nonlin_enc)
    nonlin_dec = layers.get_nonlin(args.nonlin_dec)
    negative_ratio = args.negative_ratio

    #----------------------------------------------------------------
    # Dataset
    dataset = cfdataset.CF_implicit_data(name=args.dataset)

    N_u, N_v = dataset.N_users, dataset.N_items
    T_matrix = dataset.T_matrix.astype(floatX)
    R_matrix = dataset.R_matrix.astype(floatX)
    R_negative_matrix = 1 - R_matrix

    assert np.all(R_matrix == (T_matrix > 0.5))
    assert np.all((R_negative_matrix == 1) == (T_matrix == 0))

    R_test = dataset.R_latest
    T_matrix[np.arange(N_u), R_test] = 0
    R_matrix[np.arange(N_u), R_test] = 0
    assert np.all(R_matrix == (T_matrix > 0.5))
    R_matrix_for_test = R_matrix.copy()

    R_valid = dataset.R_2nd_latest
    T_matrix[np.arange(N_u), R_valid] = 0
    R_matrix[np.arange(N_u), R_valid] = 0
    assert np.all(R_matrix == (T_matrix > 0.5))

    N_interaction = dataset.N_interaction - N_u * 2

    assert np.all(R_valid != R_test)
    assert np.all(R_matrix_for_test[np.arange(N_u), R_valid] == 1)
    assert np.all(R_matrix_for_test[np.arange(N_u), R_test] == 0)
    assert np.all(R_matrix[np.arange(N_u), R_valid] == 0)
    assert np.all(R_matrix[np.arange(N_u), R_test] == 0)
    assert np.all(T_matrix[np.arange(N_u), R_valid] == 0)
    assert np.all(T_matrix[np.arange(N_u), R_test] == 0)
    assert N_interaction == np.count_nonzero(R_matrix)
    assert N_interaction + N_u == np.count_nonzero(R_matrix_for_test)

    logger.info("%d users, %d items, %d training interactions (%d total, 2 * %d held out for validation and test)."
                % (N_u, N_v, N_interaction, dataset.N_interaction, N_u))

    #----------------------------------------------------------------
    # numpy variables

    # encoded vectors
    np_enc_u_h = np.zeros((N_u, D_u), dtype=floatX)
    np_enc_v_h = np.zeros((N_v, D_v), dtype=floatX)

    #----------------------------------------------------------------
    # Symbolic variables
    sym_lr = T.fscalar('lr')

    sym_Ru_pos = T.fmatrix('Ru_pos')
    sym_dr_Ru_pos = T.fscalar('dr_Ru_pos')
    sym_uid_origin_pos = T.ivector('uid_origin_pos')
    sym_uid_minibatch_pos = T.ivector('uid_minibatch_pos')

    sym_Ru_neg = T.fmatrix('Ru_neg')
    sym_dr_Ru_neg = T.fscalar('dr_Ru_neg')
    sym_uid_origin_neg = T.ivector('uid_origin_neg')
    sym_uid_minibatch_neg = T.ivector('uid_minibatch_neg')

    sym_Rv = T.fmatrix('Rv')
    sym_dr_Rv = T.fscalar('dr_Rv')
    sym_vid_origin_pos = T.ivector('vid_origin_pos')
    sym_vid_minibatch_pos = T.ivector('vid_minibatch_pos')
    sym_vid_origin_neg = T.ivector('vid_origin_neg')
    sym_vid_minibatch_neg = T.ivector('vid_minibatch_neg')

    sym_R_minibatch = T.fvector('R_minibatch')

    #----------------------------------------------------------------
    # Model setup (training model)
    logger.info("Setting up model ...")

    # Input layers
    l_in_Ru_pos = ll.InputLayer((None, N_v), input_var=sym_Ru_pos, name='l_in_Ru_pos')
    l_in_uid_origin_pos = ll.InputLayer((None,), input_var=sym_uid_origin_pos, name='l_in_uid_origin_pos')
    l_in_uid_minibatch_pos = ll.InputLayer((None,), input_var=sym_uid_minibatch_pos, name='l_in_uid_minibatch_pos')
    l_in_Ru_neg = ll.InputLayer((None, N_v), input_var=sym_Ru_neg, name='l_in_Ru_neg')
    l_in_uid_origin_neg = ll.InputLayer((None,), input_var=sym_uid_origin_neg, name='l_in_uid_origin_neg')
    l_in_uid_minibatch_neg = ll.InputLayer((None,), input_var=sym_uid_minibatch_neg, name='l_in_uid_minibatch_neg')
    l_in_Rv = ll.InputLayer((None, N_u), input_var=sym_Rv, name='l_in_Rv')
    l_in_vid_origin_pos = ll.InputLayer((None,), input_var=sym_vid_origin_pos, name='l_in_vid_origin_pos')
    l_in_vid_minibatch_pos = ll.InputLayer((None,), input_var=sym_vid_minibatch_pos, name='l_in_vid_minibatch_pos')
    l_in_vid_origin_neg = ll.InputLayer((None,), input_var=sym_vid_origin_neg, name='l_in_vid_origin_neg')
    l_in_vid_minibatch_neg = ll.InputLayer((None,), input_var=sym_vid_minibatch_neg, name='l_in_vid_minibatch_neg')

    # Dropout layers
    l_in_Ru_pos = ll.DropoutLayer(l_in_Ru_pos, p=sym_dr_Ru_pos, rescale=False, name='Dropout-l_in_Ru_pos')
    l_in_Ru_neg = ll.DropoutLayer(l_in_Ru_neg, p=sym_dr_Ru_neg, rescale=False, name='Dropout-l_in_Ru_neg')
    l_in_Rv = ll.DropoutLayer(l_in_Rv, p=sym_dr_Rv, rescale=False, name='Dropout-l_in_Rv')

    # User encoder model h(Ru)
    l_enc_u_h_pos = ll.DenseLayer(l_in_Ru_pos, num_units=D_u, nonlinearity=nonlin_enc, name='l_enc_u_h_pos')
    l_enc_u_h_neg = ll.DenseLayer(l_in_Ru_neg, num_units=D_u, nonlinearity=nonlin_enc,
                                  W=l_enc_u_h_pos.W, b=l_enc_u_h_pos.b, name='l_enc_u_h_neg')

    # Item encoder model h(Rv)
    l_enc_v_h = ll.DenseLayer(l_in_Rv, num_units=D_v, nonlinearity=nonlin_enc, name='l_enc_v_h')

    # User decoder model s(h(Ru))
    l_dec_u_s_pos = layers.SimpleDecodeLayer([l_enc_u_h_pos, l_in_vid_origin_pos, l_in_uid_minibatch_pos],
                                             num_units=N_v, nonlinearity=None, name='l_dec_u_s_pos')
    l_dec_u_s_neg = layers.SimpleDecodeLayer([l_enc_u_h_neg, l_in_vid_origin_neg, l_in_uid_minibatch_neg],
                                             num_units=N_v, V=l_dec_u_s_pos.V, Q=l_dec_u_s_pos.Q,
                                             b=l_dec_u_s_pos.b, nonlinearity=None, name='l_dec_u_s_neg')
    l_dec_u_s_all = ll.ConcatLayer([l_dec_u_s_pos, l_dec_u_s_neg], axis=0)

    # Item decoder model s(h(Rv))
    l_dec_v_s_pos = layers.SimpleDecodeLayer([l_enc_v_h, l_in_uid_origin_pos, l_in_vid_minibatch_pos],
                                             num_units=N_u, nonlinearity=None, name='l_dec_v_s_pos')
    l_dec_v_s_neg = layers.SimpleDecodeLayer([l_enc_v_h, l_in_uid_origin_neg, l_in_vid_minibatch_neg],
                                             num_units=N_u, V=l_dec_v_s_pos.V, Q=l_dec_v_s_pos.Q,
                                             b=l_dec_v_s_pos.b, nonlinearity=None, name='l_dec_v_s_neg')
    l_dec_v_s_all = ll.ConcatLayer([l_dec_v_s_pos, l_dec_v_s_neg], axis=0)

    # Likelihood model p(R)
    l_uv_s_train = ll.ElemwiseSumLayer([l_dec_u_s_all, l_dec_v_s_all], name='l_uv_s_train')
    l_r_train = ll.NonlinearityLayer(l_uv_s_train, nonlinearity=ln.sigmoid, name='l_r_train')
    l_uv_s_test = ll.ElemwiseSumLayer([l_dec_u_s_pos, l_dec_v_s_pos], name='l_uv_s_test')
    l_r_test = ll.NonlinearityLayer(l_uv_s_test, nonlinearity=ln.sigmoid, name='l_r_test')

    #----------------------------------------------------------------
    # Likelihood and RMSE
    # training
    p_r_train, = ll.get_output([l_r_train], deterministic=False)
    log_p_r = T.mean(parmesan.distributions.log_bernoulli(sym_R_minibatch, p_r_train, eps=1e-6))
    regularization = lasagne.regularization.regularize_network_params([l_r_train], lasagne.regularization.l2)
    cost_function = - log_p_r + weight_decay * regularization
    SE_train = T.sum(T.sqr(sym_R_minibatch - p_r_train))

    # test
    sym_enc_u_h = T.fmatrix('enc_u_h')
    sym_enc_v_h = T.fmatrix('enc_v_h')
    enc_u_h_out, enc_v_h_out = ll.get_output([l_enc_u_h_pos, l_enc_v_h], deterministic=True)
    p_r_test, = ll.get_output([l_r_test], inputs={l_enc_u_h_pos: sym_enc_u_h, l_enc_v_h: sym_enc_v_h},
                              deterministic=True)
    test_scores = p_r_test.reshape((-1, 101))
    ranking = test_scores.argsort()[:, ::-1].argmin(axis=1)

    #----------------------------------------------------------------
    # Gradients
    clip_grad = 1
    max_norm = 5

    params = ll.get_all_params([l_r_train, ], trainable=True)
    for p in params:
        logger.debug("%s: %s" % (p, p.get_value().shape))

    grads = T.grad(cost_function, params)
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    #updates = lasagne.updates.adam(cgrads, params, beta1=0.9, beta2=0.999, epsilon=1e-4, learning_rate=sym_lr)
    updates, sym_vars_list = utils.adam(cgrads, params, beta1=0.9, beta2=0.999, epsilon=1e-4, learning_rate=sym_lr)

    # moving average
    params_avg = []
    for param in params:
        value = param.get_value(borrow=True)
        params_avg.append(theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                        broadcastable=param.broadcastable,
                                        name=param.name + '_avg'))
    avg_updates = [(a, a + 0.01 * (p - a)) for p, a in zip(params, params_avg)]
    avg_givens = [(p, a) for p, a in zip(params, params_avg)]
    all_updates = updates.items() + avg_updates

    #----------------------------------------------------------------
    # Compile
    # training function
    logger.info("Compiling train_model ...")
    train_model = theano.function(
        inputs=[sym_lr,
                sym_uid_origin_pos, sym_uid_minibatch_pos, sym_vid_origin_pos, sym_vid_minibatch_pos,
                sym_uid_origin_neg, sym_uid_minibatch_neg, sym_vid_origin_neg, sym_vid_minibatch_neg,
                sym_Ru_pos, sym_Ru_neg, sym_Rv, sym_R_minibatch,
                sym_dr_Ru_pos, sym_dr_Ru_neg, sym_dr_Rv],
        outputs=[log_p_r, SE_train],
        updates=all_updates,
    )

    # encoders
    logger.info("Compiling encode_model ...")
    u_encode_model = theano.function(inputs=[sym_Ru_pos], outputs=enc_u_h_out)
    v_encode_model = theano.function(inputs=[sym_Rv], outputs=enc_v_h_out)
    u_encode_avg_model = theano.function(inputs=[sym_Ru_pos], outputs=enc_u_h_out,
                                         givens=avg_givens, on_unused_input='ignore')
    v_encode_avg_model = theano.function(inputs=[sym_Rv], outputs=enc_v_h_out,
                                         givens=avg_givens, on_unused_input='ignore')

    # test function
    logger.info("Compiling test_model ...")
    test_model = theano.function(
        inputs=[sym_uid_origin_pos, sym_uid_minibatch_pos, sym_vid_origin_pos, sym_vid_minibatch_pos,
                sym_enc_u_h, sym_enc_v_h],
        outputs=[ranking],
    )
    test_avg_model = theano.function(
        inputs=[sym_uid_origin_pos, sym_uid_minibatch_pos, sym_vid_origin_pos, sym_vid_minibatch_pos,
                sym_enc_u_h, sym_enc_v_h],
        outputs=[ranking],
        givens=avg_givens,
        on_unused_input='ignore',
    )

    #----------------------------------------------------------------
    # Predict function
    def compute_hidden_for(for_which_set='test', avg_model=False):
        assert for_which_set in ['valid', 'test']
        if for_which_set == 'valid':
            R_matrix_cond = R_matrix
        else:
            R_matrix_cond = R_matrix_for_test

        # precompute hidden representation
        u_end = 0
        while u_end < N_u:
            u_start, u_end = u_end, min(u_end + batch_size_u, N_u)
            # create user mini-batch
            u_batch_ids = np.arange(u_start, u_end).astype('int32')
            # create conditionals
            Ru_minibatch = R_matrix_cond[u_batch_ids, :]
            # encode
            if avg_model:
                np_enc_u_h[u_batch_ids] = u_encode_avg_model(Ru_minibatch)
            else:
                np_enc_u_h[u_batch_ids] = u_encode_model(Ru_minibatch)

        v_end = 0
        while v_end < N_v:
            v_start, v_end = v_end, min(v_end + batch_size_v, N_v)
            # create item mini-batch
            v_batch_ids = np.arange(v_start, v_end).astype('int32')
            # create conditionals
            Rv_minibatch = R_matrix_cond[:, v_batch_ids].T
            # encode
            if avg_model:
                np_enc_v_h[v_batch_ids] = v_encode_avg_model(Rv_minibatch)
            else:
                np_enc_v_h[v_batch_ids] = v_encode_model(Rv_minibatch)

    def predict_once(which_set='test', avg_model=False):
        assert which_set in ['valid', 'test']
        if which_set == 'valid':
            R_predict = R_valid
        else:
            R_predict = R_test

        # test statistics
        rankings = []

        # loop users
        u_end = 0
        while u_end < N_u:
            u_start, u_end = u_end, min(u_end + batch_size_u, N_u)
            # create user mini-batch and item mini-batch
            u_batch_ids = np.arange(u_start, u_end).astype('int32')
            vid_negative = np.asarray([np.random.choice(np.where(row)[0], 100, replace=False)
                                       for row in R_negative_matrix[u_batch_ids]], dtype='int32')
            vid = np.concatenate([R_predict[u_batch_ids].reshape(-1, 1), vid_negative], axis=1).flatten()
            uid_origin = np.repeat(u_batch_ids, 101)
            uid_minibatch = uid_origin - u_start
            # get encoded vectors
            Ru_encoded = np_enc_u_h[u_batch_ids]
            if avg_model:
                rankings_minibatch, = test_avg_model(uid_origin, uid_minibatch, vid, vid, Ru_encoded, np_enc_v_h)
            else:
                rankings_minibatch, = test_model(uid_origin, uid_minibatch, vid, vid, Ru_encoded, np_enc_v_h)
            rankings.append(rankings_minibatch)

        rankings = np.concatenate(rankings)
        HR = np.mean(rankings < 10)
        NDCG = np.mean((rankings < 10) / np.log2(rankings + 2))
        return HR, NDCG

    def predict(which_set='test', avg=10, avg_model=False):
        compute_hidden_for(for_which_set=which_set, avg_model=avg_model)
        HR_list = []
        NDCG_list = []
        for i in range(avg):
            hr, ndcg = predict_once(which_set=which_set, avg_model=avg_model)
            HR_list.append(hr)
            NDCG_list.append(ndcg)
        HR_mean = np.mean(HR_list)
        NDCG_mean = np.mean(NDCG_list)
        HR_std = np.std(HR_list)
        NDCG_std = np.std(NDCG_list)
        # print info after test finished
        eval_msg = which_set if not avg_model else which_set + ' (avg model)'
        logger.critical("%-20s HR = %.3f +- %.3f, NDCG = %.3f +- %.3f."
                        % (eval_msg, HR_mean, HR_std, NDCG_mean, NDCG_std))
        return HR_mean, NDCG_mean

    #----------------------------------------------------------------
    # Training
    best_valid_result = - np.inf
    best_model = None
    best_auxiliary = None
    n_epocs_without_improvement = 0

    minibatch_queue = Queue.Queue(maxsize=10)

    # function for preparing minibatches
    def prepare_minibatch(minibatch_list):
        # loop mini-batches
        for u_batch_ids, v_batch_ids in minibatch_list:
            Rv_minibatch = R_matrix[:, v_batch_ids].T
            Rv_minibatch[:, u_batch_ids] = 0
            Ru_minibatch_neg = R_matrix[u_batch_ids, :]
            #Ru_minibatch_neg[:,v_batch_ids] = 0

            # create training samples mini-batch
            T_matrix_minibatch = T_matrix[np.ix_(u_batch_ids, v_batch_ids)]
            T_matrix_minibatch_sparse = scipy.sparse.coo_matrix(T_matrix_minibatch)
            n_interactions_minibatch = T_matrix_minibatch_sparse.count_nonzero()
            Ru_minibatch_pos = ((T_matrix[u_batch_ids[T_matrix_minibatch_sparse.row]] <
                                 T_matrix_minibatch_sparse.data.reshape(n_interactions_minibatch, 1)) &
                                (T_matrix[u_batch_ids[T_matrix_minibatch_sparse.row]] > 0)).astype(floatX)
            uid_minibatch_pos = np.arange(n_interactions_minibatch).astype('int32')
            uid_origin_pos = u_batch_ids[T_matrix_minibatch_sparse.row]
            vid_minibatch_pos = T_matrix_minibatch_sparse.col
            vid_origin_pos = v_batch_ids[vid_minibatch_pos]

            R_matrix_negative_minibatch = 1 - R_matrix[np.ix_(u_batch_ids, v_batch_ids)]
            R_matrix_negative_minibatch_sparse = scipy.sparse.coo_matrix(R_matrix_negative_minibatch)
            n_negative_total = R_matrix_negative_minibatch_sparse.count_nonzero()
            assert n_negative_total + n_interactions_minibatch == u_batch_ids.size * v_batch_ids.size
            choice_negative = np.random.choice(n_negative_total,
                                               min(n_negative_total, np.int(n_interactions_minibatch * negative_ratio)),
                                               replace=False)
            uid_minibatch_neg = R_matrix_negative_minibatch_sparse.row[choice_negative]
            uid_origin_neg = u_batch_ids[uid_minibatch_neg]
            vid_minibatch_neg = R_matrix_negative_minibatch_sparse.col[choice_negative]
            vid_origin_neg = v_batch_ids[vid_minibatch_neg]

            R_minibatch = np.concatenate([np.ones_like(T_matrix_minibatch_sparse.data),
                                          R_matrix_negative_minibatch_sparse.data[choice_negative] * 0])
            n_pred_step = R_minibatch.shape[0]
            if n_pred_step == 0:
                raise ValueError('No interactions in this minibatch.')

            dr_Ru_pos = min(max(1 - 2 * np.random.rand(), 0), 0.8)
            dr_Ru_neg = 0.2
            dr_Rv = min(max(1 - 2 * np.random.rand(), 0), 0.8)

            # package everything into a tuple
            data_minibatch_package = (
                uid_origin_pos, uid_minibatch_pos, vid_origin_pos, vid_minibatch_pos,
                uid_origin_neg, uid_minibatch_neg, vid_origin_neg, vid_minibatch_neg,
                Ru_minibatch_pos, Ru_minibatch_neg, Rv_minibatch, R_minibatch,
                dr_Ru_pos, dr_Ru_neg, dr_Rv)

            # enqueue
            minibatch_queue.put((n_pred_step, data_minibatch_package))

    logger.warning("Training started.")
    # loop epoch
    for epoch in range(1, 1 + max_epoch):
        epoch_start_time = time.time()

        # training statistics
        LL_epoch, SE_epoch = 0, 0
        n_pred_epoch = 0

        u_order = np.array_split(np.random.permutation(N_u).astype('int32'), N_u // batch_size_u + 1)
        v_order = np.array_split(np.random.permutation(N_v).astype('int32'), N_v // batch_size_v + 1)
        minibatch_order = list(itertools.product(u_order, v_order))
        random.shuffle(minibatch_order)

        n_threads = 5
        n_minibatch_thread = len(minibatch_order) // n_threads + 1
        for t in range(n_threads):
            thr = threading.Thread(target=prepare_minibatch,
                                   args=(minibatch_order[t*n_minibatch_thread:(t+1)*n_minibatch_thread],))
            thr.setDaemon(True)
            thr.start()

        for step in range(len(minibatch_order)):
            n_pred_step, data_minibatch_package = minibatch_queue.get()
# update parameters and calculate likelihood and RMSE LL_step, SE_step = train_model(lr, *data_minibatch_package) minibatch_queue.task_done() LL_epoch += LL_step * n_pred_step SE_epoch += SE_step n_pred_epoch += n_pred_step assert minibatch_queue.qsize() == 0 # print info after epoch finished LL_epoch /= n_pred_epoch RMSE_epoch = np.sqrt(SE_epoch/n_pred_epoch) epoch_end_time = time.time() logger.info("Epoch %d, training RMSE = %f, LL = %f (%d training ratings). Elapsed time %.1fs." % (epoch, RMSE_epoch, LL_epoch, n_pred_epoch, epoch_end_time-epoch_start_time)) # validation HR_valid, NDCG_valid = predict('valid') HR_test, NDCG_test = predict('test') HR_test, NDCG_test = predict('test', avg_model=True) # termination #if NDCG_valid > best_valid_result: if HR_valid > best_valid_result: n_epocs_without_improvement = 0 #best_valid_result = NDCG_valid best_valid_result = HR_valid best_model = ll.get_all_param_values([l_r_train,], trainable=True) best_auxiliary = utils.get_all_shvar_values(sym_vars_list) logger.debug("New best model found!") else: n_epocs_without_improvement += 1 if n_epocs_without_improvement >= lookahead: ll.set_all_param_values([l_r_train,], best_model, trainable=True) utils.set_all_shvar_values(sym_vars_list, best_auxiliary) if lr > 1e-5: n_epocs_without_improvement = 0 lr /= 4 logger.error("Learning rate = %f now." % lr) else: logger.error("Training finished.") break #---------------------------------------------------------------- # Test HR_test, NDCG_test = predict('test') HR_test, NDCG_test = predict('test', avg_model=True) #---------------------------------------------------------------- # Summarization for k, v in six.iteritems(args_dict): logger.info(" %20s: %s" % (k, v))
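# A minimal standalone sketch (not part of the training script above) of the
# Polyak-style parameter averaging it uses: every shared parameter gets a shadow
# copy that tracks it with rate 0.01, and evaluation-time functions swap the
# averages in through `givens`. Names here are illustrative only.
import numpy as np
import theano

def make_averaged_copies(params, rate=0.01):
    avgs = [theano.shared(np.zeros_like(p.get_value()),
                          broadcastable=p.broadcastable,
                          name=(p.name or 'param') + '_avg') for p in params]
    avg_updates = [(a, a + rate * (p - a)) for p, a in zip(params, avgs)]
    avg_givens = [(p, a) for p, a in zip(params, avgs)]
    return avgs, avg_updates, avg_givens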
def build_network_from_ae(classn): input_var = T.tensor4('input_var') layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var) layer = batch_norm( layers.Conv2DLayer(layer, 100, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 120, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify)) layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=2, mode='average_inc_pad') layer = batch_norm( layers.Conv2DLayer(layer, 240, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify)) layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=2, mode='average_inc_pad') layer = batch_norm( layers.Conv2DLayer(layer, 640, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify)) prely = batch_norm( layers.Conv2DLayer(layer, 1024, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify)) featm = batch_norm( layers.Conv2DLayer(prely, 640, filter_size=(1, 1), nonlinearity=leaky_rectify)) feat_map = batch_norm( layers.Conv2DLayer(featm, 100, filter_size=(1, 1), nonlinearity=rectify, name="feat_map")) mask_map = feat_map enlyr = feat_map layer = batch_norm( layers.Deconv2DLayer(enlyr, 1024, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 640, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 640, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 320, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 320, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 240, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 120, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 100, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = layers.Deconv2DLayer(layer, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity) glblf = batch_norm( layers.Conv2DLayer(prely, 128, filter_size=(1, 1), nonlinearity=leaky_rectify)) glblf = layers.Pool2DLayer(glblf, pool_size=(5, 5), stride=5, mode='average_inc_pad') glblf = batch_norm( layers.Conv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify)) gllyr = batch_norm(layers.Conv2DLayer(glblf, 5, filter_size=(1, 1), nonlinearity=rectify), name="global_feature") glblf = batch_norm( layers.Deconv2DLayer(gllyr, 256, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 128, filter_size=(9, 9), stride=5, crop=(2, 2), nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 64, filter_size=(4, 4), stride=2, crop=(1, 1), 
nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 32, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = layers.Deconv2DLayer(glblf, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity) layer = layers.ElemwiseSumLayer([layer, glblf]) network = ReshapeLayer(layer, ([0], -1)) layers.set_all_param_values(network, pickle.load(open(filename_model_ae, 'rb'))) old_params = layers.get_all_params(network, trainable=True) # Adding more layers aug_var = T.matrix('aug_var') target_var = T.imatrix('targets') add_a = batch_norm( layers.Conv2DLayer(enlyr, 320, filter_size=(1, 1), nonlinearity=leaky_rectify)) add_b = batch_norm( layers.Conv2DLayer(add_a, 320, filter_size=(1, 1), nonlinearity=leaky_rectify)) add_c = batch_norm( layers.Conv2DLayer(add_b, 320, filter_size=(1, 1), nonlinearity=leaky_rectify)) add_d = batch_norm( layers.Conv2DLayer(add_c, 320, filter_size=(1, 1), nonlinearity=leaky_rectify)) add_0 = layers.Pool2DLayer(add_d, pool_size=(15, 15), stride=15, mode='average_inc_pad') add_1 = batch_norm( layers.DenseLayer(add_0, 100, nonlinearity=leaky_rectify)) add_2 = batch_norm( layers.DenseLayer(gllyr, 320, nonlinearity=leaky_rectify)) add_3 = batch_norm( layers.DenseLayer(add_2, 320, nonlinearity=leaky_rectify)) add_4 = batch_norm( layers.DenseLayer(add_3, 100, nonlinearity=leaky_rectify)) aug_layer = layers.InputLayer(shape=(None, aug_fea_n), input_var=aug_var) cat_layer = lasagne.layers.ConcatLayer([add_1, add_4, aug_layer], axis=1) hidden_layer = layers.DenseLayer(cat_layer, 80, nonlinearity=leaky_rectify) network = layers.DenseLayer(hidden_layer, classn, nonlinearity=sigmoid) all_params = layers.get_all_params(network, trainable=True) new_params = [x for x in all_params if x not in old_params] return network, new_params, input_var, aug_var, target_var
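# A minimal sketch (illustrative, not from the original code) of how the
# classifier returned by build_network_from_ae might be trained: binary
# cross-entropy on the sigmoid outputs, updating only the newly added
# parameters so the pretrained autoencoder weights stay untouched.
import theano
import lasagne

def compile_classifier_train_fn(classn, learning_rate=1e-3):
    network, new_params, input_var, aug_var, target_var = build_network_from_ae(classn)
    output = lasagne.layers.get_output(network)
    loss = lasagne.objectives.binary_crossentropy(output, target_var).mean()
    updates = lasagne.updates.adam(loss, new_params, learning_rate=learning_rate)
    return theano.function([input_var, aug_var, target_var], loss, updates=updates)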
def build_network(self, K, vocab_size, W_init): l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0]) l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1]) l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2]) l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3]) l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6]) l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7]) l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[8]) l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[9]) l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11]) doc_shp = self.inps[1].shape qry_shp = self.inps[3].shape l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size, output_size=self.embed_dim, W=W_init) # B x N x 1 x DE l_doce = L.ReshapeLayer( l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim)) # B x N x DE l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size, output_size=self.embed_dim, W=l_docembed.W) l_qembed = L.ReshapeLayer( l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim)) # B x N x DE l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2) # B x N x 2 if self.train_emb == 0: l_docembed.params[l_docembed.W].remove('trainable') # char embeddings if self.use_chars: l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 2 * self.char_dim) # T x L x D l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP, mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, only_return_final=True) l_bgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=GRAD_CLIP, mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True, only_return_final=True) # T x 2D l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim / 2, nonlinearity=None) # T x DE/2 l_bckembed = L.DenseLayer(l_bgru, self.embed_dim / 2, nonlinearity=None) # T x DE/2 l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1) l_docchar_embed = IndexLayer([l_doctokin, l_embed]) # B x N x DE/2 l_qchar_embed = IndexLayer([l_qtokin, l_embed]) # B x Q x DE/2 l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2) l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2) l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, only_return_final=False) l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True, only_return_final=False) l_q = L.ConcatLayer([l_fwd_q, l_bkd_q]) # B x Q x 2D q = L.get_output(l_q) # B x Q x 2D q = q[T.arange(q.shape[0]), self.inps[12], :] # B x 2D l_qs = [l_q] for i in range(K - 1): l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \ backwards=True) l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2) # B x N x DE l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1], axis=2) # B x Q x DE l_qs.append(l_q_c_1) qd = 
L.get_output(l_q_c_1) # B x Q x DE dd = L.get_output(l_doc_1) # B x N x DE M = T.batched_dot(dd, qd.dimshuffle((0, 2, 1))) # B x N x Q alphas = T.nnet.softmax( T.reshape(M, (M.shape[0] * M.shape[1], M.shape[2]))) alphas_r = T.reshape(alphas, (M.shape[0],M.shape[1],M.shape[2]))* \ self.inps[7][:,np.newaxis,:] # B x N x Q alphas_r = alphas_r / alphas_r.sum(axis=2)[:, :, np.newaxis] # B x N x Q q_rep = T.batched_dot(alphas_r, qd) # B x N x DE l_q_rep_in = L.InputLayer(shape=(None, None, 2 * self.nhidden), input_var=q_rep) l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul) l_doce = L.dropout(l_doc_2_in, p=self.dropout) # B x N x DE if self.use_feat: l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2) # B x N x DE+2 l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \ backwards=True) l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2) d = L.get_output(l_doc) # B x N x 2D p = T.batched_dot(d, q) # B x N pm = T.nnet.softmax(p) * self.inps[10] pm = pm / pm.sum(axis=1)[:, np.newaxis] final = T.batched_dot(pm, self.inps[4]) dv = L.get_output(l_doc, deterministic=True) # B x N x 2D p = T.batched_dot(dv, q) # B x N pm = T.nnet.softmax(p) * self.inps[10] pm = pm / pm.sum(axis=1)[:, np.newaxis] final_v = T.batched_dot(pm, self.inps[4]) return final, final_v, l_doc, l_qs, l_docembed.W
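# Hypothetical helper (not part of the original model) showing how the two
# outputs of build_network are typically consumed: `final` is the stochastic
# probability over answer candidates used for training, `final_v` its
# deterministic counterpart used for evaluation. `answer_var` is an assumed
# one-hot target matrix of the same shape as `final`.
import theano.tensor as T

def candidate_loss_and_accuracy(final, final_v, answer_var):
    loss = -T.mean(T.log(T.sum(final * answer_var, axis=1) + 1e-8))
    acc = T.mean(T.eq(T.argmax(final_v, axis=1), T.argmax(answer_var, axis=1)))
    return loss, acc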
def build_model(hyparams, vmap, log, nclasses=2, batchsize=None, invar=None, maskvar=None, maxlen=MAXLEN): embedding_dim = hyparams.embedding_dim nhidden = hyparams.nhidden bidirectional = hyparams.bidirectional pool = hyparams.pool grad_clip = hyparams.grad_clip init = hyparams.init net = OrderedDict() V = len(vmap) W = lasagne.init.Normal() gate_params = layer.recurrent.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), b=lasagne.init.Constant(0.)) cell_params = layer.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=None, b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.tanh) net['input'] = layer.InputLayer((batchsize, maxlen), input_var=invar) net['mask'] = layer.InputLayer((batchsize, maxlen), input_var=maskvar) ASSUME = {net['input']: (200, 140), net['mask']: (200, 140)} net['emb'] = layer.EmbeddingLayer(net['input'], input_size=V, output_size=embedding_dim, W=W) net['fwd1'] = layer.LSTMLayer(net['emb'], num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'], ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True) if bidirectional: net['bwd1'] = layer.LSTMLayer(net['emb'], num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'], ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True, backwards=True) if pool == 'mean': def tmean(a, b): agg = theano.tensor.add(a, b) agg /= 2. return agg net['pool'] = layer.ElemwiseMergeLayer([net['fwd1'], net['bwd1']], tmean) elif pool == 'sum': net['pool'] = layer.ElemwiseSumLayer([net['fwd1'], net['bwd1']]) else: net['pool'] = layer.ConcatLayer([net['fwd1'], net['bwd1']]) else: net['pool'] = layer.ConcatLayer([net['fwd1']]) net['dropout1'] = layer.DropoutLayer(net['pool'], p=0.5) if init == 'identity': gate_params2 = layer.recurrent.Gate(W_in=np.eye(nhidden, dtype=np.float32), W_hid=np.eye(nhidden, dtype=np.float32), b=lasagne.init.Constant(0.)) cell_params2 = layer.recurrent.Gate( W_in=np.eye(nhidden, dtype=np.float32), W_hid=np.eye(nhidden, dtype=np.float32), W_cell=None, b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.rectify) net['fwd2'] = layer.LSTMLayer(net['dropout1'], num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'], ingate=gate_params2, forgetgate=gate_params2, cell=cell_params2, outgate=gate_params2, learn_init=True, only_return_final=True) else: net['fwd2'] = layer.LSTMLayer(net['dropout1'], num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'], ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True, only_return_final=True) net['dropout2'] = layer.DropoutLayer(net['fwd2'], p=0.6) net['softmax'] = layer.DenseLayer( net['dropout2'], num_units=nclasses, nonlinearity=lasagne.nonlinearities.softmax) logstr = '========== MODEL ========== \n' logstr += 'vocab size: %d\n' % V logstr += 'embedding dim: %d\n' % embedding_dim logstr += 'nhidden: %d\n' % nhidden logstr += 'pooling: %s\n' % pool for lname, lyr in net.items(): logstr += '%s %s\n' % (lname, str(get_output_shape(lyr, ASSUME))) logstr += '=========================== \n' print logstr log.write(logstr) log.flush() return net
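# A minimal usage sketch for build_model (illustrative only): the
# hyper-parameter container is mocked with a namedtuple, and MAXLEN plus the
# imports of the surrounding module are assumed to exist.
from collections import namedtuple
import theano.tensor as T

HyperParams = namedtuple('HyperParams',
                         'embedding_dim nhidden bidirectional pool grad_clip init')

def example_build(vmap, log):
    hyparams = HyperParams(embedding_dim=50, nhidden=256, bidirectional=True,
                           pool='mean', grad_clip=100, init='default')
    X = T.imatrix('X')   # (batch, maxlen) token ids
    M = T.matrix('M')    # (batch, maxlen) mask
    net = build_model(hyparams, vmap, log, nclasses=2, invar=X, maskvar=M)
    return net['softmax']  # top layer of the returned OrderedDict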
def build_rnn_net(input_var=None, input_width=None, input_dim=None, nin_units=80, h_num_units=[64, 64], h_grad_clip=1.0, output_width=1): """ A stacked bidirectional RNN network for regression, alternating with dense layers and merging of the two directions, followed by a feature mean pooling in the time direction, with a linear dim-reduction layer at the start add dropout for generalizations Args: input_var (theano 3-tensor): minibatch of input sequence vectors input_width (int): length of input sequences nin_units (list): number of NIN features h_num_units (int list): no. of units in hidden layer in each stack from bottom to top h_grad_clip (float): gradient clipping maximum value output_width (int): size of output layer (e.g. =1 for 1D regression) Returns: output layer (Lasagne layer object) """ # Non-linearity hyperparameter leaky_ratio = 0.3 nonlin = lasagne.nonlinearities.LeakyRectify(leakiness=leaky_ratio) # Input layer l_in = LL.InputLayer(shape=(None, input_width, input_dim), input_var=input_var) batchsize = l_in.input_var.shape[0] # NIN-layer #l_in_1 = LL.NINLayer(l_in, num_units=nin_units, #nonlinearity=lasagne.nonlinearities.linear) l_in_1 = l_in #l_in_d = LL.DropoutLayer(l_in, p = 0.8) Do not use drop out now, for the first rnn layer is 256 # currently, we do not drop features # RNN layers # dropout in the first two (total three) or three (total five) layers counter = -1 drop_ends = 2 for h in h_num_units: counter += 1 # Forward layers l_forward_0 = LL.RecurrentLayer( l_in_1, nonlinearity=nonlin, num_units=h, W_in_to_hid=lasagne.init.Normal(0.01, 0), #W_in_to_hid=lasagne.init.He(initializer, math.sqrt(2/(1+0.15**2))), W_hid_to_hid=lasagne.init.Orthogonal( math.sqrt(2 / (1 + leaky_ratio**2))), backwards=False, learn_init=True, grad_clipping=h_grad_clip, #gradient_steps = 20, unroll_scan=True, precompute_input=True) l_forward_0a = LL.ReshapeLayer(l_forward_0, (-1, h)) if (counter < drop_ends and counter % 2 != 0): l_forward_0a = LL.DropoutLayer(l_forward_0a, p=0.2) else: l_forward_0a = l_forward_0a l_forward_0b = LL.DenseLayer(l_forward_0a, num_units=h, nonlinearity=nonlin) l_forward_0c = LL.ReshapeLayer(l_forward_0b, (batchsize, input_width, h)) l_forward_out = l_forward_0c # Backward layers l_backward_0 = LL.RecurrentLayer( l_in_1, nonlinearity=nonlin, num_units=h, W_in_to_hid=lasagne.init.Normal(0.01, 0), #W_in_to_hid=lasagne.init.He(initializer, math.sqrt(2/(1+0.15**2))), W_hid_to_hid=lasagne.init.Orthogonal( math.sqrt(2 / (1 + leaky_ratio**2))), backwards=True, learn_init=True, grad_clipping=h_grad_clip, #gradient_steps = 20, unroll_scan=True, precompute_input=True) l_backward_0a = LL.ReshapeLayer(l_backward_0, (-1, h)) if (counter < drop_ends and counter % 2 == 0): l_backward_0a = LL.DropoutLayer(l_backward_0a, p=0.2) else: l_backward_0a = l_backward_0a l_backward_0b = LL.DenseLayer(l_backward_0a, num_units=h, nonlinearity=nonlin) l_backward_0c = LL.ReshapeLayer(l_backward_0b, (batchsize, input_width, h)) l_backward_out = l_backward_0c l_in_1 = LL.ElemwiseSumLayer([l_forward_out, l_backward_out]) # Output layers network_0a = LL.DenseLayer(l_in_1, num_units=1, num_leading_axes=2, nonlinearity=nonlin) output_net = LL.FlattenLayer(network_0a, outdim=2) return output_net
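# A minimal usage sketch for build_rnn_net (illustrative): 3D float input of
# shape (batch, time, features) and a per-timestep regression target, trained
# with a squared-error objective.
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as LL

def compile_rnn_regressor(input_width=128, input_dim=16, learning_rate=1e-3):
    x = T.tensor3('x')
    y = T.matrix('y')  # (batch, time) regression targets
    net = build_rnn_net(input_var=x, input_width=input_width, input_dim=input_dim)
    pred = LL.get_output(net)  # (batch, time) after the final FlattenLayer
    loss = lasagne.objectives.squared_error(pred, y).mean()
    params = LL.get_all_params(net, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate)
    return theano.function([x, y], loss, updates=updates)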
def build_RNN(self, n_hidden_list=(100, ), bidirectional=False, addDenseLayers=False, seed=int(time.time()), debug=False, logger=logger_RNNtools): # some inspiration from http://colinraffel.com/talks/hammer2015recurrent.pdf # if debug: # logger_RNNtools.debug('\nInputs:'); # logger_RNNtools.debug(' X.shape: %s', self.X[0].shape) # logger_RNNtools.debug(' X[0].shape: %s %s %s \n%s', self.X[0][0].shape, type(self.X[0][0]), # type(self.X[0][0][0]), self.X[0][0][:5]) # # logger_RNNtools.debug('Targets: '); # logger_RNNtools.debug(' Y.shape: %s', self.Y.shape) # logger_RNNtools.debug(' Y[0].shape: %s %s %s \n%s', self.Y[0].shape, type(self.Y[0]), type(self.Y[0][0]), # self.Y[0][:5]) # logger_RNNtools.debug('Layers: ') # fix these at initialization because it allows for compiler opimizations num_output_units = self.num_output_units num_features = self.num_features batch_size = self.batch_size audio_inputs = self.audio_inputs_var audio_masks = self.audio_masks_var #set MATRIX, not iMatrix!! Otherwise all mask calculations are done by CPU, and everything will be ~2x slowed down!! Also in general_tools.generate_masks() valid_indices = self.audio_valid_indices_var net = {} # net['l1_in_valid'] = L.InputLayer(shape=(batch_size, None), input_var=valid_indices) # shape = (batch_size, batch_max_seq_length, num_features) net['l1_in'] = L.InputLayer(shape=(batch_size, None, num_features), input_var=audio_inputs) # We could do this and set all input_vars to None, but that is slower -> fix batch_size and num_features at initialization # batch_size, n_time_steps, n_features = net['l1_in'].input_var.shape # This input will be used to provide the network with masks. # Masks are matrices of shape (batch_size, n_time_steps); net['l1_mask'] = L.InputLayer(shape=(batch_size, None), input_var=audio_masks) if debug: get_l_in = L.get_output(net['l1_in']) l_in_val = get_l_in.eval({net['l1_in'].input_var: self.X}) # logger_RNNtools.debug(l_in_val) logger_RNNtools.debug(' l_in size: %s', l_in_val.shape) get_l_mask = L.get_output(net['l1_mask']) l_mask_val = get_l_mask.eval( {net['l1_mask'].input_var: self.masks}) # logger_RNNtools.debug(l_in_val) logger_RNNtools.debug(' l_mask size: %s', l_mask_val.shape) n_batch, n_time_steps, n_features = net['l1_in'].input_var.shape logger_RNNtools.debug( " n_batch: %s | n_time_steps: %s | n_features: %s", n_batch, n_time_steps, n_features) ## LSTM parameters # All gates have initializers for the input-to-gate and hidden state-to-gate # weight matrices, the cell-to-gate weight vector, the bias vector, and the nonlinearity. # The convention is that gates use the standard sigmoid nonlinearity, # which is the default for the Gate class. gate_parameters = L.recurrent.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), b=lasagne.init.Constant(0.)) cell_parameters = L.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), # Setting W_cell to None denotes that no cell connection will be used. W_cell=None, b=lasagne.init.Constant(0.), # By convention, the cell nonlinearity is tanh in an LSTM. 
nonlinearity=lasagne.nonlinearities.tanh) # generate layers of stacked LSTMs, possibly bidirectional net['l2_lstm'] = [] for i in range(len(n_hidden_list)): n_hidden = n_hidden_list[i] if i == 0: input = net['l1_in'] else: input = net['l2_lstm'][i - 1] nextForwardLSTMLayer = L.recurrent.LSTMLayer( input, n_hidden, # We need to specify a separate input for masks mask_input=net['l1_mask'], # Here, we supply the gate parameters for each gate ingate=gate_parameters, forgetgate=gate_parameters, cell=cell_parameters, outgate=gate_parameters, # We'll learn the initialization and use gradient clipping learn_init=True, grad_clipping=100.) net['l2_lstm'].append(nextForwardLSTMLayer) if bidirectional: input = net['l2_lstm'][-1] # Use backward LSTM # The "backwards" layer is the same as the first, # except that the backwards argument is set to True. nextBackwardLSTMLayer = L.recurrent.LSTMLayer( input, n_hidden, ingate=gate_parameters, mask_input=net['l1_mask'], forgetgate=gate_parameters, cell=cell_parameters, outgate=gate_parameters, learn_init=True, grad_clipping=100., backwards=True) net['l2_lstm'].append(nextBackwardLSTMLayer) # if debug: # # Backwards LSTM # get_l_lstm_back = theano.function([net['l1_in'].input_var, net['l1_mask'].input_var], # L.get_output(net['l2_lstm'][-1])) # l_lstmBack_val = get_l_lstm_back(self.X, self.masks) # logger_RNNtools.debug(' l_lstm_back size: %s', l_lstmBack_val.shape) # We'll combine the forward and backward layer output by summing. # Merge layers take in lists of layers to merge as input. # The output of l_sum will be of shape (n_batch, max_n_time_steps, n_features) net['l2_lstm'].append( L.ElemwiseSumLayer( [net['l2_lstm'][-2], net['l2_lstm'][-1]])) # we need to convert (batch_size, seq_length, num_features) to (batch_size * seq_length, num_features) because Dense networks can't deal with 2 unknown sizes net['l3_reshape'] = L.ReshapeLayer(net['l2_lstm'][-1], (-1, n_hidden_list[-1])) # if debug: # get_l_reshape = theano.function([net['l1_in'].input_var, net['l1_mask'].input_var], # L.get_output(net['l3_reshape'])) # l_reshape_val = get_l_reshape(self.X, self.masks) # logger.debug(' l_reshape size: %s', l_reshape_val.shape) # # if debug: # # Forwards LSTM # get_l_lstm = theano.function([net['l1_in'].input_var, net['l1_mask'].input_var], # L.get_output(net['l2_lstm'][-1])) # l_lstm_val = get_l_lstm(self.X, self.masks) # logger_RNNtools.debug(' l2_lstm size: %s', l_lstm_val.shape); if addDenseLayers: net['l4_dense'] = L.DenseLayer( net['l3_reshape'], nonlinearity=lasagne.nonlinearities.rectify, num_units=256) dropoutLayer = L.DropoutLayer(net['l4_dense'], p=0.3) net['l5_dense'] = L.DenseLayer( dropoutLayer, nonlinearity=lasagne.nonlinearities.rectify, num_units=64) # Now we can apply feed-forward layers as usual for classification net['l6_dense'] = L.DenseLayer( net['l5_dense'], num_units=num_output_units, nonlinearity=lasagne.nonlinearities.softmax) else: # Now we can apply feed-forward layers as usual for classification net['l6_dense'] = L.DenseLayer( net['l3_reshape'], num_units=num_output_units, nonlinearity=lasagne.nonlinearities.softmax) # # Now, the shape will be (n_batch * n_timesteps, num_output_units). 
We can then reshape to # # n_batch to get num_output_units values for each timestep from each sequence net['l7_out_flattened'] = L.ReshapeLayer(net['l6_dense'], (-1, num_output_units)) net['l7_out'] = L.ReshapeLayer(net['l6_dense'], (batch_size, -1, num_output_units)) net['l7_out_valid_basic'] = L.SliceLayer(net['l7_out'], indices=valid_indices, axis=1) net['l7_out_valid'] = L.ReshapeLayer( net['l7_out_valid_basic'], (batch_size, -1, num_output_units)) net['l7_out_valid_flattened'] = L.ReshapeLayer( net['l7_out_valid_basic'], (-1, num_output_units)) if debug: get_l_out = theano.function( [net['l1_in'].input_var, net['l1_mask'].input_var], L.get_output(net['l7_out'])) l_out = get_l_out(self.X, self.masks) # this only works for batch_size == 1 get_l_out_valid = theano.function( [audio_inputs, audio_masks, valid_indices], L.get_output(net['l7_out_valid'])) try: l_out_valid = get_l_out_valid(self.X, self.masks, self.valid_frames) logger_RNNtools.debug('\n\n\n l_out: %s | l_out_valid: %s', l_out.shape, l_out_valid.shape) except: logger_RNNtools.warning( "batchsize not 1, get_valid not working") if debug: self.print_network_structure(net) self.network_lout = net['l7_out_flattened'] self.network_lout_batch = net['l7_out'] self.network_lout_valid = net['l7_out_valid'] self.network_lout_valid_flattened = net['l7_out_valid_flattened'] self.network = net
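# Hypothetical follow-up sketch (not in the original class): a frame-level
# cross-entropy built from the flattened softmax output produced by build_RNN.
# `network_lout` is the layer stored as self.network_lout above, and
# `target_var` holds one integer class label per frame.
def frame_level_loss(network_lout, target_var):
    import lasagne
    import lasagne.layers as L
    predictions = L.get_output(network_lout)  # (batch * time, num_output_units)
    loss = lasagne.objectives.categorical_crossentropy(predictions,
                                                       target_var.flatten())
    return loss.mean()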
def resblock(net_in, filters, kernel_size, stride=1, num_groups=1, preactivated=True): # Preactivation net_pre = batch_norm(net_in) net_pre = l.NonlinearityLayer(net_pre, nonlinearity=nonlinearity(cfg.NONLINEARITY)) # Preactivated shortcut? if preactivated: net_sc = net_pre else: net_sc = net_in # Stride size if cfg.MAX_POOLING: s = 1 else: s = stride # First Convolution (always has preactivated input) net = batch_norm( l.Conv2DLayer(net_pre, num_filters=filters, filter_size=kernel_size, pad='same', stride=s, num_groups=num_groups, W=initialization(cfg.NONLINEARITY), nonlinearity=nonlinearity(cfg.NONLINEARITY))) # Optional pooling layer if cfg.MAX_POOLING and stride > 1: net = l.MaxPool2DLayer(net, pool_size=stride) # Dropout Layer (we support different types of dropout) if cfg.DROPOUT_TYPE == 'channels' and cfg.DROPOUT > 0.0: net = l.dropout_channels(net, p=cfg.DROPOUT) elif cfg.DROPOUT_TYPE == 'location' and cfg.DROPOUT > 0.0: net = l.dropout_location(net, p=cfg.DROPOUT) elif cfg.DROPOUT > 0.0: net = l.DropoutLayer(net, p=cfg.DROPOUT) # Second Convolution net = l.Conv2DLayer(net, num_filters=filters, filter_size=kernel_size, pad='same', stride=1, num_groups=num_groups, W=initialization(cfg.NONLINEARITY), nonlinearity=None) # Shortcut Layer if not l.get_output_shape(net) == l.get_output_shape(net_sc): shortcut = l.Conv2DLayer(net_sc, num_filters=filters, filter_size=1, pad='same', stride=s, W=initialization(cfg.NONLINEARITY), nonlinearity=None, b=None) # Optional pooling layer if cfg.MAX_POOLING and stride > 1: shortcut = l.MaxPool2DLayer(shortcut, pool_size=stride) else: shortcut = net_sc # Merge Layer out = l.ElemwiseSumLayer([net, shortcut]) return out
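# Hypothetical usage sketch for the resblock above: two stacked blocks, the
# second one downsampling. `cfg`, `initialization` and `nonlinearity` are
# assumed to come from the surrounding module, exactly as in resblock itself.
def small_residual_stack(net_in, filters=64, kernel_size=3):
    net = resblock(net_in, filters, kernel_size, stride=1, preactivated=True)
    net = resblock(net, filters * 2, kernel_size, stride=2, preactivated=False)
    return net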
def make_model(): image = ll.InputLayer((BS, CH, IH, IW), name='step1.image') h_read_init = ll.InputLayer( (HS, ), lasagne.utils.create_param(li.Uniform(), (HS, ), name='step1.tensor.h_read_init'), name='step1.h_read_init') h_read_init.add_param(h_read_init.input_var, (HS, )) h_write_init = ll.InputLayer( (HS, ), lasagne.utils.create_param(li.Uniform(), (HS, ), name='step1.tensor.h_write_init'), name='step1.h_write_init') h_write_init.add_param(h_write_init.input_var, (HS, )) h_read = ll.ExpressionLayer(h_read_init, lambda t: T.tile(T.reshape(t, (1, HS)), (BS, 1)), (BS, HS), name='step1.h_read') h_write = ll.ExpressionLayer(h_write_init, lambda t: T.tile(T.reshape(t, (1, HS)), (BS, 1)), (BS, HS), name='step1.h_write') canvas = ll.InputLayer( (BS, CH, IH, IW), lasagne.utils.create_param(li.Constant(0.0), (BS, CH, IH, IW), name='step1.tensor.canvas'), name='step1.canvas') image_prev = ll.NonlinearityLayer(canvas, ln.sigmoid, name='step1.image_prev') image_error = ll.ElemwiseSumLayer([image, image_prev], coeffs=[1, -1], name='step1.image_error') image_stack = ll.ConcatLayer([image, image_error], name='step1.image_stack') read_params = ll.DenseLayer(h_write, 6, nonlinearity=None, name='step1.read_params') read_window = advanced_layers.AttentionLayer([read_params, image_stack], (WH, WW), name='step1.read_window') read_flat = ll.FlattenLayer(read_window, name='step1.read_flat') read_code = ll.ConcatLayer([read_flat, h_write], name='step1.read_code') read_code_sequence = ll.ReshapeLayer(read_code, (BS, 1, read_code.output_shape[-1]), name='step1.read_code_sequence') read_rnn = ll.GRULayer( read_code_sequence, HS, only_return_final=True, hid_init=h_read, name='step1.read_rnn', ) sample_mean = ll.DenseLayer(read_rnn, ENC_NDIM, nonlinearity=None, name='step1.sample_mean') sample_logvar2 = ll.DenseLayer(read_rnn, ENC_NDIM, nonlinearity=None, name='step1.sample_logvar2') sample = advanced_layers.SamplingLayer([sample_mean, sample_logvar2], ENC_VAR, name='step1.sample') write_code = ll.DenseLayer(sample, HS, name='step1.write_code') write_code_sequence = ll.ReshapeLayer(write_code, (BS, 1, write_code.output_shape[-1]), name='step1.write_code_sequence') write_rnn = ll.GRULayer( write_code_sequence, HS, only_return_final=True, hid_init=h_write, name='step1.write_rnn', ) write_window_flat = ll.DenseLayer(write_rnn, CH * WH * WW, name='step1.write_window_flat') write_window = ll.ReshapeLayer(write_window_flat, (BS, CH, WH, WW), name='step1.write_window') write_params = ll.DenseLayer(h_write, 6, nonlinearity=None, name='step1.write_params') write_image = advanced_layers.AttentionLayer([write_params, write_window], (IH, IW), name='step1.write_image') canvas_next = ll.ElemwiseSumLayer([canvas, write_image], name='step1.canvas_next') def rename(name): if name is None: return None step, real_name = name.split('.', 1) step = int(step[4:]) return 'step%d.%s' % (step + 1, real_name) for step in xrange(1, TIME_ROUNDS): sample_random_variable_next = sample.random_stream.normal( sample.input_shapes[0], std=sample.variation_coeff, ) sample_random_variable_next.name = 'step%d.sample.random_variable' % \ (step + 1) canvas, canvas_next = (canvas_next, utils.modified_copy( canvas_next, modify={ h_read: read_rnn, h_write: write_rnn, canvas: canvas_next, sample.random_stream: sample.random_stream, sample.random_variable: sample_random_variable_next, }, rename=rename, )) h_read = read_rnn h_write = write_rnn read_rnn = utils.layer_by_name(canvas_next, 'step%d.read_rnn' % (step + 1)) write_rnn = 
utils.layer_by_name(canvas_next, 'step%d.write_rnn' % (step + 1)) sample = utils.layer_by_name(canvas_next, 'step%d.sample' % (step + 1)) output = ll.NonlinearityLayer(canvas_next, ln.sigmoid, name='output') return output
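# Hypothetical sketch (not part of make_model) of a reconstruction term for the
# returned output layer: binary cross-entropy between the final sigmoid canvas
# and the original input image, located in the layer graph via its name
# 'step1.image'. The latent KL term of such models is deliberately omitted here.
def reconstruction_loss(output_layer):
    import theano.tensor as T
    import lasagne
    import lasagne.layers as ll
    image_layer = [lyr for lyr in ll.get_all_layers(output_layer)
                   if lyr.name == 'step1.image'][0]
    recon = ll.get_output(output_layer)
    target = image_layer.input_var
    return T.mean(lasagne.objectives.binary_crossentropy(recon, target))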
def resblock(net_in, filters, kernel_size, stride=1, preactivated=True, block_id=1, name=''): # Show input shape #log.p(("\t\t" + name + " IN SHAPE:", l.get_output_shape(net_in)), new_line=False) # Pre-activation if block_id > 1: net_pre = l.NonlinearityLayer(net_in, nonlinearity=nl.rectify) else: net_pre = net_in # Pre-activated shortcut? if preactivated: net_in = net_pre # Bottleneck Convolution if stride > 1: net_pre = l.batch_norm(l.Conv2DLayer(net_pre, num_filters=l.get_output_shape(net_pre)[1], filter_size=1, pad='same', stride=1, nonlinearity=nl.rectify)) # First Convolution net = l.batch_norm(l.Conv2DLayer(net_pre, num_filters=l.get_output_shape(net_pre)[1], filter_size=kernel_size, pad='same', stride=1, nonlinearity=nl.rectify)) # Pooling layer if stride > 1: net = l.MaxPool2DLayer(net, pool_size=(stride, stride)) # Dropout Layer net = l.DropoutLayer(net) # Second Convolution net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters, filter_size=kernel_size, pad='same', stride=1, nonlinearity=None)) # Shortcut Layer if not l.get_output_shape(net) == l.get_output_shape(net_in): # Average pooling shortcut = l.Pool2DLayer(net_in, pool_size=(stride, stride), stride=stride, mode='average_exc_pad') # Shortcut convolution shortcut = l.batch_norm(l.Conv2DLayer(shortcut, num_filters=filters, filter_size=1, pad='same', stride=1, nonlinearity=None)) else: # Shortcut = input shortcut = net_in # Merge Layer out = l.ElemwiseSumLayer([net, shortcut]) # Show output shape #log.p(("OUT SHAPE:", l.get_output_shape(out), "LAYER:", len(l.get_all_layers(out)) - 1)) return out
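# Hypothetical usage sketch for this resblock variant: a small residual group
# in which the first block downsamples by `stride` and later blocks keep the
# resolution. Parameter choices are illustrative only.
def residual_group(net_in, filters, kernel_size=3, num_blocks=2, stride=2, name='group'):
    net = resblock(net_in, filters, kernel_size, stride=stride,
                   preactivated=True, block_id=1, name=name + '_block1')
    for i in range(2, num_blocks + 1):
        net = resblock(net, filters, kernel_size, stride=1,
                       preactivated=False, block_id=i,
                       name='%s_block%d' % (name, i))
    return net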
def get_actor(self, avg=False): suf = '_avg' if avg else '' iw = L.InputLayer(shape=(None, self.args.sw)) # (100, 24) ew = L.EmbeddingLayer( iw, self.args.vw, self.args.nw, name='ew' + suf, W=HeNormal() if not avg else Constant()) # (100, 24, 256) ew.params[ew.W].remove('regularizable') if 'w' in self.args.freeze: ew.params[ew.W].remove('trainable') # for access from outside if not avg: self.Ew = ew.W # char embedding with CNN/LSTM ic = L.InputLayer(shape=(None, self.args.sw, self.args.max_len)) # (100, 24, 32) ec = self.get_char2word(ic, avg) # (100, 24, 256) it = L.InputLayer(shape=(None, self.args.st)) et = L.EmbeddingLayer(it, self.args.vt, self.args.nt, name='et' + suf, W=HeNormal() if not avg else Constant()) et.params[et.W].remove('regularizable') il = L.InputLayer(shape=(None, self.args.sl)) el = L.EmbeddingLayer(il, self.args.vl, self.args.nl, name='el' + suf, W=HeNormal() if not avg else Constant()) el.params[el.W].remove('regularizable') to_concat = [] if self.args.type == 'word': to_concat.append(ew) elif self.args.type == 'char': to_concat.append(ec) elif self.args.type == 'both': to_concat += [ew, ec] elif self.args.type == 'mix': to_concat.append(L.ElemwiseSumLayer([ew, ec])) if not self.args.untagged: to_concat.append(et) if not self.args.unlabeled: to_concat.append(el) x = L.concat(to_concat, axis=2) # (100, 24, 64+16+16) # additional: # get the more compact representation of each token by its word, tag and label, # before putting into the hidden layer if self.args.squeeze: x = L.DenseLayer( x, num_units=self.args.squeeze, name='h0' + suf, num_leading_axes=2, W=HeNormal('relu') if not avg else Constant()) # (100, 24, 64) h1 = L.DenseLayer( x, num_units=self.args.nh1, name='h1' + suf, W=HeNormal('relu') if not avg else Constant()) # (100, 512) h1 = L.dropout(h1, self.args.p1) h2 = L.DenseLayer( h1, num_units=self.args.nh2, name='h2' + suf, W=HeNormal('relu') if not avg else Constant()) # (100, 256) h2 = L.dropout(h2, self.args.p2) h3 = L.DenseLayer(h2, num_units=self.args.nh3, name='h3' + suf, W=HeNormal() if not avg else Constant(), nonlinearity=softmax) # (100, 125) num of actions return iw, ic, it, il, h3
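# Hypothetical sketch of compiling the actor returned by get_actor: the four
# input layers map onto four symbolic inputs and h3 yields the action
# probabilities. `model` stands for the object that defines get_actor; the
# unused-input flag covers configurations (e.g. type='word') in which not every
# input reaches the output.
def compile_actor_fn(model, deterministic=True):
    import theano
    import lasagne.layers as L
    iw, ic, it, il, h3 = model.get_actor(avg=False)
    probs = L.get_output(h3, deterministic=deterministic)
    return theano.function(
        [iw.input_var, ic.input_var, it.input_var, il.input_var],
        probs, on_unused_input='ignore')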
def build_segmenter_jet_2(): # downsample down to a small region, then upsample all the way back up, using jet architecture # recreate basic FCN-8s structure (though more aptly 1s here since we upsample back to the original input size) # this jet will have another conv layer in the final upsample inp = ll.InputLayer(shape=(None, 1, None, None), name='input') conv1 = ll.Conv2DLayer(inp, num_filters=32, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv1_1') bn1 = ll.BatchNormLayer(conv1, name='bn1') conv2 = ll.Conv2DLayer(bn1, num_filters=64, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv1_2') bn2 = ll.BatchNormLayer(conv2, name='bn2') mp1 = ll.MaxPool2DLayer(bn2, 2, stride=2, name='mp1') # 2x downsample conv3 = ll.Conv2DLayer(mp1, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv2_1') bn3 = ll.BatchNormLayer(conv3, name='bn3') conv4 = ll.Conv2DLayer(bn3, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv2_2') bn4 = ll.BatchNormLayer(conv4, name='bn4') mp2 = ll.MaxPool2DLayer(bn4, 2, stride=2, name='mp2') # 4x downsample conv5 = ll.Conv2DLayer(mp2, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv3_1') bn5 = ll.BatchNormLayer(conv5, name='bn5') conv6 = ll.Conv2DLayer(bn5, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv3_2') bn6 = ll.BatchNormLayer(conv6, name='bn6') mp3 = ll.MaxPool2DLayer(bn6, 2, stride=2, name='mp3') # 8x downsample conv7 = ll.Conv2DLayer(mp3, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv4_1') bn7 = ll.BatchNormLayer(conv7, name='bn7') conv8 = ll.Conv2DLayer(bn7, num_filters=128, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=rectify, name='conv4_2') bn8 = ll.BatchNormLayer(conv8, name='bn8') # f 68 s 8 # now start the upsample ## FIRST UPSAMPLE PREDICTION (akin to FCN-32s) conv_f8 = ll.Conv2DLayer(bn8, num_filters=2, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=linear, name='conv_8xpred') softmax_8 = Softmax4D(conv_f8, name='4dsoftmax_8x') up8 = ll.Upscale2DLayer( softmax_8, 8, name='upsample_8x') # take loss here, 8x upsample from 8x downsample ## COMBINE BY UPSAMPLING SOFTMAX 8 AND PRED ON CONV 6 softmax_4up = ll.Upscale2DLayer(softmax_8, 2, name='upsample_4x_pre') # 4x downsample conv_f6 = ll.Conv2DLayer(bn6, num_filters=2, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=linear, name='conv_4xpred') softmax_4 = Softmax4D(conv_f6, name='4dsoftmax_4x') # 4x downsample softmax_4_merge = ll.ElemwiseSumLayer([softmax_4, softmax_4up], coeffs=0.5, name='softmax_4_merge') up4 = ll.Upscale2DLayer( softmax_4_merge, 4, name='upsample_4x') # take loss here, 4x upsample from 4x downsample ## COMBINE BY UPSAMPLING SOFTMAX_4_MERGE AND CONV 4 softmax_2up = ll.Upscale2DLayer(softmax_4_merge, 2, name='upsample_2x_pre') # 2x downsample conv_f4 = ll.Conv2DLayer(bn4, num_filters=2, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=linear, name='conv_2xpred') softmax_2 = Softmax4D(conv_f4, name='4dsoftmax_2x') softmax_2_merge = ll.ElemwiseSumLayer([softmax_2, softmax_2up], coeffs=0.5, name='softmax_2_merge') up2 = ll.Upscale2DLayer( softmax_2_merge, 2, name='upsample_2x' ) # final loss here, 2x upsample from a 2x downsample ## COMBINE BY UPSAMPLING SOFTMAX_2_MERGE AND CONV 2 softmax_1up = ll.Upscale2DLayer( softmax_2_merge, 2, 
name='upsample_1x_pre') # 1x downsample (i.e. no downsample) conv_f2 = ll.Conv2DLayer(bn2, num_filters=2, filter_size=(3, 3), pad='same', W=Orthogonal(), nonlinearity=linear, name='conv_1xpred') softmax_1 = Softmax4D(conv_f2, name='4dsoftmax_1x') softmax_1_merge = ll.ElemwiseSumLayer([softmax_1, softmax_1up], coeffs=0.5, name='softmax_1_merge') # an up1 layer here would be a factor-1 (identity) upscale, so softmax_1_merge is returned directly return [up8, up4, up2, softmax_1_merge]
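# Hypothetical sketch of the multi-scale objective hinted at by the "take loss
# here" comments in build_segmenter_jet_2: a weighted sum of pixel-wise
# cross-entropy terms, one per returned output, all at full input resolution.
# Softmax4D is assumed to produce per-pixel class probabilities along axis 1.
def multiscale_loss(outputs, target_var, weights=(0.25, 0.25, 0.25, 0.25)):
    import theano.tensor as T
    import lasagne.layers as ll
    loss = 0
    for w, out_layer in zip(weights, outputs):
        probs = ll.get_output(out_layer)                   # (B, 2, H, W)
        probs_flat = probs.dimshuffle(0, 2, 3, 1).reshape((-1, 2))
        loss = loss + w * T.nnet.categorical_crossentropy(
            probs_flat, target_var.flatten()).mean()
    return loss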
def run_experiment(args): import os # set environment variables for theano os.environ['THEANO_FLAGS'] = "lib.cnmem=" + str( args.mem) + ",device=gpu" + str(args.gpu) import inspect import shutil import time import logging import six import collections import numpy as np import scipy import theano import theano.tensor as T import lasagne import lasagne.layers as ll import lasagne.nonlinearities as ln import parmesan import layers import utils import cfdataset #---------------------------------------------------------------- # Arguments and Settings floatX = theano.config.floatX logger = logging.getLogger() np.random.seed(args.seed) # copy file for reproducibility dirname = utils.setup_logging(args.message, args.loglv) script_src = os.path.abspath(inspect.getfile(inspect.currentframe())) script_dst = os.path.join(dirname, os.path.split(script_src)[1]) shutil.copyfile(script_src, script_dst) # print arguments args_dict = collections.OrderedDict(sorted(vars(args).items())) for k, v in six.iteritems(args_dict): logger.info(" %20s: %s" % (k, v)) # get arguments D_u, D_v = args.D_u, args.D_v J_u, J_v = args.J_u, args.J_v lr = args.lr alpha = args.alpha weight_decay = args.weight_decay n_step = args.n_step lookahead = args.lookahead max_epoch = args.max_epoch batch_size_u, batch_size_v = args.batch_size_u, args.batch_size_v share_params = not args.no_share_params nonlin_enc = layers.get_nonlin(args.nonlin_enc) nonlin_dec = layers.get_nonlin(args.nonlin_dec) #---------------------------------------------------------------- # Dataset dataset = cfdataset.CFdata(name=args.dataset, split=args.split) N_stars = dataset.N_stars N_u, N_v = dataset.N_users, dataset.N_items R_train = dataset.R_train # int (3 * N_train_rating) R_test = dataset.R_test # int (3 * N_test_rating) n_valid_split = np.int(dataset.N_train_rating / 20) train_valid_perm = np.random.permutation(dataset.N_train_rating) R_valid = R_train[:, train_valid_perm[:n_valid_split]] R_train = R_train[:, train_valid_perm[n_valid_split:]] R_matrix = dict() R_matrix['train'] = scipy.sparse.coo_matrix( (R_train[2], (R_train[0], R_train[1])), shape=(N_u, N_v)).toarray().astype('int32') R_matrix['valid'] = scipy.sparse.coo_matrix( (R_valid[2], (R_valid[0], R_valid[1])), shape=(N_u, N_v)).toarray().astype('int32') R_matrix['test'] = scipy.sparse.coo_matrix( (R_test[2], (R_test[0], R_test[1])), shape=(N_u, N_v)).toarray().astype('int32') N_rating = dict() N_rating['train'] = dataset.N_train_rating - n_valid_split N_rating['valid'] = n_valid_split N_rating['test'] = dataset.N_test_rating logger.info("%d users, %d items" % (N_u, N_v)) logger.info("%d training ratings, %d validation ratings, %d test ratings" % (N_rating['train'], N_rating['valid'], N_rating['test'])) logger.info("%d-star scale" % N_stars) #---------------------------------------------------------------- # numpy variables # encoded vectors np_enc_u_h = np.zeros((N_u, D_u), dtype=floatX) np_enc_v_h = np.zeros((N_v, D_v), dtype=floatX) #---------------------------------------------------------------- # Symbolic variables sym_lr = T.fscalar('lr') sym_Ru = T.imatrix('Ru') sym_Rv = T.imatrix('Rv') sym_dr_Ru = T.fscalar('dr_Ru') sym_dr_Rv = T.fscalar('dr_Rv') sym_uid_origin = T.ivector('uid_origin') sym_uid_minibatch = T.ivector('uid_minibatch') sym_vid_origin = T.ivector('vid_origin') sym_vid_minibatch = T.ivector('vid_minibatch') sym_R_minibatch = T.ivector('R_minibatch') #---------------------------------------------------------------- # Model setup (training model) logger.info("Setting up model 
...") # Input layers l_in_Ru = ll.InputLayer((None, N_v), input_var=sym_Ru, name='l_in_Ru') l_in_Rv = ll.InputLayer((None, N_u), input_var=sym_Rv, name='l_in_Rv') l_in_uid_origin = ll.InputLayer((None, ), input_var=sym_uid_origin, name='l_in_uid_origin') l_in_vid_origin = ll.InputLayer((None, ), input_var=sym_vid_origin, name='l_in_vid_origin') l_in_uid_minibatch = ll.InputLayer((None, ), input_var=sym_uid_minibatch, name='l_in_uid_minibatch') l_in_vid_minibatch = ll.InputLayer((None, ), input_var=sym_vid_minibatch, name='l_in_vid_minibatch') # Dropout layers l_in_Ru = ll.DropoutLayer(l_in_Ru, p=sym_dr_Ru, rescale=False, name='Dropout-l_in_Ru') l_in_Rv = ll.DropoutLayer(l_in_Rv, p=sym_dr_Rv, rescale=False, name='Dropout-l_in_Rv') # User encoder model h(Ru) l_enc_u_h = layers.OneHotEncodeLayer(l_in_Ru, num_units=D_u, rank=J_u, num_hots=N_stars, share_params=share_params, nonlinearity=None, name='Dense-l_enc_u_h') l_enc_u_h = ll.NonlinearityLayer(l_enc_u_h, nonlinearity=nonlin_enc, name='Nonlin-l_enc_u_h') # Item encoder model h(Rv) l_enc_v_h = layers.OneHotEncodeLayer(l_in_Rv, num_units=D_v, rank=J_v, num_hots=N_stars, share_params=share_params, nonlinearity=None, name='Dense-l_enc_v_h') l_enc_v_h = ll.NonlinearityLayer(l_enc_v_h, nonlinearity=nonlin_enc, name='Nonlin-l_enc_v_h') # User decoder model s(h(Ru)) l_dec_u_s = layers.OneHotDecodeLayer( [l_enc_u_h, l_in_vid_origin, l_in_uid_minibatch], num_units=N_v, rank=J_u, num_hots=N_stars, share_params=share_params, nonlinearity=None, name='Dense-l_dec_u_s') # Item decoder model s(h(Rv)) l_dec_v_s = layers.OneHotDecodeLayer( [l_enc_v_h, l_in_uid_origin, l_in_vid_minibatch], num_units=N_u, rank=J_v, num_hots=N_stars, share_params=share_params, nonlinearity=None, name='Dense-l_dec_v_s') # Likelihood model p(R) l_uv_s = ll.ElemwiseSumLayer([l_dec_u_s, l_dec_v_s], name='l_uv_s') l_r = ll.NonlinearityLayer(l_uv_s, nonlinearity=ln.softmax, name='l_r') l_r_ordinal = ll.NonlinearityLayer(l_uv_s, nonlinearity=layers.log_ordinal_softmax, name='l_r_ordinal') #---------------------------------------------------------------- # Likelihood and RMSE # training p_r_train, log_p_r_ordinal_train = ll.get_output([l_r, l_r_ordinal], deterministic=False) log_p_r = T.mean( parmesan.distributions.log_multinomial(sym_R_minibatch - 1, p_r_train)) R_minibatch_one_hot = lasagne.utils.one_hot(sym_R_minibatch, m=N_stars + 1)[:, 1:] log_p_r_ordinal = T.mean( T.sum(log_p_r_ordinal_train * R_minibatch_one_hot, axis=1)) regularization = lasagne.regularization.regularize_network_params( [l_r], lasagne.regularization.l2) cost_function = -( 1.0 - alpha ) * log_p_r - alpha * log_p_r_ordinal + weight_decay * regularization predicts_train = T.sum(p_r_train * T.shape_padleft(T.arange(1, 1 + N_stars)), axis=1) SE_train = T.sum(T.sqr(T.cast(sym_R_minibatch, floatX) - predicts_train)) # test sym_enc_u_h = T.fmatrix('enc_u_h') sym_enc_v_h = T.fmatrix('enc_v_h') enc_u_h_out, enc_v_h_out = ll.get_output([l_enc_u_h, l_enc_v_h], deterministic=True) p_r_test, = ll.get_output([l_r], inputs={ l_enc_u_h: sym_enc_u_h, l_enc_v_h: sym_enc_v_h }, deterministic=True) predicts_test = T.sum(p_r_test * T.shape_padleft(T.arange(1, 1 + N_stars)), axis=1) SE_test = T.sum(T.sqr(T.cast(sym_R_minibatch, floatX) - predicts_test)) #---------------------------------------------------------------- # Gradients clip_grad = 1 max_norm = 5 params = ll.get_all_params([ l_r, ], trainable=True) for p in params: logger.debug("%s: %s" % (p, p.get_value().shape)) grads = T.grad(cost_function, params) mgrads = 
lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
updates = lasagne.updates.adam(cgrads, params, beta1=0.9, beta2=0.999,
                               epsilon=1e-4, learning_rate=sym_lr)

#----------------------------------------------------------------
# Compile
# training function
logger.info("Compiling train_model ...")
train_model = theano.function(
    inputs=[
        sym_lr, sym_uid_origin, sym_uid_minibatch, sym_vid_origin,
        sym_vid_minibatch, sym_R_minibatch, sym_Ru, sym_Rv, sym_dr_Ru, sym_dr_Rv
    ],
    outputs=[log_p_r, SE_train],
    updates=updates,
)

# encoders
logger.info("Compiling encode_model ...")
u_encode_model = theano.function(inputs=[sym_Ru], outputs=enc_u_h_out)
v_encode_model = theano.function(inputs=[sym_Rv], outputs=enc_v_h_out)

# test function
logger.info("Compiling test_model ...")
test_model = theano.function(
    inputs=[
        sym_uid_origin, sym_uid_minibatch, sym_vid_origin, sym_vid_minibatch,
        sym_R_minibatch, sym_enc_u_h, sym_enc_v_h
    ],
    outputs=[SE_test],
)

#----------------------------------------------------------------
# Predict function
def predict(which_set='test'):
    assert which_set in ['valid', 'test']
    if which_set == 'valid':
        R_matrix_cond = R_matrix['train']
    else:
        R_matrix_cond = R_matrix['train'] + R_matrix['valid']

    # test statistics
    SE_epoch = 0
    n_pred_epoch = 0

    # precompute hidden representations of all users and items
    u_end = 0
    while u_end < N_u:
        u_start, u_end = u_end, min(u_end + batch_size_u, N_u)
        # create user mini-batch
        u_batch_ids = np.arange(u_start, u_end).astype('int32')
        # create conditionals
        Ru_minibatch = R_matrix_cond[u_batch_ids, :]
        # encode
        np_enc_u_h[u_batch_ids] = u_encode_model(Ru_minibatch)

    v_end = 0
    while v_end < N_v:
        v_start, v_end = v_end, min(v_end + batch_size_v, N_v)
        # create item mini-batch
        v_batch_ids = np.arange(v_start, v_end).astype('int32')
        # create conditionals
        Rv_minibatch = R_matrix_cond[:, v_batch_ids].T
        # encode
        np_enc_v_h[v_batch_ids] = v_encode_model(Rv_minibatch)

    # loop mini-batches
    u_end = 0
    while u_end < N_u:
        u_start, u_end = u_end, min(u_end + batch_size_u, N_u)
        v_end = 0
        while v_end < N_v:
            v_start, v_end = v_end, min(v_end + batch_size_v, N_v)
            # create user mini-batch and item mini-batch
            u_batch_ids = np.arange(u_start, u_end).astype('int32')
            v_batch_ids = np.arange(v_start, v_end).astype('int32')
            # get encoded vectors
            Ru_encoded = np_enc_u_h[u_batch_ids, :]
            Rv_encoded = np_enc_v_h[v_batch_ids, :]
            # create test samples mini-batch
            R_matrix_minibatch = R_matrix[which_set][np.ix_(u_batch_ids, v_batch_ids)]
            R_matrix_minibatch_sparse = scipy.sparse.coo_matrix(R_matrix_minibatch)
            # prepare user and item IDs needed
            uid_minibatch = R_matrix_minibatch_sparse.row
            vid_minibatch = R_matrix_minibatch_sparse.col
            R_minibatch = R_matrix_minibatch_sparse.data
            n_pred_step = R_minibatch.shape[0]
            if n_pred_step == 0:
                continue
            uid_origin = u_batch_ids[uid_minibatch]
            vid_origin = v_batch_ids[vid_minibatch]
            SE_step, = test_model(uid_origin, uid_minibatch, vid_origin,
                                  vid_minibatch, R_minibatch, Ru_encoded, Rv_encoded)
            SE_epoch += SE_step
            n_pred_epoch += n_pred_step

    # print info after test finished
    assert n_pred_epoch == N_rating[which_set]
    RMSE_epoch = np.sqrt(SE_epoch / n_pred_epoch) / (N_stars / 5.0)
    logger.critical("Estimated %s RMSE = %f (%d %s ratings)" %
                    (which_set, RMSE_epoch, n_pred_epoch, which_set))
    return RMSE_epoch

#----------------------------------------------------------------
# Training
best_valid_result = np.inf
best_model = None
n_epocs_without_improvement = 0
logger.warning("Training started.")
# loop epochs
for epoch in range(1, 1 + max_epoch):
    epoch_start_time = time.time()
    # training statistics
    LL_epoch_train, SE_epoch_train = 0, 0
    n_pred_epoch_train = 0
    # loop mini-batches
    for step in range(n_step):
        # sample i and j
        #i = np.random.randint(N_u)
        #j = np.random.randint(N_v)
        threshold_u = int(0.2 * N_u)
        threshold_v = int(0.2 * N_v)
        i = np.random.randint(low=threshold_u, high=N_u - min(threshold_u, batch_size_u))
        j = np.random.randint(low=threshold_v, high=N_v - min(threshold_v, batch_size_v))
        # calculate mini-batch sizes
        Bi = min(batch_size_u, N_u - i)
        Bj = min(batch_size_v, N_v - j)
        # sample user mini-batch and item mini-batch
        u_batch_ids_train = np.random.choice(N_u, Bi, replace=False).astype('int32')
        v_batch_ids_train = np.random.choice(N_v, Bj, replace=False).astype('int32')
        # create conditionals
        Ru_minibatch_train = R_matrix['train'][u_batch_ids_train, :]
        Rv_minibatch_train = R_matrix['train'][:, v_batch_ids_train].T
        # mask out the ratings being predicted in this mini-batch
        Ru_minibatch_train[:, v_batch_ids_train] = 0
        Rv_minibatch_train[:, u_batch_ids_train] = 0
        # calculate dropout rates
        dr_Ru = 1.0 - 1.0 * j / (N_v - Bj)
        dr_Rv = 1.0 - 1.0 * i / (N_u - Bi)
        # create training samples mini-batch
        R_matrix_minibatch_train = R_matrix['train'][np.ix_(u_batch_ids_train, v_batch_ids_train)]
        R_matrix_minibatch_sparse_train = scipy.sparse.coo_matrix(R_matrix_minibatch_train)
        # prepare user and item IDs needed
        uid_minibatch_train = R_matrix_minibatch_sparse_train.row
        vid_minibatch_train = R_matrix_minibatch_sparse_train.col
        R_minibatch_train = R_matrix_minibatch_sparse_train.data
        n_pred_step_train = R_minibatch_train.shape[0]
        if n_pred_step_train == 0:
            logger.warning('no training samples in current mini-batch (i=%d, j=%d).' % (i, j))
            continue
        uid_origin_train = u_batch_ids_train[uid_minibatch_train]
        vid_origin_train = v_batch_ids_train[vid_minibatch_train]
        # update parameters and calculate likelihood and RMSE
        LL_step_train, SE_step_train = train_model(
            lr, uid_origin_train, uid_minibatch_train, vid_origin_train,
            vid_minibatch_train, R_minibatch_train, Ru_minibatch_train,
            Rv_minibatch_train, dr_Ru, dr_Rv)
        LL_epoch_train += LL_step_train * n_pred_step_train
        SE_epoch_train += SE_step_train
        n_pred_epoch_train += n_pred_step_train

    # print info after epoch finished
    LL_epoch_train /= n_pred_epoch_train
    RMSE_epoch_train = np.sqrt(SE_epoch_train / n_pred_epoch_train) / (N_stars / 5.0)
    epoch_end_time = time.time()
    logger.info(
        "Epoch %d, Estimated training RMSE = %f, LL = %f (%d training ratings). Elapsed time %fs."
        % (epoch, RMSE_epoch_train, LL_epoch_train, n_pred_epoch_train,
           epoch_end_time - epoch_start_time))

    # validation
    RMSE_valid = predict('valid')

    # termination
    if RMSE_valid < best_valid_result:
        n_epocs_without_improvement = 0
        best_valid_result = RMSE_valid
        best_model = ll.get_all_param_values([l_r, ], trainable=True)
        logger.debug("New best model found!")
    else:
        n_epocs_without_improvement += 1
        if n_epocs_without_improvement >= lookahead:
            ll.set_all_param_values([l_r, ], best_model, trainable=True)
            if lr > 1e-5:
                n_epocs_without_improvement = 0
                lr /= 4
                logger.warning("Learning rate = %f now." % lr)
            else:
                logger.warning("Training finished.")
                break

#----------------------------------------------------------------
# Test
RMSE_test = predict('test')

#----------------------------------------------------------------
# Summarization
for k, v in six.iteritems(args_dict):
    logger.info(" %20s: %s" % (k, v))
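#----------------------------------------------------------------
# Illustrative sketch (not part of the original script): the conditional-masking step used
# in the training loop above, shown on a toy rating matrix. numpy is assumed to be imported
# as np, as elsewhere in this file; all names and shapes below are made up.
def _masking_sketch():
    R_train = np.random.randint(0, 6, size=(8, 10)).astype('float32')  # toy user x item ratings
    u_ids = np.array([1, 4], dtype='int32')                            # sampled user mini-batch
    v_ids = np.array([0, 3, 7], dtype='int32')                         # sampled item mini-batch
    Ru = R_train[u_ids, :]       # user conditionals (fancy indexing returns a copy)
    Rv = R_train[:, v_ids].T     # item conditionals
    Ru[:, v_ids] = 0             # hide the target items from the user conditionals
    Rv[:, u_ids] = 0             # hide the target users from the item conditionals
    return Ru, Rv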
def build_network_from_ae(classn):
    input_var = T.tensor4('input_var')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 120, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 240, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 640, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    prely = batch_norm(layers.Conv2DLayer(layer, 1024, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))

    featm = batch_norm(layers.Conv2DLayer(prely, 640, filter_size=(1, 1), nonlinearity=leaky_rectify))
    feat_map = batch_norm(layers.Conv2DLayer(featm, 100, filter_size=(1, 1), nonlinearity=rectify, name="feat_map"))
    maskm = batch_norm(layers.Conv2DLayer(prely, 100, filter_size=(1, 1), nonlinearity=leaky_rectify))
    mask_rep = batch_norm(layers.Conv2DLayer(maskm, 1, filter_size=(1, 1), nonlinearity=None), beta=None, gamma=None)
    mask_map = SoftThresPerc(mask_rep, perc=97.0, alpha=0.1, beta=init.Constant(0.5), tight=100.0, name="mask_map")
    enlyr = ChInnerProdMerge(feat_map, mask_map, name="encoder")

    layer = batch_norm(layers.Deconv2DLayer(enlyr, 1024, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 240, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 120, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = layers.Deconv2DLayer(layer, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity)

    glblf = batch_norm(layers.Conv2DLayer(prely, 128, filter_size=(1, 1), nonlinearity=leaky_rectify))
    glblf = layers.Pool2DLayer(glblf, pool_size=(5, 5), stride=5, mode='average_inc_pad')
    glblf = batch_norm(layers.Conv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    gllyr = batch_norm(layers.Conv2DLayer(glblf, 5, filter_size=(1, 1), nonlinearity=rectify), name="global_feature")
    glblf = batch_norm(layers.Deconv2DLayer(gllyr, 256, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(9, 9), stride=5, crop=(2, 2), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = layers.Deconv2DLayer(glblf, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity)

    layer = layers.ElemwiseSumLayer([layer, glblf])
    network = ReshapeLayer(layer, ([0], -1))
    mask_map.beta.set_value(np.float32(0.9 * mask_map.beta.get_value()))
    old_params = layers.get_all_params(network, trainable=True)

    # Adding more layers
    aug_var = T.matrix('aug_var')
    target_var = T.imatrix('targets')
    add_a = batch_norm(layers.Conv2DLayer(enlyr, 320, filter_size=(1, 1), nonlinearity=leaky_rectify))
    add_b = batch_norm(layers.Conv2DLayer(add_a, 320, filter_size=(1, 1), nonlinearity=leaky_rectify))
    add_c = batch_norm(layers.Conv2DLayer(add_b, 320, filter_size=(1, 1), nonlinearity=leaky_rectify))
    add_d = batch_norm(layers.Conv2DLayer(add_c, 320, filter_size=(1, 1), nonlinearity=leaky_rectify))
    add_0 = layers.Pool2DLayer(add_d, pool_size=(25, 25), stride=25, mode='average_inc_pad')
    add_1 = batch_norm(layers.DenseLayer(add_0, 100, nonlinearity=leaky_rectify))

    add_2 = batch_norm(layers.DenseLayer(gllyr, 320, nonlinearity=leaky_rectify))
    add_3 = batch_norm(layers.DenseLayer(add_2, 320, nonlinearity=leaky_rectify))
    add_4 = batch_norm(layers.DenseLayer(add_3, 100, nonlinearity=leaky_rectify))

    aug_layer = layers.InputLayer(shape=(None, aug_fea_n), input_var=aug_var)

    cat_layer = lasagne.layers.ConcatLayer([add_1, add_4, aug_layer], axis=1)

    hidden_layer = layers.DenseLayer(cat_layer, 80, nonlinearity=leaky_rectify)
    network = layers.DenseLayer(hidden_layer, classn, nonlinearity=sigmoid)

    all_params = layers.get_all_params(network, trainable=True)
    new_params = [x for x in all_params if x not in old_params]

    return network, new_params, input_var, aug_var, target_var
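# A hedged usage sketch, not part of the original file: one way the values returned by
# build_network_from_ae might be compiled into a Theano training function. theano, lasagne
# and lasagne.layers (as `layers`) are assumed to be imported at the top of this file, as
# elsewhere. The binary cross-entropy objective, the Adam learning rate, updating only
# `new_params` (fine-tuning the added head while the pretrained autoencoder weights stay
# frozen), and `classn=4` are illustrative assumptions.
def _compile_classifier_train_fn_sketch():
    network, new_params, input_var, aug_var, target_var = build_network_from_ae(classn=4)
    prediction = layers.get_output(network)  # sigmoid outputs, shape (batch, classn)
    loss = lasagne.objectives.binary_crossentropy(prediction, target_var).mean()
    updates = lasagne.updates.adam(loss, new_params, learning_rate=1e-4)
    # inputs: image batch, hand-crafted augmentation features, multi-hot label matrix
    return theano.function([input_var, aug_var, target_var], loss, updates=updates)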
def build_network(self, K, vocab_size, W_init):

    l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
    l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
    l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
    l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
    l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
    l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[8])
    l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[9])
    l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])

    doc_shp = self.inps[1].shape
    qry_shp = self.inps[3].shape

    l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size,
                                  output_size=self.embed_dim, W=W_init)  # B x N x 1 x DE
    l_doce = L.ReshapeLayer(l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
    l_qemb = L.EmbeddingLayer(l_qin, input_size=vocab_size,
                              output_size=self.embed_dim, W=l_docembed.W)
    l_qembed = L.ReshapeLayer(l_qemb, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x Q x DE
    l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2)  # B x N x 2

    if self.train_emb == 0:
        l_docembed.params[l_docembed.W].remove('trainable')
        l_qemb.params[l_qemb.W].remove('trainable')

    # char embeddings
    if self.use_chars:
        l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, self.char_dim)  # T x L x D
        l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP,
                            mask_input=l_tokmask, gradient_steps=GRAD_STEPS,
                            precompute_input=True, only_return_final=True)
        l_bgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP,
                            mask_input=l_tokmask, gradient_steps=GRAD_STEPS,
                            precompute_input=True, backwards=True,
                            only_return_final=True)  # T x 2D
        l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim // 2, nonlinearity=None)  # T x DE/2
        l_bckembed = L.DenseLayer(l_bgru, self.embed_dim // 2, nonlinearity=None)  # T x DE/2
        l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
        l_docchar_embed = IndexLayer([l_doctokin, l_embed])  # B x N x DE/2
        l_qchar_embed = IndexLayer([l_qtokin, l_embed])  # B x Q x DE/2

        l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
        l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)

    attentions = []
    if self.save_attn:
        l_m = PairwiseInteractionLayer([l_doce, l_qembed])
        attentions.append(L.get_output(l_m, deterministic=True))

    # K-1 gated-attention hops
    for i in range(K - 1):
        l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                                 precompute_input=True)
        l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                                 precompute_input=True, backwards=True)
        l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)  # B x N x DE

        l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                               precompute_input=True, backwards=True)
        l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1], axis=2)  # B x Q x DE

        l_m = PairwiseInteractionLayer([l_doc_1, l_q_c_1])
        l_doc_2_in = GatedAttentionLayer([l_doc_1, l_q_c_1, l_m],
                                         gating_fn=self.gating_fn,
                                         mask_input=self.inps[7])
        l_doce = L.dropout(l_doc_2_in, p=self.dropout)  # B x N x DE
        if self.save_attn:
            attentions.append(L.get_output(l_m, deterministic=True))

    if self.use_feat:
        l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2

    # final layer
    l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True)
    l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True, backwards=True)
    l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

    l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True, only_return_final=False)
    l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True, backwards=True,
                         only_return_final=False)
    l_q = L.ConcatLayer([l_fwd_q, l_bkd_q], axis=2)  # B x Q x 2D

    if self.save_attn:
        l_m = PairwiseInteractionLayer([l_doc, l_q])
        attentions.append(L.get_output(l_m, deterministic=True))

    l_prob = AttentionSumLayer([l_doc, l_q], self.inps[4], self.inps[12],
                               mask_input=self.inps[10])
    final = L.get_output(l_prob)
    final_v = L.get_output(l_prob, deterministic=True)

    return final, final_v, l_prob, l_docembed.W, attentions
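# A hedged sketch, not taken from the original model class: how the outputs of build_network
# might be wired into a training objective. The answer variable, the negative log-likelihood
# form of the loss, and the Adam learning rate are assumptions made for illustration;
# theano.tensor (as T) and lasagne are assumed imported, as elsewhere in this file.
def _build_training_ops_sketch(model, K, vocab_size, W_init, learning_rate=1e-3):
    answer_var = T.ivector('answers')  # assumed: index of the correct candidate per example
    final, final_v, l_prob, W_emb, attentions = model.build_network(K, vocab_size, W_init)
    # negative log-likelihood of the probability mass the network assigns to the answer
    loss = -T.log(final[T.arange(answer_var.shape[0]), answer_var] + 1e-8).mean()
    params = lasagne.layers.get_all_params(l_prob, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate)
    return answer_var, loss, updates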