def get_context(self, conv_in, avg=False): suf = '_avg' if avg else '' conv_out = [] # for n in [2,3,4,5,6,7,8,9]: # for n in [2,3,4,5]: for n in self.args.context_ngrams: conv = conv_in for i in range(self.args.conv_layers): conv = L.Conv1DLayer( conv, 128, n, name='conv_window_%d(%d)%s' % (n, i, suf), # W=HeNormal('relu') if not avg else Constant()) # (100, 128, 15-n+1) W=GlorotNormal('relu') if not avg else Constant()) # (100, 128, 15-n+1) conv = L.MaxPool1DLayer( conv, self.args.window_size - (n - 1) * self.args.conv_layers) # (100, 128, 1) conv = L.flatten(conv, 2) # (100, 128) conv_out.append(conv) x = L.concat(conv_out, axis=1) # (100, 1024) return x
def _buildConv(self): layer = layers.InputLayer(shape=(None, 3, 32, 32), input_var=self.X) layer = layers.DropoutLayer(layer, p=0.2) layer = maxoutConv(layer, num_filters=32 * 5, ds=5, filter_size=(5, 5), stride=(1, 1), pad='same') layer = layers.DropoutLayer(layer, p=0.5) layer = maxoutConv(layer, num_filters=32 * 5, ds=5, filter_size=(5, 5), stride=(1, 1), pad='same') layer = layers.flatten(layer, outdim=2) # 不加入展开层也可以,DenseLayer自动展开 layer = layers.DropoutLayer(layer, p=0.5) layer = layers.DenseLayer(layer, num_units=256, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.DropoutLayer(layer, p=0.5) layer = layers.DenseLayer(layer, num_units=10, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.softmax) return layer
def build_triamese_inception(inputlist, imgh=50, imgw=50): """ 'triamese' (one branch for each view, feeding a fully-connected network), model using a slightly modified set of Google inception modules """ input_var_x, input_var_u, input_var_v = \ inputlist[0], inputlist[1], inputlist[2] net = {} # Input layer tshape = (None, 1, imgw, imgh) net['input_x'] = InputLayer(shape=tshape, input_var=input_var_x) net['input_u'] = InputLayer(shape=tshape, input_var=input_var_u) net['input_v'] = InputLayer(shape=tshape, input_var=input_var_v) # nfilters: (pool_proj, 1x1, 3x3_reduce, 3x3, 5x5_reduce, 5x5) nfilters = [32, 64, 96, 128, 16, 32] net.update(build_inception_module('inc_x1', net['input_x'], nfilters)) net.update(build_inception_module('inc_u1', net['input_u'], nfilters)) net.update(build_inception_module('inc_v1', net['input_v'], nfilters)) net['dense_x'] = DenseLayer( dropout(flatten(net['inc_x1/output']), p=.5), num_units=100, nonlinearity=lasagne.nonlinearities.rectify) net['dense_u'] = DenseLayer( dropout(flatten(net['inc_u1/output']), p=.5), num_units=100, nonlinearity=lasagne.nonlinearities.rectify) net['dense_v'] = DenseLayer( dropout(flatten(net['inc_v1/output']), p=.5), num_units=100, nonlinearity=lasagne.nonlinearities.rectify) # Concatenate the parallel inputs net['concat'] = ConcatLayer((net['dense_x'], net['dense_u'], net['dense_v'])) # And, finally, the 11-unit output layer with 50% dropout on its inputs: net['output_prob'] = DenseLayer( dropout(net['concat'], p=.5), num_units=11, nonlinearity=lasagne.nonlinearities.softmax) logger.info("n-parameters: {}".format( lasagne.layers.count_params(net['output_prob'])) ) return net['output_prob']
def build_triamese_inception(inputlist, imgh=50, imgw=50): """ 'triamese' (one branch for each view, feeding a fully-connected network), model using a slightly modified set of Google inception modules """ input_var_x, input_var_u, input_var_v = \ inputlist[0], inputlist[1], inputlist[2] net = {} # Input layer tshape = (None, 1, imgw, imgh) net['input_x'] = InputLayer(shape=tshape, input_var=input_var_x) net['input_u'] = InputLayer(shape=tshape, input_var=input_var_u) net['input_v'] = InputLayer(shape=tshape, input_var=input_var_v) # nfilters: (pool_proj, 1x1, 3x3_reduce, 3x3, 5x5_reduce, 5x5) nfilters = [32, 64, 96, 128, 16, 32] net.update(build_inception_module('inc_x1', net['input_x'], nfilters)) net.update(build_inception_module('inc_u1', net['input_u'], nfilters)) net.update(build_inception_module('inc_v1', net['input_v'], nfilters)) net['dense_x'] = DenseLayer( dropout(flatten(net['inc_x1/output']), p=.5), num_units=100, nonlinearity=lasagne.nonlinearities.rectify) net['dense_u'] = DenseLayer( dropout(flatten(net['inc_u1/output']), p=.5), num_units=100, nonlinearity=lasagne.nonlinearities.rectify) net['dense_v'] = DenseLayer( dropout(flatten(net['inc_v1/output']), p=.5), num_units=100, nonlinearity=lasagne.nonlinearities.rectify) # Concatenate the parallel inputs net['concat'] = ConcatLayer((net['dense_x'], net['dense_u'], net['dense_v'])) # And, finally, the 11-unit output layer with 50% dropout on its inputs: net['output_prob'] = DenseLayer( dropout(net['concat'], p=.5), num_units=11, nonlinearity=lasagne.nonlinearities.softmax) print("n-parameters: ", lasagne.layers.count_params(net['output_prob'])) return net['output_prob']
def build_cnn(input): #data_size = (None,103,130) # Batch size x Img Channels x Height x Width #input_var = T.tensor3(name = "input",dtype='int64') input_var = input #values = np.array(np.random.randint(0,102,(1,9,50))) #input_var.tag.test_value = values #number sentences x words x characters input_layer = L.InputLayer((None,9,50), input_var=input) W = create_char_embedding_matrix() embed_layer = L.EmbeddingLayer(input_layer, input_size=103,output_size=101, W=W) #print "EMBED", L.get_output(embed_layer).tag.test_value.shape reshape_embed = L.reshape(embed_layer,(-1,50,101)) #print "reshap embed", L.get_output(reshape_embed).tag.test_value.shape conv_layer_1 = L.Conv1DLayer(reshape_embed, 55, 2) conv_layer_2 = L.Conv1DLayer(reshape_embed, 55, 3) #print "TEST" #print "Convolution Layer 1", L.get_output(conv_layer_1).tag.test_value.shape #print "Convolution Layer 2", L.get_output(conv_layer_2).tag.test_value.shape #flatten_conv_1 = L.flatten(conv_layer_1,3) #flatten_conv_2 = L.flatten(conv_layer_2,3) #reshape_max_1 = L.reshape(flatten_conv_1,(-1,49)) #reshape_max_2 = L.reshape(flatten_conv_2, (-1,48)) #print "OUTPUT Flatten1", L.get_output(flatten_conv_1).tag.test_value.shape #print "OUTPUT Flatten2", L.get_output(flatten_conv_2).tag.test_value.shape #print "OUTPUT reshape_max_1", L.get_output(reshape_max_1).tag.test_value.shape #print "OUTPUT reshape_max_2", L.get_output(reshape_max_2).tag.test_value.shape pool_layer_1 = L.MaxPool1DLayer(conv_layer_1, pool_size=54) pool_layer_2 = L.MaxPool1DLayer(conv_layer_2, pool_size=53) #print "OUTPUT POOL1", L.get_output(pool_layer_1).tag.test_value.shape #print "OUTPUT POOL2",L.get_output(pool_layer_2).tag.test_value.shape merge_layer = L.ConcatLayer([pool_layer_1, pool_layer_2], 1) flatten_merge = L.flatten(merge_layer, 2) reshape_merge = L.reshape(flatten_merge, (1,9,110)) print L.get_output(reshape_embed).shape #print L.get_output(reshape_merge).tag.test_value.shape return reshape_merge, char_index_lookup
def get_conv_input(self, sidx, tidx, avg=False): suf = '_avg' if avg else '' feat_embs = [ self.manager.feats[name].get_emb_layer(sidx, tidx, avg=avg) for name in self.args.source_feats ] # TODO: change the meaning if self.args.lex == 'mix': concat_emb = L.ElemwiseSumLayer(feat_embs) # (100, 15, 256) else: concat_emb = L.concat(feat_embs, axis=2) # (100, 15, 256+100) pos = np.array([0] * (self.args.window_size / 2) + [1] + [0] * (self.args.window_size / 2)).astype( theano.config.floatX) post = theano.shared(pos[np.newaxis, :, np.newaxis], borrow=True) # (1, 15, 1) posl = L.InputLayer( (None, self.args.window_size, 1), input_var=T.extra_ops.repeat(post, sidx.shape[0], axis=0)) # (100, 15, 1) conv_in = L.concat([concat_emb, posl], axis=2) # (100, 15, 256+1) if self.args.pos_emb: posint = L.flatten( L.ExpressionLayer(posl, lambda x: T.cast(x, 'int64'))) # (100, 15) pos_emb = L.EmbeddingLayer( posint, self.args.window_size, 8, name='epos' + suf, W=Normal(0.01) if not avg else Constant()) # (100, 15, 8) pos_emb.params[pos_emb.W].remove('regularizable') conv_in = L.concat([concat_emb, posl, pos_emb], axis=2) # (100, 15, 256+1+8) # # squeeze # if self.args.squeeze: # conv_in = L.DenseLayer(conv_in, num_units=self.args.squeeze, name='squeeze'+suf, num_leading_axes=2, # W=HeNormal('relu')) # (100, 15, 256) conv_in = L.dimshuffle(conv_in, (0, 2, 1)) # (100, 256+1, 15) return conv_in
def _build(X): layer = layers.InputLayer(shape=(None, 1, 28, 28), input_var=X) layer = layers.Conv2DLayer(layer, num_filters=32, filter_size=(5, 5), stride=(1, 1), pad='same', untie_biases=False, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) visual1 = layers.get_output(layer) layer = layers.MaxPool2DLayer(layer, pool_size=(2, 2), stride=None, pad=(0, 0), ignore_border=False) layer = layers.Conv2DLayer(layer, num_filters=32, filter_size=(5, 5), stride=(1, 1), pad='same', untie_biases=False, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) visual2 = layers.get_output(layer) layer = layers.MaxPool2DLayer(layer, pool_size=(2, 2), stride=None, pad=(0, 0), ignore_border=False) layer = layers.flatten(layer, outdim=2) layer = layers.DropoutLayer(layer, p=0.5) layer = layers.DenseLayer(layer, num_units=256, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.DropoutLayer(layer, p=0.5) layer = layers.DenseLayer(layer, num_units=10, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.softmax) return layer, visual1, visual2
def layer_context(layer_ctx, ctx_nblayers, ctx_nbfilters, ctx_winlen, hiddensize, nonlinearity, bn_axes=None, bn_cnn_axes=None, critic=False, useLRN=True): layer_ctx = ll.dimshuffle(layer_ctx, [0, 'x', 1, 2], name='ctx.dimshuffle_to_2DCNN') for layi in xrange(ctx_nblayers): layerstr = 'ctx.l' + str(1 + layi) + '_CNN{}x{}x{}'.format( ctx_nbfilters, ctx_winlen, 1) layer_ctx = ll.Conv2DLayer(layer_ctx, num_filters=ctx_nbfilters, filter_size=[ctx_winlen, 1], stride=1, pad='same', nonlinearity=nonlinearity, name=layerstr) if not critic and (not bn_cnn_axes is None): layer_ctx = ll.batch_norm(layer_ctx, axes=bn_cnn_axes) # layer_ctx = ll.batch_norm(layer_GatedConv2DLayer(layer_ctx, ctx_nbfilters, [ctx_winlen,1], stride=1, pad='same', nonlinearity=nonlinearity, name=layerstr)) if critic and useLRN: layer_ctx = ll.LocalResponseNormalization2DLayer(layer_ctx) layer_ctx = ll.dimshuffle(layer_ctx, [0, 2, 3, 1], name='ctx.dimshuffle_back') layer_ctx = ll.flatten(layer_ctx, outdim=3, name='ctx.flatten') for layi in xrange(2): layerstr = 'ctx.l' + str(1 + ctx_nblayers + layi) + '_FC{}'.format(hiddensize) layer_ctx = ll.DenseLayer(layer_ctx, hiddensize, nonlinearity=nonlinearity, num_leading_axes=2, name=layerstr) if not critic and (not bn_axes is None): layer_ctx = ll.batch_norm(layer_ctx, axes=bn_axes) return layer_ctx
def build_model(input_var): layer = layers.InputLayer(shape=(None, 3, 224, 224), input_var=input_var) layer = layers.Conv2DLayer(layer, num_filters=64, filter_size=(3, 3), stride=(1, 1), pad='same') layer = layers.MaxPool2DLayer(layer, pool_size=(3, 3), stride=(2, 2), pad=(0, 0), ignore_border=False) layer = layers.Conv2DLayer(layer, num_filters=128, filter_size=(3, 3), stride=(1, 1), pad='same') layer = layers.MaxPool2DLayer(layer, pool_size=(3, 3), stride=(2, 2), pad=(0, 0), ignore_border=False) layer = layers.Conv2DLayer(layer, num_filters=256, filter_size=(3, 3), stride=(1, 1), pad='same') layer = layers.MaxPool2DLayer(layer, pool_size=(3, 3), stride=(2, 2), pad=(0, 0), ignore_border=False) layer = layers.Conv2DLayer(layer, num_filters=512, filter_size=(3, 3), stride=(1, 1), pad='same') layer = layers.MaxPool2DLayer(layer, pool_size=(3, 3), stride=(2, 2), pad=(0, 0), ignore_border=False) layer = layers.Conv2DLayer(layer, num_filters=512, filter_size=(3, 3), stride=(1, 1), pad='same') layer = layers.MaxPool2DLayer(layer, pool_size=(3, 3), stride=(2, 2), pad=(0, 0), ignore_border=False) layer = layers.flatten(layer, outdim=2) layer = layers.DenseLayer(layer, num_units=4096, nonlinearity=nonlinearities.rectify) layer = layers.DropoutLayer(layer, p=0.5) layer = layers.DenseLayer(layer, num_units=4096, nonlinearity=nonlinearities.rectify) layer = layers.DropoutLayer(layer, p=0.5) layer = layers.DenseLayer(layer, num_units=2, nonlinearity=nonlinearities.softmax) return layer
def _build(self): layer = layers.InputLayer(shape=(None, 3, 32, 32), input_var=self.X) layer = nin(layer, conv_filters=192, filter_size=(5, 5), pad=2, cccp1_filters=160, cccp2_filters=96) layer = layers.Pool2DLayer(layer, pool_size=(3, 3), stride=2, pad=(0, 0), ignore_border=False, mode='max') layer = layers.DropoutLayer(layer, p=0.5) layer = nin(layer, conv_filters=192, filter_size=(5, 5), pad=2, cccp1_filters=192, cccp2_filters=192) layer = layers.Pool2DLayer(layer, pool_size=(3, 3), stride=2, ignore_border=False, mode='average_exc_pad') layer = layers.DropoutLayer(layer, p=0.5) layer = nin(layer, conv_filters=192, filter_size=(3, 3), pad=1, cccp1_filters=192, cccp2_filters=10) layer = layers.Pool2DLayer(layer, pool_size=(8, 8), stride=1, ignore_border=False, mode='average_exc_pad') layer = layers.flatten(layer, outdim=2) layer = layers.NonlinearityLayer(layer, nonlinearity=nonlinearities.softmax) return layer
def _build(self): layer = layers.InputLayer(shape=(None, 3, 112, 112), input_var=self.X) layer = layers.Conv2DLayer(layer, num_filters=64, filter_size=(5, 5), stride=(1, 1), pad='same', untie_biases=False, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.MaxPool2DLayer(layer, pool_size=(2, 2), stride=None, pad=(0, 0), ignore_border=False) layer = layers.Conv2DLayer(layer, num_filters=64, filter_size=(5, 5), stride=(1, 1), pad='same', untie_biases=False, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.MaxPool2DLayer(layer, pool_size=(8, 8), stride=None, pad=(0, 0), ignore_border=False) layer = layers.flatten(layer, outdim=2) # 不加入展开层也可以,DenseLayer自动展开 layer = layers.DropoutLayer(layer, p=0.5) layer = layers.DenseLayer(layer, num_units=2048, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.DropoutLayer(layer, p=0.5) layer = layers.DenseLayer(layer, num_units=2, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.softmax) return layer
def _create_network(available_actions_num, input_shape, visual_input_var, n_variables, variables_input_var): dqn = InputLayer(shape=[None, input_shape.frames, input_shape.y, input_shape.x], input_var=visual_input_var) dqn = Conv2DLayer(dqn, num_filters=32, filter_size=[8, 8], stride=[4, 4], nonlinearity=rectify, W=GlorotUniform("relu"), b=Constant(.1)) dqn = Conv2DLayer(dqn, num_filters=64, filter_size=[4, 4], stride=[2, 2], nonlinearity=rectify, W=GlorotUniform("relu"), b=Constant(.1)) dqn = Conv2DLayer(dqn, num_filters=64, filter_size=[3, 3], nonlinearity=rectify, W=GlorotUniform("relu"), b=Constant(.1)) if n_variables > 0: variables_layer = InputLayer(shape=[None, n_variables], input_var=variables_input_var) dqn = ConcatLayer((flatten(dqn), variables_layer)) dqn = DenseLayer(dqn, num_units=512, nonlinearity=rectify, W=GlorotUniform("relu"), b=Constant(.1)) dqn = DenseLayer(dqn, num_units=available_actions_num, nonlinearity=None) return dqn
def create_nn(): ''' Returns the theano function - train,test Returns the 'KerasNet' Using default values of adam - learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08 Input to the NN is (batch_size,3,32,32) and corresponding classes it belong to (batch_size,) ''' l_in = InputLayer((batch_size,3,32,32)) l_in_bn = BatchNormLayer(l_in) conv1 = Conv2DLayer(l_in_bn,pad='same',num_filters=64,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx64x32x32 conv1_1 = Conv2DLayer(conv1,pad='same',num_filters=64,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx64x32x32 conv1_mp = MaxPool2DLayer(conv1_1,pool_size=(2,2)) #Bx64x16x16 conv1_do = dropout(conv1_mp,p=0.25) conv2 = Conv2DLayer(conv1_do,pad='same',num_filters=128,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx128x16x16 conv2_1 = Conv2DLayer(conv2,pad='same',num_filters=128,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx128x16x16 conv2_mp = MaxPool2DLayer(conv2_1,pool_size=(2,2)) #Bx128x8x8 conv2_do = dropout(conv2_mp,p=0.25) flat = flatten(conv2_do,2) #Bx8192 fc = DenseLayer(flat,num_units=512,nonlinearity=lasagne.nonlinearities.rectify) #Bx512 fc_do = dropout(fc, p=0.5) network = DenseLayer(fc_do, num_units=nb_classes, nonlinearity=lasagne.nonlinearities.softmax ) #Bxnb_classes net_output = lasagne.layers.get_output(network) true_output = T.matrix() all_params = lasagne.layers.get_all_params(network,trainable=True) loss = T.mean(lasagne.objectives.categorical_crossentropy(net_output,true_output)) updates = lasagne.updates.adam(loss,all_params) train = theano.function(inputs= [l_in.input_var,true_output] , outputs=[net_output,loss], updates = updates) test = theano.function(inputs= [l_in.input_var], outputs= [net_output]) return train,test,network
def build_network(W, number_unique_tags, longest_word, longest_sentence, input_var=None): print("Building network ...") input_layer = L.InputLayer((None, longest_sentence, longest_word), input_var=input_var) embed_layer = L.EmbeddingLayer(input_layer, input_size=103, output_size=101, W=W) reshape_embed = L.reshape(embed_layer, (-1, longest_word, 101)) conv_layer_1 = L.Conv1DLayer(reshape_embed, longest_word, 2) conv_layer_2 = L.Conv1DLayer(reshape_embed, longest_word, 3) pool_layer_1 = L.MaxPool1DLayer(conv_layer_1, pool_size=longest_word - 1) pool_layer_2 = L.MaxPool1DLayer(conv_layer_2, pool_size=longest_word - 2) merge_layer = L.ConcatLayer([pool_layer_1, pool_layer_2], 1) flatten_merge = L.flatten(merge_layer, 2) reshape_merge = L.reshape(flatten_merge, (-1, longest_sentence, int(longest_word * 2))) l_re = lasagne.layers.RecurrentLayer( reshape_merge, N_HIDDEN, nonlinearity=lasagne.nonlinearities.sigmoid, mask_input=None) l_out = lasagne.layers.DenseLayer( l_re, number_unique_tags, nonlinearity=lasagne.nonlinearities.softmax) print "DONE BUILDING NETWORK" return l_out
def _build(self): layer = layers.InputLayer(shape=(None, 1, 28, 28), input_var=self.X) layer = layers.Conv2DLayer(layer, num_filters=128, filter_size=(1, 1), stride=(1, 1), pad='same', untie_biases=False, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.Conv2DLayer(layer, num_filters=128, filter_size=(1, 1), stride=(1, 1), pad='same', untie_biases=False, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=None, pad=(0, 0), ignore_border=False, mode='average_exc_pad') layer = layers.Conv2DLayer(layer, num_filters=512, filter_size=(1, 1), stride=(1, 1), pad='same', untie_biases=False, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.Conv2DLayer(layer, num_filters=512, filter_size=(1, 1), stride=(1, 1), pad='same', untie_biases=False, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=None, pad=(0, 0), ignore_border=False, mode='average_exc_pad') layer = layers.Conv2DLayer(layer, num_filters=2048, filter_size=(1, 1), stride=(1, 1), pad='same', untie_biases=False, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.Conv2DLayer(layer, num_filters=2048, filter_size=(1, 1), stride=(1, 1), pad='same', untie_biases=False, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=None, pad=(0, 0), ignore_border=False, mode='max') layer = layers.flatten(layer, outdim=2) # 不加入展开层也可以,DenseLayer自动展开 layer = layers.DropoutLayer(layer, p=0.5) layer = layers.DenseLayer(layer, num_units=256, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.rectify) layer = layers.DropoutLayer(layer, p=0.5) layer = layers.DenseLayer(layer, num_units=10, W=init.GlorotUniform(), b=init.Constant(0.), nonlinearity=nonlinearities.softmax) return layer
drop = dropout(pool,0.2) return drop l_in = InputLayer(shape=(filter_size=(3,3)2, num_filters=32)) inputNorm = BatchNormLayer(l_in) input_drop = dropout(inputNorm,0.2) ## The network has 3 sets of conv and maxout networks. set1 = get_multiple_block(input_drop,num_filt=32,k=3,justheconv=1) set2 = get_multiple_block(set1,num_filt=48,k=2) set3 = get_multiple_block(set2,num_filt=80) set4 = get_multiple_block(set3,num_filt=128, pooling_size=(8,8)) # Dense Layers follow. h_flat = flatten(set4) ## 5 Way Max-Out Layer (DenseMaxout) ''' Reference - https://github.com/fchollet/keras/pull/3128 ''' h_dense = [] for _ in xrange(5): h_dense.append( DenseLayer(h_flat,500,W = lasagne.init.GlorotUniform(), nonlinearity = lasagne.nonlinearities.linear)) h17 = ElemwiseMergeLayer( h_dense, merge_function=T.maximum()) h17 = BatchNormLayer(h17) h17_drop = dropout(h17,0.2)
def __init__(self, gate_controllers, channels, gate_nonlinearities=nonlinearities.sigmoid, bias_init=init.Constant(), weight_init=init.Normal(), **kwargs): """ An overly generic interface for one-step gate, stacked gates or gate applier. If several channels are given, stacks them for quicker execution. gate_controllers - a single layer or a list/tuple of such layers that gate depends on (for most RNNs, that's input and previous memory state) channels - a single layer or integer or a list/tuple of layers/integers if a layer, that defines a layer that should be multiplied by the gate output if an integer - that defines a number of units of a gate -- and these are the units to be returned gate_nonlinearities - a single function or a list of such(channel-wise), - defining nonlinearities for gates on corresponding channels bias_init - an initializer or a list (channel-wise) of initializers for bias(b) parameters - (None, lasagne.init, theano variable or numpy array) - None means no bias weight init - an initializer OR a list of initializers for (channel-wise) - OR a list of lists of initializers (channel, controller) - (lasagne.init, theano variable or numpy array) """ self.channels = check_list(channels) self.gate_controllers = check_list(gate_controllers) # check channel types for chl in self.channels: assert is_layer(chl) or (type(chl) == int) # separate layers from non-layers self.channel_layers = list(filter(is_layer, self.channels)) self.channel_ints = [v for v in self.channels if not is_layer(v)] # flatten layers to 2 dimensions for i in range(len(self.channel_layers)): layer = self.channel_layers[i] if type(layer) == int: continue lname = layer.name or "" if len(layer.output_shape) != 2: warn("One of the channels (name='%s') has an input dimension of %s and will be flattened." % ( lname, layer.output_shape)) self.channel_layers[i] = flatten(layer, outdim=2, name=lname) assert len(self.channel_layers[i].output_shape) == 2 # flatten layers to 2 dimensions for i in range(len(self.gate_controllers)): layer = self.gate_controllers[i] lname = layer.name or "" if len(layer.output_shape) != 2: warn("One of the gate controllers (name='%s') has an input dimension of %s and will be flattened." 
% ( lname, layer.output_shape)) self.gate_controllers[i] = flatten(layer, outdim=2, name=lname) assert len(self.gate_controllers[i].output_shape) == 2 # initialize merge layer incomings = self.channel_layers + self.gate_controllers # default name kwargs["name"] = kwargs.get("name", "YetAnother" + self.__class__.__name__) output_names = ["%s.channel.%i"%(kwargs["name"],i) for i in range(len(self.channels))] # determine whether or not user defined a fixed batch size batch_sizes = [chl.output_shape[0] for chl in filter(is_layer, self.channels)] batch_size = reduce(lambda a,b: a or b, batch_sizes,None) output_shapes = [ chl.output_shape if is_layer(chl) else (batch_size,chl) for chl in self.channels] output_shapes = OrderedDict(zip(output_names,output_shapes)) output_dtypes = [ get_layer_dtype(chl) for chl in self.channels] output_dtypes = OrderedDict(zip(output_names,output_dtypes)) super(GateLayer, self).__init__(incomings, output_shapes=output_shapes, output_dtypes=output_dtypes, **kwargs) # nonlinearities self.gate_nonlinearities = check_list(gate_nonlinearities) self.gate_nonlinearities = [(nl if (nl is not None) else (lambda v: v)) for nl in self.gate_nonlinearities] # must be either one common nonlinearity or one per channel assert len(self.gate_nonlinearities) in (1, len(self.channels)) if len(self.gate_nonlinearities) == 1: self.gate_nonlinearities *= len(self.channels) # cast bias init to a list bias_init = check_list(bias_init) assert len(bias_init) in (1, len(self.channels)) if len(bias_init) == 1: bias_init *= len(self.channels) # cast weight init to a list of lists [channel][controller] weight_init = check_list(weight_init) assert len(weight_init) in (1, len(self.channels)) if len(weight_init) == 1: weight_init *= len(self.channels) for i in range(len(self.channels)): weight_init[i] = check_list(weight_init[i]) assert len(weight_init[i]) in (1, len(self.gate_controllers)) if len(weight_init[i]) == 1: weight_init[i] *= len(self.gate_controllers) self.gate_b = [] # a list of biases for channels self.gate_W = [list() for _ in self.gate_controllers] # a list of lists of weights [controller][channel] for chl_i, (channel, b_init, channel_w_inits) in enumerate(zip(self.channels, bias_init, weight_init )): if is_layer(channel): channel_name = channel.name or "chl" + str(chl_i) channel_n_units = channel.output_shape[1] else: channel_name = "chl" + str(chl_i) channel_n_units = channel # add bias if b_init is not None: self.gate_b.append( self.add_param( spec=b_init, shape=(channel_n_units,), name="b_%s" % (channel_name) ) ) else: self.gate_b.append(T.zeros((channel_n_units,))) # add weights for ctrl_i, (controller, w_init) in enumerate(zip(self.gate_controllers, channel_w_inits )): ctrl_name = controller.name or "ctrl" + str(ctrl_i) # add bias self.gate_W[ctrl_i].append( self.add_param( spec=w_init, shape=(controller.output_shape[1], channel_n_units), name="W_%s_%s" % (ctrl_name, channel_name) )) # a list where i-th element contains weights[i-th_gate_controller] for all outputs stacked self.gate_W_stacked = [T.concatenate(weights, axis=1) for weights in self.gate_W] # a list of biases for the respective outputs stacked self.gate_b_stacked = T.concatenate(self.gate_b)
def __init__(self, gate_controllers, channels, gate_nonlinearities=nonlinearities.sigmoid, bias_init=init.Constant(), weight_init=init.Normal(), **kwargs): self.channels = check_list(channels) self.gate_controllers = check_list(gate_controllers) # check channel types for chl in self.channels: assert is_layer(chl) or (type(chl) == int) # separate layers from non-layers self.channel_layers = list(filter(is_layer, self.channels)) self.channel_ints = [v for v in self.channels if not is_layer(v)] # flatten layers to 2 dimensions for i in range(len(self.channel_layers)): layer = self.channel_layers[i] if type(layer) == int: continue lname = layer.name or "" if len(layer.output_shape) != 2: warn("One of the channels (name='%s') has an input dimension of %s and will be flattened." % ( lname, layer.output_shape)) self.channel_layers[i] = flatten(layer, outdim=2, name=lname) assert len(self.channel_layers[i].output_shape) == 2 # flatten layers to 2 dimensions for i in range(len(self.gate_controllers)): layer = self.gate_controllers[i] lname = layer.name or "" if len(layer.output_shape) != 2: warn("One of the gate controllers (name='%s') has an input dimension of %s and will be flattened." % ( lname, layer.output_shape)) self.gate_controllers[i] = flatten(layer, outdim=2, name=lname) assert len(self.gate_controllers[i].output_shape) == 2 # initialize merge layer incomings = self.channel_layers + self.gate_controllers # default name kwargs["name"] = kwargs.get("name", "YetAnother" + self.__class__.__name__) output_names = ["%s.channel.%i"%(kwargs["name"],i) for i in range(len(self.channels))] # determine whether or not user defined a fixed batch size batch_sizes = [chl.output_shape[0] for chl in filter(is_layer, self.channels)] batch_size = reduce(lambda a,b: a or b, batch_sizes,None) output_shapes = [ chl.output_shape if is_layer(chl) else (batch_size,chl) for chl in self.channels] output_shapes = OrderedDict(zip(output_names,output_shapes)) output_dtypes = [ get_layer_dtype(chl) for chl in self.channels] output_dtypes = OrderedDict(zip(output_names,output_dtypes)) super(GateLayer, self).__init__(incomings, output_shapes=output_shapes, output_dtypes=output_dtypes, **kwargs) # nonlinearities self.gate_nonlinearities = check_list(gate_nonlinearities) self.gate_nonlinearities = [(nl if (nl is not None) else (lambda v: v)) for nl in self.gate_nonlinearities] # must be either one common nonlinearity or one per channel assert len(self.gate_nonlinearities) in (1, len(self.channels)) if len(self.gate_nonlinearities) == 1: self.gate_nonlinearities *= len(self.channels) # cast bias init to a list bias_init = check_list(bias_init) assert len(bias_init) in (1, len(self.channels)) if len(bias_init) == 1: bias_init *= len(self.channels) # cast weight init to a list of lists [channel][controller] weight_init = check_list(weight_init) assert len(weight_init) in (1, len(self.channels)) if len(weight_init) == 1: weight_init *= len(self.channels) for i in range(len(self.channels)): weight_init[i] = check_list(weight_init[i]) assert len(weight_init[i]) in (1, len(self.gate_controllers)) if len(weight_init[i]) == 1: weight_init[i] *= len(self.gate_controllers) self.gate_b = [] # a list of biases for channels self.gate_W = [list() for _ in self.gate_controllers] # a list of lists of weights [controller][channel] for chl_i, (channel, b_init, channel_w_inits) in enumerate(zip(self.channels, bias_init, weight_init )): if is_layer(channel): channel_name = channel.name or "chl" + str(chl_i) channel_n_units = 
channel.output_shape[1] else: channel_name = "chl" + str(chl_i) channel_n_units = channel # add bias if b_init is not None: self.gate_b.append( self.add_param( spec=b_init, shape=(channel_n_units,), name="b_%s" % (channel_name) ) ) else: self.gate_b.append(T.zeros((channel_n_units,))) # add weights for ctrl_i, (controller, w_init) in enumerate(zip(self.gate_controllers, channel_w_inits )): ctrl_name = controller.name or "ctrl" + str(ctrl_i) # add bias self.gate_W[ctrl_i].append( self.add_param( spec=w_init, shape=(controller.output_shape[1], channel_n_units), name="W_%s_%s" % (ctrl_name, channel_name) )) # a list where i-th element contains weights[i-th_gate_controller] for all outputs stacked self.gate_W_stacked = [T.concatenate(weights, axis=1) for weights in self.gate_W] # a list of biases for the respective outputs stacked self.gate_b_stacked = T.concatenate(self.gate_b)
def get_char2word(self, ic, avg=False): suf = '_avg' if avg else '' ec = L.EmbeddingLayer( ic, self.args.vc, self.args.nc, name='ec' + suf, W=HeNormal() if not avg else Constant()) # (100, 24, 32, 16) ec.params[ec.W].remove('regularizable') if self.args.char_model == 'CNN': lds = L.dimshuffle(ec, (0, 3, 1, 2)) # (100, 16, 24, 32) ls = [] for n in self.args.ngrams: lconv = L.Conv2DLayer( lds, self.args.nf, (1, n), untie_biases=True, W=HeNormal('relu') if not avg else Constant(), name='conv_%d' % n + suf) # (100, 64/4, 24, 32-n+1) lpool = L.MaxPool2DLayer( lconv, (1, self.args.max_len - n + 1)) # (100, 64, 24, 1) lpool = L.flatten(lpool, outdim=3) # (100, 16, 24) lpool = L.dimshuffle(lpool, (0, 2, 1)) # (100, 24, 16) ls.append(lpool) xc = L.concat(ls, axis=2) # (100, 24, 64) return xc elif self.args.char_model == 'LSTM': ml = L.ExpressionLayer( ic, lambda x: T.neq(x, 0)) # mask layer (100, 24, 32) ml = L.reshape(ml, (-1, self.args.max_len)) # (2400, 32) gate_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal()) cell_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal(), W_cell=None, nonlinearity=tanh) lstm_in = L.reshape( ec, (-1, self.args.max_len, self.args.nc)) # (2400, 32, 16) lstm_f = L.LSTMLayer( lstm_in, self.args.nw / 2, mask_input=ml, grad_clipping=10., learn_init=True, peepholes=False, precompute_input=True, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, # unroll_scan=True, only_return_final=True, name='forward' + suf) # (2400, 64) lstm_b = L.LSTMLayer( lstm_in, self.args.nw / 2, mask_input=ml, grad_clipping=10., learn_init=True, peepholes=False, precompute_input=True, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, # unroll_scan=True, only_return_final=True, backwards=True, name='backward' + suf) # (2400, 64) remove_reg(lstm_f) remove_reg(lstm_b) if avg: set_zero(lstm_f) set_zero(lstm_b) xc = L.concat([lstm_f, lstm_b], axis=1) # (2400, 128) xc = L.reshape(xc, (-1, self.args.sw, self.args.nw)) # (100, 24, 256) return xc
def __init__(self, emb_dim, rnn_dim, hid_dim, vocab_size, context, cell='lstm', add_dense=True, dropout_p=0.2, depth=1, prod=False, **cell_args): self.emb_dim = emb_dim self.rnn_dim = rnn_dim self.hid_dim = hid_dim self.vocab_size = vocab_size self.context = context self.cell = cell self.add_dense = add_dense self.depth = depth self.cell_args = cell_args self.prod = prod # Input is integer matrices (batch_size, seq_length) input_layer = InputLayer(shape=(None, context * 2), input_var=T.imatrix()) self.emb_W = np.random.uniform(size=(vocab_size, emb_dim), low=-0.05, high=0.05).astype(np.float32) emb = EmbeddingLayer(input_layer, input_size=vocab_size, output_size=emb_dim, W=self.emb_W) batch_size, _ = input_layer.input_var.shape rnn_shape = (batch_size, context * 2, rnn_dim) rnn = bid_layer( emb, rnn_dim, batch_size, rnn_shape, cell=cell, add_dense=add_dense, dropout_p=dropout_p, depth=depth, **cell_args) # time distributed dense output_shape = (batch_size, context * 2, hid_dim) rnn = ReshapeLayer(rnn, (-1, rnn_dim)) rnn = DenseLayer(rnn, num_units=hid_dim) rnn = ReshapeLayer(dropout(rnn, p=dropout_p), output_shape) # flatten rnn = flatten(rnn) self.output = DenseLayer( rnn, num_units=vocab_size, nonlinearity=softmax) # Don't compile train and test functions in production mode if not prod: # T.nnet.categorical_crossentropy allows to represent true dist # as an integer vector (implicitely casting to a one-hot matrix) lr, targets = T.fscalar('lr'), T.ivector('targets') pred = get_output(self.output) loss = T.nnet.categorical_crossentropy(pred, targets).mean() params = get_all_params(self.output, trainable=True) updates = lasagne.updates.rmsprop(loss, params, lr) print("Compiling training function") self._train = theano.function( [input_layer.input_var, targets, lr], loss, updates=updates, allow_input_downcast=True) test_pred = get_output(self.output, deterministic=True) test_loss = T.nnet.categorical_crossentropy(test_pred, targets).mean() test_acc = accuracy(test_pred, targets) print("Compiling test function") self._test = theano.function( [input_layer.input_var, targets], [test_loss, test_acc], allow_input_downcast=True) print("Compiling predict function") if prod: pred = get_output(self.output, deterministic=True) else: pred = test_pred self._predict = theano.function( [input_layer.input_var], pred, allow_input_downcast=True)
def multihead_attention(input_sequence, query, key_sequence=None, mask_input=None, num_heads=1,key_size=None,value_size=None, attn_class=DotAttentionLayer, name='multihead_attn', **kwargs): """ A convenience function that computes K attention "heads" in parallel and concatenates them. Each "head" is based on num_heads linear transformations of input sequence, query, and keys :param attn_class: what kind of attention layer to apply in multi-headed mode (Attention or DotAttention) :param num heads: the amount of parallel "heads" :param key_size: num units in attention query and key, defaults to key_sequence.shape[-1] :param value_size: num units in attention values, defaults to input_sequence.shape[-1] :param input_sequence: sequence of inputs to be processed with attention :type input_sequence: lasagne.layers.Layer with shape [batch,seq_length,units] :param query: single time-step state of decoder that is used as query (usually custom layer or lstm/gru/rnn hid) If it matches input_sequence one-step size, query is used as is. Otherwise, DotAttention is performed from DenseLayer(query,input_units,nonlinearity=None). :type query: lasagne.layers.Layer with shape [batch,units] :param key_sequence: a sequence of keys to compute dot_product with. By default, uses input_sequence instead. :type key_sequence: lasagne.layers.Layer with shape [batch,seq_length,units] or None :param mask_input: mask for input_sequence (like other lasagne masks). Default is no mask :type mask_input: lasagne.layers.Layer with shape [batch,seq_length] Heavily inspired by https://arxiv.org/abs/1706.03762 and http://bit.ly/2vsYX0R """ assert len(input_sequence.output_shape) == 3, "input_sequence must be a 3-dimensional (batch,time,units)" assert len(query.output_shape) == 2, "query must be a 2-dimensional for single tick (batch,units)" assert mask_input is None or len( mask_input.output_shape) == 2, "mask_input must be 2-dimensional (batch,time) or None" assert key_sequence is None or len(key_sequence.output_shape) == 3, "key_sequence must be 3-dimensional " \ "of shape (batch,time,units) or None" key_sequence = key_sequence or input_sequence key_size = key_size or key_sequence.output_shape[-1] value_size = value_size or input_sequence.output_shape[-1] def make_broadcasted_heads(incoming,head_size,name=None): ndim = len(incoming.output_shape) assert ndim in (2,3), "incoming must be 2-dimensional (query) or 3-dimensional (key or value)" heads = DenseLayer(incoming,head_size*num_heads,nonlinearity=None, num_leading_axes=ndim-1,name=name) #[batch,time,head_size*num_heads] if ndim == 3: heads = reshape(heads,([0],[1],head_size,num_heads), name=name) #[batch,time,head_size,num_heads] broadcasted_heads = BroadcastLayer(heads, (0, 3), name=name) #[batch*heads,time,head_size] else: #ndim == 2 heads = reshape(heads, ([0], head_size, num_heads), name=name) # [batch,head_size,num_heads] broadcasted_heads = BroadcastLayer(heads, (0, 2), name=name) # [batch*heads, head_size] return broadcasted_heads query_heads = make_broadcasted_heads(query, key_size,name=name + "_query_heads") value_heads = make_broadcasted_heads(input_sequence, value_size, name=name + "_value_heads") if key_sequence is not None: key_heads = make_broadcasted_heads(key_sequence, key_size, name=name + "_key_heads") else: key_heads = None if mask_input is not None: mask_heads = UpcastLayer(mask_input,broadcast_layer=query_heads) else: mask_heads = None attn_heads = attn_class(value_heads,query_heads,key_sequence=key_heads, mask_input=mask_heads,name=name,**kwargs) 
#[batch*heads,value_size] attn_vectors = UnbroadcastLayer(attn_heads['attn'],broadcast_layer=query_heads) #[batch,value,heads] attn_vectors = flatten(attn_vectors,outdim=2) attn_probs = reshape(attn_heads['probs'],(-1,num_heads,[1])) #[batch,head,probs] return {'attn': attn_vectors, #[batch, value*heads] 'probs': attn_probs}
def get_actor(self, sidx, tidx, valid, avg=False): suf = '_avg' if avg else '' feat_embs = [ self.manager.feats[name].get_emb_layer(sidx, tidx, avg=avg) for name in self.args.source_feats ] x = L.concat(feat_embs, axis=2) # (100, 26, 256+32+32+...) if self.args.squeeze: x = L.DenseLayer(x, num_units=self.args.squeeze, name='h0' + suf, num_leading_axes=2, W=HeNormal('relu')) # (100, 26, 256) x = L.flatten(x) # (100, 26*256) h1 = L.DenseLayer(x, num_units=self.args.nh1, name='h1' + suf, W=HeNormal('relu')) # (100, 512) h1 = L.dropout(h1, self.args.dropout) taggers = {} if self.args.aux_tagger: hids = [h1] for name in self.args.target_feats: hid = L.DenseLayer(h1, 256, name='hid-%s%s' % (name, suf), W=HeNormal('relu')) # (100, 512) hids.append(hid) hid = L.dropout(hid, self.args.dropout) # h1 = L.dropout(h1, self.args.dropout) taggers[name] = L.DenseLayer(hid, len(self.manager.feats[name].map), name='tagger-%s' % name, W=HeNormal(), nonlinearity=softmax) # (100, 25) h1 = L.concat(hids, axis=1) h2 = L.DenseLayer(h1, num_units=self.args.nh2, name='h2' + suf, W=HeNormal('relu')) # (100, 256) h2 = L.dropout(h2, self.args.dropout) h3y = L.DenseLayer(h2, num_units=self.args.nh3, name='h3y' + suf, W=HeNormal(), nonlinearity=softmax) # (100, 4) num of actions h3s = L.concat( [h2, h3y], axis=1 ) # (100, 256+4+4), this way shouldn't output <UNK> if its not SHIFT h3z = L.DenseLayer(h2, num_units=self.args.size['label'], name='h3z' + suf, W=HeNormal(), nonlinearity=softmax) # (100, 25) number of labels if avg: set_all_zero([h3y, h3z] + taggers.values()) return h3y, h3z, taggers
def test_memory(game_title='SpaceInvaders-v0', n_parallel_games=3, replay_seq_len=2, ): """ :param game_title: name of atari game in Gym :param n_parallel_games: how many games we run in parallel :param replay_seq_len: how long is one replay session from a batch """ atari = gym.make(game_title) atari.reset() # Game Parameters n_actions = atari.action_space.n observation_shape = (None,) + atari.observation_space.shape action_names = atari.get_action_meanings() del atari # ##### Agent observations # image observation at current tick goes here observation_layer = InputLayer(observation_shape, name="images input") # reshape to [batch, color, x, y] to allow for convolutional layers to work correctly observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2)) # Agent memory states memory_dict = OrderedDict([]) ###Window window_size = 3 # prev state input prev_window = InputLayer((None, window_size) + tuple(observation_reshape.output_shape[1:]), name="previous window state") # our window window = WindowAugmentation(observation_reshape, prev_window, name="new window state") # pixel-wise maximum over the temporal window (to avoid flickering) window_max = ExpressionLayer(window, lambda a: a.max(axis=1), output_shape=(None,) + window.output_shape[2:]) memory_dict[window] = prev_window ###Stack #prev stack stack_w,stack_h = 4, 5 stack_inputs = DenseLayer(observation_reshape,stack_w,name="prev_stack") stack_controls = DenseLayer(observation_reshape,3, nonlinearity=lasagne.nonlinearities.softmax, name="prev_stack") prev_stack = InputLayer((None,stack_h,stack_w), name="previous stack state") stack = StackAugmentation(stack_inputs,prev_stack, stack_controls) memory_dict[stack] = prev_stack stack_top = lasagne.layers.SliceLayer(stack,0,1) ###RNN preset prev_rnn = InputLayer((None,16), name="previous RNN state") new_rnn = RNNCell(prev_rnn,observation_reshape) memory_dict[new_rnn] = prev_rnn ###GRU preset prev_gru = InputLayer((None,16), name="previous GRUcell state") new_gru = GRUCell(prev_gru,observation_reshape) memory_dict[new_gru] = prev_gru ###GRUmemorylayer prev_gru1 = InputLayer((None,15), name="previous GRUcell state") new_gru1 = GRUMemoryLayer(15,observation_reshape,prev_gru1) memory_dict[new_gru1] = prev_gru1 #LSTM with peepholes prev_lstm0_cell = InputLayer((None,13), name="previous LSTMCell hidden state [with peepholes]") prev_lstm0_out = InputLayer((None,13), name="previous LSTMCell output state [with peepholes]") new_lstm0_cell,new_lstm0_out = LSTMCell(prev_lstm0_cell,prev_lstm0_out, input_or_inputs = observation_reshape, peepholes=True,name="newLSTM1 [with peepholes]") memory_dict[new_lstm0_cell] = prev_lstm0_cell memory_dict[new_lstm0_out] = prev_lstm0_out #LSTM without peepholes prev_lstm1_cell = InputLayer((None,14), name="previous LSTMCell hidden state [no peepholes]") prev_lstm1_out = InputLayer((None,14), name="previous LSTMCell output state [no peepholes]") new_lstm1_cell,new_lstm1_out = LSTMCell(prev_lstm1_cell,prev_lstm1_out, input_or_inputs = observation_reshape, peepholes=False,name="newLSTM1 [no peepholes]") memory_dict[new_lstm1_cell] = prev_lstm1_cell memory_dict[new_lstm1_out] = prev_lstm1_out ##concat everything for i in [flatten(window_max),stack_top,new_rnn,new_gru,new_gru1]: print(i.output_shape) all_memory = concat([flatten(window_max),stack_top,new_rnn,new_gru,new_gru1,new_lstm0_out,new_lstm1_out,]) # ##### Neural network body # you may use any other lasagne layers, including convolutions, batch_norms, maxout, etc # a simple lasagne network (try replacing with 
any other lasagne network and see what works best) nn = DenseLayer(all_memory, num_units=50, name='dense0') # Agent policy and action picking q_eval = DenseLayer(nn, num_units=n_actions, nonlinearity=lasagne.nonlinearities.linear, name="QEvaluator") # resolver resolver = EpsilonGreedyResolver(q_eval, epsilon=0.1, name="resolver") # agent agent = Agent(observation_layer, memory_dict, q_eval, resolver) # Since it's a single lasagne network, one can get it's weights, output, etc weights = lasagne.layers.get_all_params(resolver, trainable=True) # Agent step function print('compiling react') applier_fun = agent.get_react_function() # a nice pythonic interface def step(observation, prev_memories='zeros', batch_size=n_parallel_games): """ returns actions and new states given observation and prev state Prev state in default setup should be [prev window,]""" # default to zeros if prev_memories == 'zeros': prev_memories = [np.zeros((batch_size,) + tuple(mem.output_shape[1:]), dtype='float32') for mem in agent.agent_states] res = applier_fun(np.array(observation), *prev_memories) action = res[0] memories = res[1:] return action, memories # # Create and manage a pool of atari sessions to play with pool = GamePool(game_title, n_parallel_games) observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50) print(np.array(action_names)[np.array(action_log)[:3, :5]]) # # experience replay pool # Create an environment with all default parameters env = SessionPoolEnvironment(observations=observation_layer, actions=resolver, agent_memories=agent.agent_states) def update_pool(env, pool, n_steps=100): """ a function that creates new sessions and ads them into the pool throwing the old ones away entirely for simplicity""" preceding_memory_states = list(pool.prev_memory_states) # get interaction sessions observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact(step, n_steps=n_steps) # load them into experience replay environment env.load_sessions(observation_tensor, action_tensor, reward_tensor, is_alive_tensor, preceding_memory_states) # load first sessions update_pool(env, pool, replay_seq_len) # A more sophisticated way of training is to store a large pool of sessions and train on random batches of them. 
# ### Training via experience replay # get agent's Q-values obtained via experience replay _env_states, _observations, _memories, _imagined_actions, q_values_sequence = agent.get_sessions( env, session_length=replay_seq_len, batch_size=env.batch_size, optimize_experience_replay=True, ) # Evaluating loss function scaled_reward_seq = env.rewards # For SpaceInvaders, however, not scaling rewards is at least working elwise_mse_loss = qlearning.get_elementwise_objective(q_values_sequence, env.actions[0], scaled_reward_seq, env.is_alive, gamma_or_gammas=0.99, ) # compute mean over "alive" fragments mse_loss = elwise_mse_loss.sum() / env.is_alive.sum() # regularize network weights reg_l2 = regularize_network_params(resolver, l2) * 10 ** -4 loss = mse_loss + reg_l2 # Compute weight updates updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01) # mean session reward mean_session_reward = env.rewards.sum(axis=1).mean() # # Compile train and evaluation functions print('compiling') train_fun = theano.function([], [loss, mean_session_reward], updates=updates) evaluation_fun = theano.function([], [loss, mse_loss, reg_l2, mean_session_reward]) print("I've compiled!") # # Training loop for epoch_counter in range(10): update_pool(env, pool, replay_seq_len) loss, avg_reward = train_fun() full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun() print("epoch %i,loss %.5f, rewards: %.5f " % ( epoch_counter, full_loss, avg_reward_current)) print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
def additional_layer(self, idx_layer, emb_layer, avg=False): suf = '_avg' if avg else '' if self.name == 'char': if self.args.char_model == 'cnn': lds = L.dimshuffle(emb_layer, (0, 3, 1, 2)) # (100, 16, 26, 32) ls = [] for n in self.args.ngrams: lconv = L.Conv2DLayer( lds, self.args.conv_dim, (1, n), untie_biases=False, # W=HeNormal('relu') if not avg else Constant(), W=GlorotNormal('relu') if not avg else Constant(), name='conv_%d' % n + suf) # (100, 64/4, 26, 32-n+1) lpool = L.MaxPool2DLayer(lconv, (1, self.args.max_word_len - n + 1)) # (100, 64, 26, 1) lpool = L.flatten(lpool, outdim=3) # (100, 16, 26) lpool = L.dimshuffle(lpool, (0, 2, 1)) # (100, 26, 16) ls.append(lpool) xc = L.concat(ls, axis=2, name='echar_concat') # (100, 26, 64) # additional # xc = L.DenseLayer(xc, self.args.embw_dim, nonlinearity=None, name='echar_affine', num_leading_axes=2, # W=HeNormal() if not avg else Constant()) # (100, 26, 100) return xc elif self.args.char_model == 'lstm': ml = L.ExpressionLayer( idx_layer, lambda x: T.neq(x, 0)) # mask layer (100, 24, 32) ml = L.reshape(ml, (-1, self.args.max_word_len)) # (1500, 32) gate_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal()) cell_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal(), W_cell=None, nonlinearity=tanh) lstm_in = L.reshape( emb_layer, (-1, self.args.max_word_len, self.config['char']['emb_dim'])) # (1500, 32, 16) lstm_f = L.LSTMLayer( lstm_in, 32, mask_input=ml, grad_clipping=10., learn_init=True, peepholes=False, precompute_input=True, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, # unroll_scan=True, only_return_final=True, name='forward' + suf) # (1500, 32) lstm_b = L.LSTMLayer( lstm_in, 32, mask_input=ml, grad_clipping=10., learn_init=True, peepholes=False, precompute_input=True, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, # unroll_scan=True, only_return_final=True, backwards=True, name='backward' + suf) # (1500, 32) remove_reg(lstm_f) remove_reg(lstm_b) if avg: set_zero(lstm_f) set_zero(lstm_b) xc = L.concat([lstm_f, lstm_b], axis=1) # (1500, 64) if self.args.lstm_tagger: xc = L.reshape( xc, (-1, self.args.max_sent_len, 64)) # (100, 161, 64) elif self.args.trans_tagger: xc = L.reshape( xc, (-1, self.args.window_size, 64)) # (100, 15, 64) else: xc = L.reshape(xc, (-1, 26, 64)) # (100, 26, 64) return xc elif self.name == 'morph': # idx (100, 26/161, 16) emb (100, 26/161, 16, 32) if self.args.morph_model == 'max': xm = L.MaxPool2DLayer( emb_layer, (self.args.max_morph_len, 1)) # (100, 26/161, 1, 32) # xm = L.reshape(xm, (-1, 26, self.config['morph']['emb_dim'])) # (100, 26/161, 32) xm = L.flatten(xm, outdim=3) # (100, 26/161, 32) # xm = L.ExpressionLayer(emb_layer, lambda x: T.max(x, 2)) elif self.args.morph_model == 'avg': mask = L.ExpressionLayer( idx_layer, lambda x: T.neq(x, 0)) # (100, 26, 16) mask = L.dimshuffle(mask, (0, 1, 2, 'x')) # (100, 26, 16, 1) mask = L.ExpressionLayer(mask, lambda x: T.extra_ops.repeat( x, self.config['morph']['emb_dim'], 3)) # (100, 26, 16, 1) xm = L.ElemwiseMergeLayer([ emb_layer, mask ], lambda x, m: T.sum(x * m, 2) / T.sum(m, 2)) # (100, 26, 32) # xm = L.reshape(xm, (-1, self.args.feat_shape, self.config['morph']['emb_dim'])) # (100, 26, 32) return xm else: return emb_layer
def build_critic(self, critic_input_var, condition_var, vocoder, ctxsize, nonlinearity=lasagne.nonlinearities.very_leaky_rectify, postlayers_nb=6, use_LSweighting=True, LSWGANtransfreqcutoff=4000, LSWGANtranscoef=1.0 / 8.0, use_WGAN_incnoisefeature=False): useLRN = False # TODO layer_critic = ll.InputLayer(shape=(None, None, vocoder.featuressize()), input_var=critic_input_var, name='input') winlen = int(0.5 * self._windur / 0.005) * 2 + 1 layerstoconcats = [] # Amplitude spectrum layer = ll.SliceLayer(layer_critic, indices=slice( vocoder.f0size(), vocoder.f0size() + vocoder.specsize()), axis=2, name='spec_slice') # Assumed feature order if use_LSweighting: # Using weighted WGAN+LS print( 'WGAN Weighted LS - critic - SPEC (trans cutoff {}Hz)'.format( LSWGANtransfreqcutoff)) # wganls_spec_weights_ = nonlin_sigmoidparm(np.arange(vocoder.specsize(), dtype=theano.config.floatX), int(LSWGANtransfreqcutoff*vocoder.specsize()), LSWGANtranscoef) wganls_spec_weights_ = nonlin_sigmoidparm( np.arange(vocoder.specsize(), dtype=theano.config.floatX), sp.freq2fwspecidx(LSWGANtransfreqcutoff, vocoder.fs, vocoder.specsize()), LSWGANtranscoef) wganls_weights = theano.shared( value=np.asarray(wganls_spec_weights_), name='wganls_spec_weights_') layer = CstMulLayer(layer, cstW=wganls_weights, name='cstdot_wganls_weights') layer = ll.dimshuffle(layer, [0, 'x', 1, 2], name='spec_dimshuffle') for layi in xrange(self._nbcnnlayers): layerstr = 'spec_l' + str(1 + layi) + '_GC{}x{}x{}'.format( self._nbfilters, winlen, self._spec_freqlen) # strides>1 make the first two Conv layers pyramidal. Increase patches' effects here and there, bad. layer = layer_GatedConv2DLayer(layer, self._nbfilters, [winlen, self._spec_freqlen], pad='same', nonlinearity=nonlinearity, name=layerstr) if useLRN: layer = ll.LocalResponseNormalization2DLayer(layer) layer = ll.dimshuffle(layer, [0, 2, 3, 1], name='spec_dimshuffle') layer_spec = ll.flatten(layer, outdim=3, name='spec_flatten') layerstoconcats.append(layer_spec) if use_WGAN_incnoisefeature and vocoder.noisesize( ) > 0: # Add noise in critic layer = ll.SliceLayer(layer_critic, indices=slice( vocoder.f0size() + vocoder.specsize(), vocoder.f0size() + vocoder.specsize() + vocoder.noisesize()), axis=2, name='nm_slice') if use_LSweighting: # Using weighted WGAN+LS print('WGAN Weighted LS - critic - NM (trans cutoff {}Hz)'. 
format(LSWGANtransfreqcutoff)) # wganls_spec_weights_ = nonlin_sigmoidparm(np.arange(vocoder.noisesize(), dtype=theano.config.floatX), int(LSWGANtransfreqcutoff*vocoder.noisesize()), LSWGANtranscoef) wganls_spec_weights_ = nonlin_sigmoidparm( np.arange(vocoder.noisesize(), dtype=theano.config.floatX), sp.freq2fwspecidx(LSWGANtransfreqcutoff, vocoder.fs, vocoder.noisesize()), LSWGANtranscoef) wganls_weights = theano.shared( value=np.asarray(wganls_spec_weights_), name='wganls_spec_weights_') layer = CstMulLayer(layer, cstW=wganls_weights, name='cstdot_wganls_weights') layer = ll.dimshuffle(layer, [0, 'x', 1, 2], name='nm_dimshuffle') for layi in xrange(np.max( (1, int(np.ceil(self._nbcnnlayers / 2))))): layerstr = 'nm_l' + str(1 + layi) + '_GC{}x{}x{}'.format( self._nbfilters, winlen, self._noise_freqlen) layer = layer_GatedConv2DLayer(layer, self._nbfilters, [winlen, self._noise_freqlen], pad='same', nonlinearity=nonlinearity, name=layerstr) if useLRN: layer = ll.LocalResponseNormalization2DLayer(layer) layer = ll.dimshuffle(layer, [0, 2, 3, 1], name='nm_dimshuffle') layer_bndnm = ll.flatten(layer, outdim=3, name='nm_flatten') layerstoconcats.append(layer_bndnm) # Add the contexts layer_ctx_input = ll.InputLayer(shape=(None, None, ctxsize), input_var=condition_var, name='ctx_input') layer_ctx = layer_context(layer_ctx_input, ctx_nblayers=self._ctx_nblayers, ctx_nbfilters=self._ctx_nbfilters, ctx_winlen=self._ctx_winlen, hiddensize=self._hiddensize, nonlinearity=nonlinearity, bn_axes=None, bn_cnn_axes=None, critic=True, useLRN=useLRN) layerstoconcats.append(layer_ctx) # Concatenate the features analysis with the contexts... layer = ll.ConcatLayer(layerstoconcats, axis=2, name='ctx_features.concat') # ... and finalize with a common FC network for layi in xrange(postlayers_nb): layerstr = 'post.l' + str(1 + layi) + '_FC' + str(self._hiddensize) layer = ll.DenseLayer(layer, self._hiddensize, nonlinearity=nonlinearity, num_leading_axes=2, name=layerstr) # output layer (linear) layer = ll.DenseLayer(layer, 1, nonlinearity=None, num_leading_axes=2, name='projection') # No nonlin for this output return [layer, layer_critic, layer_ctx_input]
def __init__(self, insize, vocoder, hiddensize=256, nonlinearity=lasagne.nonlinearities.very_leaky_rectify, ctx_nblayers=1, ctx_nbfilters=2, ctx_winlen=21, nbcnnlayers=8, nbfilters=16, spec_freqlen=5, noise_freqlen=5, windur=0.025, bn_axes=None, noisesize=100): if bn_axes is None: bn_axes = [0, 1] model.Model.__init__(self, insize, vocoder, hiddensize) self._ctx_nblayers = ctx_nblayers self._ctx_nbfilters = ctx_nbfilters self._ctx_winlen = ctx_winlen self._nbcnnlayers = nbcnnlayers self._nbfilters = nbfilters self._spec_freqlen = spec_freqlen self._noise_freqlen = noise_freqlen self._windur = windur winlen = int(0.5 * self._windur / 0.005) * 2 + 1 layer_ctx_input = ll.InputLayer(shape=(None, None, insize), input_var=self._input_values, name='ctx.input') layer_noise_input = UniformNoiseLayer(layer_ctx_input, noisesize, name='noise.input') layer_ctx_input = ll.ConcatLayer( (layer_ctx_input, layer_noise_input), axis=2, name='concat.input') # TODO Put the noise later on self._layer_ctx = layer_context(layer_ctx_input, ctx_nblayers=self._ctx_nblayers, ctx_nbfilters=self._ctx_nbfilters, ctx_winlen=self._ctx_winlen, hiddensize=self._hiddensize, nonlinearity=nonlinearity, bn_axes=[0, 1], bn_cnn_axes=[0, 2, 3]) layers_toconcat = [] if vocoder.f0size() > 0: # F0 - BLSTM layer layer_f0 = self._layer_ctx grad_clipping = 50 for layi in xrange(1): layerstr = 'f0_l' + str(1 + layi) + '_BLSTM{}'.format( self._hiddensize) fwd = models_basic.layer_LSTM(layer_f0, self._hiddensize, nonlinearity, backwards=False, grad_clipping=grad_clipping, name=layerstr + '.fwd') bck = models_basic.layer_LSTM(layer_f0, self._hiddensize, nonlinearity, backwards=True, grad_clipping=grad_clipping, name=layerstr + '.bck') layer_f0 = ll.ConcatLayer((fwd, bck), axis=2, name=layerstr + '.concat') # TODO Replace by CNN ?? 
It didn't work well, possibly because of the WGAN loss, but f0 is no longer trained with the WGAN loss layer_f0 = ll.DenseLayer(layer_f0, num_units=vocoder.f0size(), nonlinearity=None, num_leading_axes=2, name='f0_lout_projection') layers_toconcat.append(layer_f0) if vocoder.specsize() > 0: # Amplitude spectrum - 2D Gated Conv layers layer_spec_proj = ll.batch_norm(ll.DenseLayer( self._layer_ctx, vocoder.specsize(), nonlinearity=nonlinearity, num_leading_axes=2, name='spec_projection'), axes=bn_axes) # layer_spec_proj = ll.DenseLayer(self._layer_ctx, vocoder.specsize(), nonlinearity=None, num_leading_axes=2, name='spec_projection') layer_spec = ll.dimshuffle(layer_spec_proj, [0, 'x', 1, 2], name='spec_dimshuffle') for layi in xrange(nbcnnlayers): layerstr = 'spec_l' + str(1 + layi) + '_GC{}x{}x{}'.format( self._nbfilters, winlen, self._spec_freqlen) layer_spec = ll.batch_norm( layer_GatedConv2DLayer(layer_spec, self._nbfilters, [winlen, self._spec_freqlen], stride=1, pad='same', nonlinearity=nonlinearity, name=layerstr)) layer_spec = ll.Conv2DLayer(layer_spec, 1, [winlen, self._spec_freqlen], pad='same', nonlinearity=None, name='spec_lout_2DC') layer_spec = ll.dimshuffle(layer_spec, [0, 2, 3, 1], name='spec_dimshuffle') layer_spec = ll.flatten(layer_spec, outdim=3, name='spec_flatten') # layer_spec = ll.ElemwiseSumLayer([layer_spec, layer_spec_proj], name='skip') layers_toconcat.append(layer_spec) if vocoder.noisesize() > 0: layer_noise = self._layer_ctx for layi in xrange(np.max((1, int(np.ceil(nbcnnlayers / 2))))): layerstr = 'noise_l' + str(1 + layi) + '_FC{}'.format(hiddensize) layer_noise = ll.DenseLayer(layer_noise, num_units=hiddensize, nonlinearity=nonlinearity, num_leading_axes=2, name=layerstr) if isinstance(vocoder, vocoders.VocoderPML): layer_noise = ll.DenseLayer( layer_noise, num_units=vocoder.nm_size, nonlinearity=lasagne.nonlinearities.sigmoid, num_leading_axes=2, name='lo_noise' ) # sig is best among nonlin_saturatedsigmoid nonlin_tanh_saturated nonlin_tanh_bysigmoid else: layer_noise = ll.DenseLayer(layer_noise, num_units=vocoder.nm_size, nonlinearity=None, num_leading_axes=2, name='lo_noise') layers_toconcat.append(layer_noise) if vocoder.vuvsize() > 0: # VUV - BLSTM layer layer_vuv = self._layer_ctx grad_clipping = 50 for layi in xrange(1): layerstr = 'vuv_l' + str(1 + layi) + '_BLSTM{}'.format( self._hiddensize) fwd = models_basic.layer_LSTM(layer_vuv, self._hiddensize, nonlinearity, backwards=False, grad_clipping=grad_clipping, name=layerstr + '.fwd') bck = models_basic.layer_LSTM(layer_vuv, self._hiddensize, nonlinearity, backwards=True, grad_clipping=grad_clipping, name=layerstr + '.bck') layer_vuv = ll.ConcatLayer((fwd, bck), axis=2, name=layerstr + '.concat') layer_vuv = ll.DenseLayer(layer_vuv, num_units=vocoder.vuvsize(), nonlinearity=None, num_leading_axes=2, name='vuv_lout_projection') layers_toconcat.append(layer_vuv) layer = ll.ConcatLayer(layers_toconcat, axis=2, name='lout.concat') self.init_finish( layer ) # Has to be called at the end of the __init__ to print out the architecture, get the trainable params, etc.
def create_dadgm_model(self, X, Y, n_dim, n_out, n_chan=1, n_class=10): n_cat = 20 # number of categorical distributions n_lat = n_class * n_cat # latent stochastic variables n_aux = 10 # number of auxiliary variables n_hid = 500 # size of hidden layer in encoder/decoder n_in = n_out = n_dim * n_dim * n_chan tau = self.tau hid_nl = T.nnet.relu relu_shift = lambda av: T.nnet.relu(av + 10) - 10 # create the encoder network # - create q(a|x) qa_net_in = InputLayer(shape=(None, n_in), input_var=X) qa_net = DenseLayer( qa_net_in, num_units=n_hid, W=GlorotNormal('relu'), b=Normal(1e-3), nonlinearity=hid_nl, ) qa_net_mu = DenseLayer( qa_net, num_units=n_aux, W=GlorotNormal(), b=Normal(1e-3), nonlinearity=None, ) qa_net_logsigma = DenseLayer( qa_net, num_units=n_aux, W=GlorotNormal(), b=Normal(1e-3), nonlinearity=relu_shift, ) qa_net_sample = GaussianSampleLayer(qa_net_mu, qa_net_logsigma) # - create q(z|a, x) qz_net_in = lasagne.layers.InputLayer((None, n_aux)) qz_net_a = DenseLayer( qz_net_in, num_units=n_hid, nonlinearity=hid_nl, ) qz_net_b = DenseLayer( qa_net_in, num_units=n_hid, nonlinearity=hid_nl, ) qz_net = ElemwiseSumLayer([qz_net_a, qz_net_b]) qz_net = DenseLayer(qz_net, num_units=n_hid, nonlinearity=hid_nl) qz_net_mu = DenseLayer( qz_net, num_units=n_lat, nonlinearity=None, ) qz_net_mu = reshape(qz_net_mu, (-1, n_class)) qz_net_sample = GumbelSoftmaxSampleLayer(qz_net_mu, tau) qz_net_sample = reshape(qz_net_sample, (-1, n_cat, n_class)) # create the decoder network # - create p(x|z) px_net_in = lasagne.layers.InputLayer((None, n_cat, n_class)) # --- rest is created from RBM --- # - create p(a|z) pa_net = DenseLayer( flatten(px_net_in), num_units=n_hid, W=GlorotNormal('relu'), b=Normal(1e-3), nonlinearity=hid_nl, ) pa_net_mu = DenseLayer( pa_net, num_units=n_aux, W=GlorotNormal(), b=Normal(1e-3), nonlinearity=None, ) pa_net_logsigma = DenseLayer( pa_net, num_units=n_aux, W=GlorotNormal(), b=Normal(1e-3), nonlinearity=relu_shift, ) # save network params self.n_cat = n_cat self.input_layers = (qa_net_in, qz_net_in, px_net_in) return pa_net_mu, pa_net_logsigma, qz_net_mu, \ qa_net_mu, qa_net_logsigma, qz_net_sample, qa_net_sample,
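The encoder above relies on a custom GumbelSoftmaxSampleLayer to draw relaxed categorical samples at temperature tau. The project's own layer is not shown here; the following is a minimal sketch (class name and eps are illustrative, not from the original code) of what such a layer typically does: add Gumbel(0, 1) noise to the logits and apply a temperature-scaled softmax, so the sample stays differentiable.

import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams
import lasagne


class GumbelSoftmaxSample(lasagne.layers.Layer):
    """Draws a relaxed (differentiable) one-hot sample from unnormalised logits."""

    def __init__(self, incoming, temperature, eps=1e-20, **kwargs):
        super(GumbelSoftmaxSample, self).__init__(incoming, **kwargs)
        self.temperature = temperature
        self.eps = eps
        self._rng = MRG_RandomStreams(
            lasagne.random.get_rng().randint(1, 2147462579))

    def get_output_for(self, logits, **kwargs):
        # Gumbel(0, 1) noise: -log(-log(U)), U ~ Uniform(0, 1)
        uniform = self._rng.uniform(logits.shape)
        gumbel = -T.log(-T.log(uniform + self.eps) + self.eps)
        # temperature-scaled softmax over the class axis (input is (N * n_cat, n_class))
        return T.nnet.softmax((logits + gumbel) / self.temperature)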
def __init__(self, gate_controllers, channels, gate_nonlinearities=nonlinearities.sigmoid, bias_init=init.Constant(), weight_init=init.Normal(), **kwargs): """ An overly generic interface for one-step gate, stacked gates or gate applier. If several channels are given, stacks them for quicker execution. gate_controllers - a single layer or a list/tuple of such layers that gate depends on (for most RNNs, that's input and previous memory state) channels - a single layer or integer or a list/tuple of layers/integers if a layer, that defines a layer that should be multiplied by the gate output if an integer - that defines a number of units of a gate -- and these are the units to be returned gate_nonlinearities - a single function or a list of such(channel-wise), - defining nonlinearities for gates on corresponding channels bias_init - an initializer or a list (channel-wise) of initializers for bias(b) parameters - (None, lasagne.init, theano variable or numpy array) - None means no bias weight init - an initializer OR a list of initializers for (channel-wise) - OR a list of lists of initializers (channel, controller) - (lasagne.init, theano variable or numpy array) """ self.channels = check_list(channels) self.gate_controllers = check_list(gate_controllers) # check channel types for chl in self.channels: assert is_layer(chl) or (type(chl) == int) # separate layers from non-layers self.channel_layers = list(filter(is_layer, self.channels)) self.channel_ints = [v for v in self.channels if not is_layer(v)] # flatten layers to 2 dimensions for i in range(len(self.channel_layers)): layer = self.channel_layers[i] if type(layer) == int: continue lname = layer.name or "" if len(layer.output_shape) != 2: warn( "One of the channels (name='%s') has an input dimension of %s and will be flattened." % (lname, layer.output_shape)) self.channel_layers[i] = flatten(layer, outdim=2, name=lname) assert len(self.channel_layers[i].output_shape) == 2 # flatten layers to 2 dimensions for i in range(len(self.gate_controllers)): layer = self.gate_controllers[i] lname = layer.name or "" if len(layer.output_shape) != 2: warn( "One of the gate controllers (name='%s') has an input dimension of %s and will be flattened." 
% (lname, layer.output_shape)) self.gate_controllers[i] = flatten(layer, outdim=2, name=lname) assert len(self.gate_controllers[i].output_shape) == 2 # initialize merge layer incomings = self.channel_layers + self.gate_controllers # default name kwargs["name"] = kwargs.get("name", "YetAnother" + self.__class__.__name__) output_names = [ "%s.channel.%i" % (kwargs["name"], i) for i in range(len(self.channels)) ] # determine whether or not user defined a fixed batch size batch_sizes = [ chl.output_shape[0] for chl in filter(is_layer, self.channels) ] batch_size = reduce(lambda a, b: a or b, batch_sizes, None) output_shapes = [ chl.output_shape if is_layer(chl) else (batch_size, chl) for chl in self.channels ] output_shapes = OrderedDict(zip(output_names, output_shapes)) output_dtypes = [get_layer_dtype(chl) for chl in self.channels] output_dtypes = OrderedDict(zip(output_names, output_dtypes)) super(GateLayer, self).__init__(incomings, output_shapes=output_shapes, output_dtypes=output_dtypes, **kwargs) # nonlinearities self.gate_nonlinearities = check_list(gate_nonlinearities) self.gate_nonlinearities = [(nl if (nl is not None) else (lambda v: v)) for nl in self.gate_nonlinearities] # must be either one common nonlinearity or one per channel assert len(self.gate_nonlinearities) in (1, len(self.channels)) if len(self.gate_nonlinearities) == 1: self.gate_nonlinearities *= len(self.channels) # cast bias init to a list bias_init = check_list(bias_init) assert len(bias_init) in (1, len(self.channels)) if len(bias_init) == 1: bias_init *= len(self.channels) # cast weight init to a list of lists [channel][controller] weight_init = check_list(weight_init) assert len(weight_init) in (1, len(self.channels)) if len(weight_init) == 1: weight_init *= len(self.channels) for i in range(len(self.channels)): weight_init[i] = check_list(weight_init[i]) assert len(weight_init[i]) in (1, len(self.gate_controllers)) if len(weight_init[i]) == 1: weight_init[i] *= len(self.gate_controllers) self.gate_b = [] # a list of biases for channels self.gate_W = [list() for _ in self.gate_controllers ] # a list of lists of weights [controller][channel] for chl_i, (channel, b_init, channel_w_inits) in enumerate( zip(self.channels, bias_init, weight_init)): if is_layer(channel): channel_name = channel.name or "chl" + str(chl_i) channel_n_units = channel.output_shape[1] else: channel_name = "chl" + str(chl_i) channel_n_units = channel # add bias if b_init is not None: self.gate_b.append( self.add_param(spec=b_init, shape=(channel_n_units, ), name="b_%s" % (channel_name))) else: self.gate_b.append(T.zeros((channel_n_units, ))) # add weights for ctrl_i, (controller, w_init) in enumerate( zip(self.gate_controllers, channel_w_inits)): ctrl_name = controller.name or "ctrl" + str(ctrl_i) # add weight self.gate_W[ctrl_i].append( self.add_param(spec=w_init, shape=(controller.output_shape[1], channel_n_units), name="W_%s_%s" % (ctrl_name, channel_name))) # a list where i-th element contains weights[i-th_gate_controller] for all outputs stacked self.gate_W_stacked = [ T.concatenate(weights, axis=1) for weights in self.gate_W ] # a list of biases for the respective outputs stacked self.gate_b_stacked = T.concatenate(self.gate_b)
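A hypothetical usage sketch for the GateLayer above (the layer names obs, prev_memory and candidate are made up, not taken from the code): one controller gates both an existing candidate layer and a fresh 64-unit channel created from an integer spec, following the layer/int convention described in the docstring.

import lasagne
from lasagne.layers import InputLayer, DenseLayer, concat

obs = InputLayer((None, 32), name='obs')
prev_memory = InputLayer((None, 64), name='prev_memory')
features = concat([obs, prev_memory])

controller = DenseLayer(features, 128, name='gate controller')
candidate = DenseLayer(features, 64,
                       nonlinearity=lasagne.nonlinearities.tanh, name='candidate')

# gate an existing 64-unit layer and, in the same pass, a new 64-unit channel
# specified only by its size (the integer form described in the docstring)
gate = GateLayer(gate_controllers=[controller], channels=[candidate, 64])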
def build_critic(input_var=None, cond_var=None, n_conds=0, arch=0, with_BatchNorm=True, loss_type='wgan'): from lasagne.layers import ( InputLayer, Conv2DLayer, DenseLayer, MaxPool2DLayer, concat, dropout, flatten) from lasagne.nonlinearities import rectify, LeakyRectify from lasagne.init import GlorotUniform # Normal lrelu = LeakyRectify(0.2) layer = InputLayer( shape=(None, 1, 128, 128), input_var=input_var, name='d_in_data') # init = Normal(0.02, 0.0) init = GlorotUniform() if cond_var: # class: from data or from generator input layer_cond = InputLayer( shape=(None, n_conds), input_var=cond_var, name='d_in_condition') layer_cond = BatchNorm(DenseLayer( layer_cond, 1024, W=init, b=None, nonlinearity=lrelu), with_BatchNorm) if arch == 'dcgan': # DCGAN inspired layer = BatchNorm(Conv2DLayer( layer, 32, 4, stride=2, pad=1, W=init, b=None, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 64, 4, stride=2, pad=1, W=init, b=None, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 128, 4, stride=2, pad=1, W=init, b=None, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 256, 4, stride=2, pad=1, W=init, b=None, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 512, 4, stride=2, pad=1, W=init, b=None, nonlinearity=lrelu), with_BatchNorm) elif arch == 'cont-enc': # convolution layers layer = BatchNorm(Conv2DLayer( layer, 64, 4, stride=2, pad=1, W=init, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 64, 4, stride=2, pad=1, W=init, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 128, 4, stride=2, pad=1, W=init, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 256, 4, stride=2, pad=1, W=init, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 512, 4, stride=2, pad=1, W=init, nonlinearity=lrelu), with_BatchNorm) elif arch == 'mnist': # Jan Schluechter's MNIST discriminator # convolution layers layer = BatchNorm(Conv2DLayer( layer, 128, 5, stride=2, pad='same', W=init, b=None, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 128, 5, stride=2, pad='same', W=init, b=None, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 128, 5, stride=2, pad='same', W=init, b=None, nonlinearity=lrelu), with_BatchNorm) # layer = BatchNorm(Conv2DLayer( # layer, 128, 5, stride=2, pad='same', W=init, b=None, # nonlinearity=lrelu), with_BatchNorm) # fully-connected layer # layer = BatchNorm(DenseLayer( # layer, 1024, W=init, b=None, nonlinearity=lrelu), with_BatchNorm) elif arch == 'lsgan': layer = batch_norm(Conv2DLayer( layer, 256, 5, stride=2, pad='same', nonlinearity=lrelu)) layer = batch_norm(Conv2DLayer( layer, 256, 5, stride=2, pad='same', nonlinearity=lrelu)) layer = batch_norm(Conv2DLayer( layer, 256, 5, stride=2, pad='same', nonlinearity=lrelu)) elif arch == 'crepe': # CREPE # form words from sequence of characters layer = BatchNorm(Conv2DLayer( layer, 1024, (128, 7), W=init, b=None, nonlinearity=lrelu), with_BatchNorm) layer = MaxPool2DLayer(layer, (1, 3)) # temporal convolution, 7-gram layer = BatchNorm(Conv2DLayer( layer, 512, (1, 7), W=init, b=None, nonlinearity=lrelu), with_BatchNorm) layer = MaxPool2DLayer(layer, (1, 3)) # temporal convolution, 3-gram layer = BatchNorm(Conv2DLayer( layer, 256, (1, 3), W=init, b=None, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 256, (1, 3), W=init, b=None, nonlinearity=lrelu), with_BatchNorm) 
layer = BatchNorm(Conv2DLayer( layer, 256, (1, 3), W=init, b=None, nonlinearity=lrelu), with_BatchNorm) layer = BatchNorm(Conv2DLayer( layer, 256, (1, 3), W=init, b=None, nonlinearity=lrelu), with_BatchNorm) layer = flatten(layer) # fully-connected layers layer = dropout(DenseLayer( layer, 1024, W=init, b=None, nonlinearity=rectify)) layer = dropout(DenseLayer( layer, 1024, W=init, b=None, nonlinearity=rectify)) else: raise Exception("Model architecture {} is not supported".format(arch)) # output layer (linear and without bias) if cond_var is not None: layer = DenseLayer(layer, 1024, nonlinearity=lrelu, b=None) layer = concat([layer, layer_cond]) layer = DenseLayer(layer, 1, b=None, nonlinearity=None) print("Critic output:", layer.output_shape) return layer
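A rough usage sketch of build_critic under a WGAN setup; fake_images stands for the output of some generator not shown here, and the weight clipping or gradient penalty that a full WGAN also needs is omitted for brevity.

import theano
import theano.tensor as T
import lasagne

input_var = T.tensor4('images')          # real batch, shape (batch, 1, 128, 128)
fake_images = T.tensor4('fake_images')   # batch produced by some generator

critic = build_critic(input_var=input_var, arch='dcgan', loss_type='wgan')

real_score = lasagne.layers.get_output(critic)
fake_score = lasagne.layers.get_output(critic, inputs=fake_images)

# WGAN critic objective: maximise E[score(real)] - E[score(fake)]
critic_loss = fake_score.mean() - real_score.mean()
critic_params = lasagne.layers.get_all_params(critic, trainable=True)
updates = lasagne.updates.rmsprop(critic_loss, critic_params, learning_rate=5e-5)
# weight clipping / gradient penalty, required for proper WGAN training, is left out here
train_critic = theano.function([input_var, fake_images], critic_loss, updates=updates)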
def __init__(self, num_units, observation_input, prev_state_input, resetgate=Gate(W_cell=None), updategate=Gate(W_cell=None), hidden_update=Gate(W_cell=None, nonlinearity=nonlinearities.tanh), grad_clipping=5., **kwargs): assert len(prev_state_input.output_shape) == 2 if len(observation_input.output_shape) != 2: observation_input = flatten(observation_input, outdim=2) assert len(observation_input.output_shape) == 2 # default name if "name" not in kwargs: kwargs["name"] = "YetAnother" + self.__class__.__name__ self.num_units = num_units super(GRUMemoryLayer, self).__init__([prev_state_input, observation_input], **kwargs) self.grad_clipping = grad_clipping # Retrieve the dimensionality of the incoming layer last_state_shape, observation_shape = self.input_shapes # Input dimensionality is the output dimensionality of the input layer last_num_units = np.prod(last_state_shape[1:]) inp_num_inputs = np.prod(observation_shape[1:]) # hidden shapes must match assert last_num_units == self.num_units def add_gate_params(gate, gate_name): """ Convenience function for adding layer parameters from a Gate instance. """ return (self.add_param(gate.W_in, (inp_num_inputs, num_units), name="W_in_to_{}".format(gate_name)), self.add_param(gate.W_hid, (num_units, num_units), name="W_hid_to_{}".format(gate_name)), self.add_param(gate.b, (num_units,), name="b_{}".format(gate_name), regularizable=False), gate.nonlinearity) # Add in all parameters from gates (self.W_in_to_updategate, self.W_hid_to_updategate, self.b_updategate, self.nonlinearity_updategate) = add_gate_params(updategate, 'updategate') (self.W_in_to_resetgate, self.W_hid_to_resetgate, self.b_resetgate, self.nonlinearity_resetgate) = add_gate_params(resetgate, 'resetgate') (self.W_in_to_hidden_update, self.W_hid_to_hidden_update, self.b_hidden_update, self.nonlinearity_hid) = add_gate_params( hidden_update, 'hidden_update') # Stack input weight matrices into a (num_inputs, 3*num_units) # matrix, which speeds up computation self.W_in_stacked = T.concatenate( [self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update], axis=1) # Same for hidden weight matrices self.W_hid_stacked = T.concatenate( [self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update], axis=1) # Stack gate biases into a (3*num_units) vector self.b_stacked = T.concatenate( [self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0)
def __init__(self, num_units, observation_input, prev_state_input, resetgate=Gate(W_cell=None), updategate=Gate(W_cell=None), hidden_update=Gate(W_cell=None, nonlinearity=nonlinearities.tanh), grad_clipping=5., **kwargs): assert len(prev_state_input.output_shape) == 2 if len(observation_input.output_shape) != 2: observation_input = flatten(observation_input, outdim=2) assert len(observation_input.output_shape) == 2 # default name if "name" not in kwargs: kwargs["name"] = "YetAnother" + self.__class__.__name__ self.num_units = num_units super(GRUMemoryLayer, self).__init__([prev_state_input, observation_input], **kwargs) self.grad_clipping = grad_clipping # Retrieve the dimensionality of the incoming layer last_state_shape, observation_shape = self.input_shapes # Input dimensionality is the output dimensionality of the input layer last_num_units = np.prod(last_state_shape[1:]) inp_num_inputs = np.prod(observation_shape[1:]) # hidden shapes must match assert last_num_units == self.num_units def add_gate_params(gate, gate_name): """ Convenience function for adding layer parameters from a Gate instance. """ return (self.add_param(gate.W_in, (inp_num_inputs, num_units), name="W_in_to_{}".format(gate_name)), self.add_param(gate.W_hid, (num_units, num_units), name="W_hid_to_{}".format(gate_name)), self.add_param(gate.b, (num_units, ), name="b_{}".format(gate_name), regularizable=False), gate.nonlinearity) # Add in all parameters from gates (self.W_in_to_updategate, self.W_hid_to_updategate, self.b_updategate, self.nonlinearity_updategate) = add_gate_params( updategate, 'updategate') (self.W_in_to_resetgate, self.W_hid_to_resetgate, self.b_resetgate, self.nonlinearity_resetgate) = add_gate_params( resetgate, 'resetgate') (self.W_in_to_hidden_update, self.W_hid_to_hidden_update, self.b_hidden_update, self.nonlinearity_hid) = add_gate_params(hidden_update, 'hidden_update') # Stack input weight matrices into a (num_inputs, 3*num_units) # matrix, which speeds up computation self.W_in_stacked = T.concatenate([ self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update ], axis=1) # Same for hidden weight matrices self.W_hid_stacked = T.concatenate([ self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update ], axis=1) # Stack gate biases into a (3*num_units) vector self.b_stacked = T.concatenate( [self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0)
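The stacking of the gate weight matrices above is purely a speed trick: one matmul against the (num_inputs, 3*num_units) stacked matrix replaces three separate ones, and each gate's pre-activation is recovered by slicing. A small numpy check of that equivalence (the sizes are made up):

import numpy as np

num_inputs, num_units, batch = 8, 5, 4
W_reset, W_update, W_hid = (np.random.randn(num_inputs, num_units) for _ in range(3))
W_stacked = np.concatenate([W_reset, W_update, W_hid], axis=1)  # (num_inputs, 3*num_units)

x = np.random.randn(batch, num_inputs)
pre = x.dot(W_stacked)                          # one matmul computes all gate pre-activations
reset_pre = pre[:, 0 * num_units:1 * num_units]
update_pre = pre[:, 1 * num_units:2 * num_units]

assert np.allclose(reset_pre, x.dot(W_reset))
assert np.allclose(update_pre, x.dot(W_update))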
def __init__(self, num_units, observation_input, prev_state_input, resetgate=Gate(W_cell=None), updategate=Gate(W_cell=None), hidden_update=Gate(W_cell=None, nonlinearity=nonlinearities.tanh), bias_init=init.Constant(), weight_init=init.Normal(), grad_clipping=5., **kwargs): """ a Gated Recurrent Unit implementation of a memory layer. Unlike lasagne.layers.GRUlayer, this layer does not produce the whole time series at a time, but yields it's next state given last state and observation one tick at a time. This is done to simplify usage within external loops along with other MDP components. parameters: - num_units: amount of units in the hidden state. - If you are using prev_state_input, put anything here. - observation_input - a lasagne layer that provides float[batch_id, input_id]: input observation at this tick -- as an output. - prev_state_input [optional] - a lasagne layer that generates the previous batch of hidden states (in case you wish several layers to handle the same sequence) - concatenate_input: if true, appends observation_input of current tick to own activation at this tick instance that - generates first (a-priori) agent state - determines new agent state given previous agent state and an observation|previous input """ assert len(prev_state_input.output_shape) == 2 if len(observation_input.output_shape) != 2: observation_input = flatten(observation_input, outdim=2) assert len(observation_input.output_shape) == 2 # default name if "name" not in kwargs: kwargs["name"] = "YetAnother" + self.__class__.__name__ self.num_units = num_units super(GRUMemoryLayer, self).__init__([prev_state_input, observation_input], **kwargs) self.grad_clipping = grad_clipping # Retrieve the dimensionality of the incoming layer last_state_shape, observation_shape = self.input_shapes # Input dimensionality is the output dimensionality of the input layer last_num_units = np.prod(last_state_shape[1:]) inp_num_inputs = np.prod(observation_shape[1:]) # hidden shapes must match assert last_num_units == self.num_units def add_gate_params(gate, gate_name): """ Convenience function for adding layer parameters from a Gate instance. """ return (self.add_param(gate.W_in, (inp_num_inputs, num_units), name="W_in_to_{}".format(gate_name)), self.add_param(gate.W_hid, (num_units, num_units), name="W_hid_to_{}".format(gate_name)), self.add_param(gate.b, (num_units, ), name="b_{}".format(gate_name), regularizable=False), gate.nonlinearity) # Add in all parameters from gates (self.W_in_to_updategate, self.W_hid_to_updategate, self.b_updategate, self.nonlinearity_updategate) = add_gate_params( updategate, 'updategate') (self.W_in_to_resetgate, self.W_hid_to_resetgate, self.b_resetgate, self.nonlinearity_resetgate) = add_gate_params( resetgate, 'resetgate') (self.W_in_to_hidden_update, self.W_hid_to_hidden_update, self.b_hidden_update, self.nonlinearity_hid) = add_gate_params(hidden_update, 'hidden_update') # Stack input weight matrices into a (num_inputs, 3*num_units) # matrix, which speeds up computation self.W_in_stacked = T.concatenate([ self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update ], axis=1) # Same for hidden weight matrices self.W_hid_stacked = T.concatenate([ self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update ], axis=1) # Stack gate biases into a (3*num_units) vector self.b_stacked = T.concatenate( [self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0)
def test_memory( game_title='SpaceInvaders-v0', n_parallel_games=3, replay_seq_len=2, ): """ :param game_title: name of atari game in Gym :param n_parallel_games: how many games we run in parallel :param replay_seq_len: how long is one replay session from a batch """ atari = gym.make(game_title) atari.reset() # Game Parameters n_actions = atari.action_space.n observation_shape = (None, ) + atari.observation_space.shape action_names = atari.get_action_meanings() del atari # ##### Agent observations # image observation at current tick goes here observation_layer = InputLayer(observation_shape, name="images input") # reshape to [batch, color, x, y] to allow for convolutional layers to work correctly observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2)) # Agent memory states memory_dict = OrderedDict([]) ###Window window_size = 3 # prev state input prev_window = InputLayer( (None, window_size) + tuple(observation_reshape.output_shape[1:]), name="previous window state") # our window window = WindowAugmentation(observation_reshape, prev_window, name="new window state") # pixel-wise maximum over the temporal window (to avoid flickering) window_max = ExpressionLayer(window, lambda a: a.max(axis=1), output_shape=(None, ) + window.output_shape[2:]) memory_dict[window] = prev_window ###Stack #prev stack stack_w, stack_h = 4, 5 stack_inputs = DenseLayer(observation_reshape, stack_w, name="prev_stack") stack_controls = DenseLayer(observation_reshape, 3, nonlinearity=lasagne.nonlinearities.softmax, name="prev_stack") prev_stack = InputLayer((None, stack_h, stack_w), name="previous stack state") stack = StackAugmentation(stack_inputs, prev_stack, stack_controls) memory_dict[stack] = prev_stack stack_top = lasagne.layers.SliceLayer(stack, 0, 1) ###RNN preset prev_rnn = InputLayer((None, 16), name="previous RNN state") new_rnn = RNNCell(prev_rnn, observation_reshape) memory_dict[new_rnn] = prev_rnn ###GRU preset prev_gru = InputLayer((None, 16), name="previous GRUcell state") new_gru = GRUCell(prev_gru, observation_reshape) memory_dict[new_gru] = prev_gru ###GRUmemorylayer prev_gru1 = InputLayer((None, 15), name="previous GRUcell state") new_gru1 = GRUMemoryLayer(15, observation_reshape, prev_gru1) memory_dict[new_gru1] = prev_gru1 #LSTM with peepholes prev_lstm0_cell = InputLayer( (None, 13), name="previous LSTMCell hidden state [with peepholes]") prev_lstm0_out = InputLayer( (None, 13), name="previous LSTMCell output state [with peepholes]") new_lstm0_cell, new_lstm0_out = LSTMCell( prev_lstm0_cell, prev_lstm0_out, input_or_inputs=observation_reshape, peepholes=True, name="newLSTM1 [with peepholes]") memory_dict[new_lstm0_cell] = prev_lstm0_cell memory_dict[new_lstm0_out] = prev_lstm0_out #LSTM without peepholes prev_lstm1_cell = InputLayer( (None, 14), name="previous LSTMCell hidden state [no peepholes]") prev_lstm1_out = InputLayer( (None, 14), name="previous LSTMCell output state [no peepholes]") new_lstm1_cell, new_lstm1_out = LSTMCell( prev_lstm1_cell, prev_lstm1_out, input_or_inputs=observation_reshape, peepholes=False, name="newLSTM1 [no peepholes]") memory_dict[new_lstm1_cell] = prev_lstm1_cell memory_dict[new_lstm1_out] = prev_lstm1_out ##concat everything for i in [flatten(window_max), stack_top, new_rnn, new_gru, new_gru1]: print(i.output_shape) all_memory = concat([ flatten(window_max), stack_top, new_rnn, new_gru, new_gru1, new_lstm0_out, new_lstm1_out, ]) # ##### Neural network body # you may use any other lasagne layers, including convolutions, batch_norms, maxout, etc # a 
simple lasagne network (try replacing with any other lasagne network and see what works best) nn = DenseLayer(all_memory, num_units=50, name='dense0') # Agent policy and action picking q_eval = DenseLayer(nn, num_units=n_actions, nonlinearity=lasagne.nonlinearities.linear, name="QEvaluator") # resolver resolver = EpsilonGreedyResolver(q_eval, epsilon=0.1, name="resolver") # agent agent = Agent(observation_layer, memory_dict, q_eval, resolver) # Since it's a single lasagne network, one can get its weights, output, etc weights = lasagne.layers.get_all_params(resolver, trainable=True) # Agent step function print('compiling react') applier_fun = agent.get_react_function() # a nice pythonic interface def step(observation, prev_memories='zeros', batch_size=n_parallel_games): """ returns actions and new states given observation and prev state Prev state in default setup should be [prev window,]""" # default to zeros if prev_memories == 'zeros': prev_memories = [ np.zeros((batch_size, ) + tuple(mem.output_shape[1:]), dtype='float32') for mem in agent.agent_states ] res = applier_fun(np.array(observation), *prev_memories) action = res[0] memories = res[1:] return action, memories # # Create and manage a pool of atari sessions to play with pool = GamePool(game_title, n_parallel_games) observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50) print(np.array(action_names)[np.array(action_log)[:3, :5]]) # # experience replay pool # Create an environment with all default parameters env = SessionPoolEnvironment(observations=observation_layer, actions=resolver, agent_memories=agent.agent_states) def update_pool(env, pool, n_steps=100): """ a function that creates new sessions and adds them into the pool, throwing the old ones away entirely for simplicity""" preceding_memory_states = list(pool.prev_memory_states) # get interaction sessions observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact( step, n_steps=n_steps) # load them into experience replay environment env.load_sessions(observation_tensor, action_tensor, reward_tensor, is_alive_tensor, preceding_memory_states) # load first sessions update_pool(env, pool, replay_seq_len) # A more sophisticated way of training is to store a large pool of sessions and train on random batches of them (see the sketch after the training loop below).
# ### Training via experience replay # get agent's Q-values obtained via experience replay _env_states, _observations, _memories, _imagined_actions, q_values_sequence = agent.get_sessions( env, session_length=replay_seq_len, batch_size=env.batch_size, optimize_experience_replay=True, ) # Evaluating loss function scaled_reward_seq = env.rewards # For SpaceInvaders, however, not scaling rewards is at least working elwise_mse_loss = qlearning.get_elementwise_objective( q_values_sequence, env.actions[0], scaled_reward_seq, env.is_alive, gamma_or_gammas=0.99, ) # compute mean over "alive" fragments mse_loss = elwise_mse_loss.sum() / env.is_alive.sum() # regularize network weights reg_l2 = regularize_network_params(resolver, l2) * 10**-4 loss = mse_loss + reg_l2 # Compute weight updates updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01) # mean session reward mean_session_reward = env.rewards.sum(axis=1).mean() # # Compile train and evaluation functions print('compiling') train_fun = theano.function([], [loss, mean_session_reward], updates=updates) evaluation_fun = theano.function( [], [loss, mse_loss, reg_l2, mean_session_reward]) print("I've compiled!") # # Training loop for epoch_counter in range(10): update_pool(env, pool, replay_seq_len) loss, avg_reward = train_fun() full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun() print("epoch %i,loss %.5f, rewards: %.5f " % (epoch_counter, full_loss, avg_reward_current)) print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
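The comment above mentions keeping a larger pool of sessions and training on random batches of them. A hedged sketch of that idea, reusing only step, pool.interact and env.load_sessions from the code above; the buffer size, batch constant and helper names are illustrative and assume pool.interact returns arrays indexed as [game, time, ...].

import numpy as np

session_buffer = []        # list of (observations, actions, rewards, is_alive, prev_memories)
MAX_BUFFER_SESSIONS = 500
REPLAY_BATCH = 32

def record_sessions(pool, n_steps=100):
    """play a few steps and store each parallel game as a separate session"""
    prev_memories = list(pool.prev_memory_states)
    obs, act, rew, _, alive, _ = pool.interact(step, n_steps=n_steps)
    for i in range(obs.shape[0]):
        session_buffer.append((obs[i], act[i], rew[i], alive[i],
                               [m[i] for m in prev_memories]))
    del session_buffer[:-MAX_BUFFER_SESSIONS]   # keep only the most recent sessions

def load_random_batch(env):
    """load a random subset of stored sessions into the replay environment"""
    idx = np.random.randint(0, len(session_buffer), REPLAY_BATCH)
    obs, act, rew, alive, mems = zip(*[session_buffer[i] for i in idx])
    prev_memories = [np.array(m) for m in zip(*mems)]
    env.load_sessions(np.array(obs), np.array(act), np.array(rew),
                      np.array(alive), prev_memories)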
def __init__(self, num_units, observation_input, prev_state_input, resetgate=Gate(W_cell=None), updategate=Gate(W_cell=None), hidden_update=Gate(W_cell=None, nonlinearity=nonlinearities.tanh), grad_clipping=5., **kwargs): """ a Gated Recurrent Unit implementation of a memory layer. Unlike lasagne.layers.GRUlayer, this layer does not produce the whole time series at a time, but yields it's next state given last state and observation one tick at a time. This is done to simplify usage within external loops along with other MDP components. parameters: - num_units: amount of units in the hidden state. - If you are using prev_state_input, put anything here. - observation_input - a lasagne layer that provides float[batch_id, input_id]: input observation at this tick -- as an output. - prev_state_input [optional] - a lasagne layer that generates the previous batch of hidden states (in case you wish several layers to handle the same sequence) - concatenate_input: if true, appends observation_input of current tick to own activation at this tick instance that - generates first (a-priori) agent state - determines new agent state given previous agent state and an observation|previous input """ assert len(prev_state_input.output_shape) ==2 if len(observation_input.output_shape) !=2: observation_input = flatten(observation_input,outdim=2) assert len(observation_input.output_shape) == 2 #default name if "name" not in kwargs: kwargs["name"] = "YetAnother"+self.__class__.__name__ self.num_units = num_units super(GRUMemoryLayer, self).__init__([prev_state_input,observation_input], **kwargs) self.grad_clipping = grad_clipping # Retrieve the dimensionality of the incoming layer last_state_shape, observation_shape = self.input_shapes # Input dimensionality is the output dimensionality of the input layer last_num_units = np.prod(last_state_shape[1:]) inp_num_inputs = np.prod(observation_shape[1:]) #hidden shapes must match assert last_num_units == self.num_units def add_gate_params(gate, gate_name): """ Convenience function for adding layer parameters from a Gate instance. """ return (self.add_param(gate.W_in, (inp_num_inputs, num_units), name="W_in_to_{}".format(gate_name)), self.add_param(gate.W_hid, (num_units, num_units), name="W_hid_to_{}".format(gate_name)), self.add_param(gate.b, (num_units,), name="b_{}".format(gate_name), regularizable=False), gate.nonlinearity) # Add in all parameters from gates (self.W_in_to_updategate, self.W_hid_to_updategate, self.b_updategate, self.nonlinearity_updategate) = add_gate_params(updategate, 'updategate') (self.W_in_to_resetgate, self.W_hid_to_resetgate, self.b_resetgate, self.nonlinearity_resetgate) = add_gate_params(resetgate, 'resetgate') (self.W_in_to_hidden_update, self.W_hid_to_hidden_update, self.b_hidden_update, self.nonlinearity_hid) = add_gate_params( hidden_update, 'hidden_update') # Stack input weight matrices into a (num_inputs, 3*num_units) # matrix, which speeds up computation self.W_in_stacked = T.concatenate( [self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update], axis=1) # Same for hidden weight matrices self.W_hid_stacked = T.concatenate( [self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update], axis=1) # Stack gate biases into a (3*num_units) vector self.b_stacked = T.concatenate( [self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0)
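Since this layer computes a single tick, it is driven from an ordinary Python loop: compile a step function that maps (previous state, observation) to the new state and call it repeatedly. A small sketch, assuming the layer's output is the updated hidden state as its docstring describes (obs_size and n_units are arbitrary):

import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne.layers import InputLayer

obs_size, n_units = 40, 16
prev_state = InputLayer((None, n_units), name='previous GRU state')
observation = InputLayer((None, obs_size), name='observation')
gru = GRUMemoryLayer(n_units, observation, prev_state)

prev_var = T.matrix('prev_state')
obs_var = T.matrix('observation')
new_state = lasagne.layers.get_output(gru, {prev_state: prev_var,
                                            observation: obs_var})
gru_step = theano.function([prev_var, obs_var], new_state)

state = np.zeros((1, n_units), dtype='float32')
for tick_obs in np.random.randn(10, 1, obs_size).astype('float32'):
    state = gru_step(state, tick_obs)   # one tick at a time, as in an external agent loop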
def create_nn(): ''' Returns the theano function - train,test Returns the 'X-KerasNet' Using default values of adam - learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08 Input to the NN is (batch_size,3,32,32) and corresponding classes it belong to (batch_size,) ''' a_l_in = InputLayer((batch_size,1,32,32)) a_l_in_bn = BatchNormLayer(a_l_in) b_l_in = InputLayer((batch_size,1,32,32)) b_l_in_bn = BatchNormLayer(b_l_in) c_l_in = InputLayer((batch_size,1,32,32)) c_l_in_bn = BatchNormLayer(c_l_in) a_conv1 = Conv2DLayer(a_l_in_bn,pad='same',num_filters=32,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx32x32x32 b_conv1 = Conv2DLayer(b_l_in_bn,pad='same',num_filters=16,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx16x32x32 c_conv1 = Conv2DLayer(c_l_in_bn,pad='same',num_filters=16,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx16x32x32 a_conv1_1 = Conv2DLayer(a_conv1,pad='same',num_filters=32,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx32x32x32 b_conv1_1 = Conv2DLayer(b_conv1,pad='same',num_filters=16,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx16x32x32 c_conv1_1 = Conv2DLayer(c_conv1,pad='same',num_filters=16,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx16x32x32 a_mp1 = MaxPool2DLayer(a_conv1_1,pool_size=(2,2)) #Bx32x16x16 b_mp1 = MaxPool2DLayer(b_conv1_1,pool_size=(2,2)) #Bx16x16x16 c_mp1 = MaxPool2DLayer(c_conv1_1,pool_size=(2,2)) #Bx16x16x16 a_do1 = dropout(a_mp1,p=0.25) #Bx32x16x16 b_do1 = dropout(b_mp1,p=0.25) #Bx16x16x16 c_do1 = dropout(c_mp1,p=0.25) #Bx16x16x16 #Exchange of feature maps a_to_bc = Conv2DLayer(a_do1,pad='same',num_filters=32,filter_size=(1,1),nonlinearity=lasagne.nonlinearities.rectify) #Bx32x16x16 b_to_a = Conv2DLayer(b_do1,pad='same',num_filters=16,filter_size=(1,1),nonlinearity=lasagne.nonlinearities.rectify) #Bx16x16x16 c_to_a = Conv2DLayer(c_do1,pad='same',num_filters=16,filter_size=(1,1),nonlinearity=lasagne.nonlinearities.rectify) #Bx16x16x16 #Merging a_merge1 = lasagne.layers.ConcatLayer([a_do1,b_to_a,c_to_a]) #Bx64x16x16 b_merge1 = lasagne.layers.ConcatLayer([b_do1,a_to_bc]) #Bx48x16x16 c_merge1 = lasagne.layers.ConcatLayer([c_do1,a_to_bc]) #Bx48x16x16 a_conv2 = Conv2DLayer(a_merge1,pad='same',num_filters=64,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx64x16x16 b_conv2 = Conv2DLayer(b_merge1,pad='same',num_filters=32,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx32x16x16 c_conv2 = Conv2DLayer(c_merge1,pad='same',num_filters=32,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx32x16x16 a_conv2_1 = Conv2DLayer(a_conv2,pad='same',num_filters=64,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx64x16x16 b_conv2_1 = Conv2DLayer(b_conv2,pad='same',num_filters=32,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx32x16x16 c_conv2_1 = Conv2DLayer(c_conv2,pad='same',num_filters=32,filter_size=(3,3),nonlinearity=lasagne.nonlinearities.rectify) #Bx32x16x16 a_mp2 = MaxPool2DLayer(a_conv2_1,pool_size=(2,2)) #Bx64x8x8 b_mp2 = MaxPool2DLayer(b_conv2_1,pool_size=(2,2)) #Bx32x8x8 c_mp2 = MaxPool2DLayer(c_conv2_1,pool_size=(2,2)) #Bx32x8x8 a_do2 = dropout(a_mp2,p=0.25) #Bx64x8x8 b_do2 = dropout(b_mp2,p=0.25) #Bx32x8x8 c_do2 = dropout(c_mp2,p=0.25) #Bx32x8x8 #Final Merge merge2 = lasagne.layers.ConcatLayer([a_do2,b_do2,c_do2]) #Bx128x8x8 flat = flatten(merge2,2) #Bx8192 fc = DenseLayer(flat,num_units=512,nonlinearity=lasagne.nonlinearities.rectify) #Bx512 fc_do = 
dropout(fc, p=0.5) network = DenseLayer(fc_do, num_units=nb_classes, nonlinearity=lasagne.nonlinearities.softmax) #Bxnb_classes net_output = lasagne.layers.get_output(network) test_output = lasagne.layers.get_output(network, deterministic=True) #disable dropout at test time true_output = T.matrix() all_params = lasagne.layers.get_all_params(network,trainable=True) loss = T.mean(lasagne.objectives.categorical_crossentropy(net_output,true_output)) updates = lasagne.updates.adam(loss,all_params) train = theano.function(inputs= [a_l_in.input_var,b_l_in.input_var,c_l_in.input_var,true_output] , outputs=[net_output,loss], updates = updates) test = theano.function(inputs= [a_l_in.input_var,b_l_in.input_var,c_l_in.input_var], outputs= [test_output]) #the network has three input views, so the test function needs all three input_vars (l_in was undefined) return train,test,network
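An illustrative training-loop sketch for create_nn. Note that the network actually takes three single-channel 32x32 views rather than one 3-channel image, so both train and test need three input arrays; Xa, Xb, Xc and one-hot labels Y are assumed to exist, with lengths that are a multiple of the fixed batch_size.

import numpy as np

train, test, network = create_nn()

for epoch in range(10):
    for start in range(0, len(Xa), batch_size):
        sl = slice(start, start + batch_size)
        _, batch_loss = train(Xa[sl], Xb[sl], Xc[sl], Y[sl])
    # the input layers use a fixed batch size, so evaluate on full batches as well
    preds = test(Xa[:batch_size], Xb[:batch_size], Xc[:batch_size])[0]
    acc = np.mean(preds.argmax(axis=1) == Y[:batch_size].argmax(axis=1))
    print("epoch %d: loss %.4f, batch accuracy %.3f" % (epoch, float(batch_loss), acc))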