def addDropoutLayer(self, **kwargs):
    """Append a dropout layer to the network.

    The new layer consumes the output of the most recently added layer,
    or the network input when no layers have been added yet.  It is
    registered under a unique name of the form ``dropout<N>``.
    """
    if self.all_layers:
        source = self.all_layers[-1]
    else:
        source = self.input_layer
    self.n_dropout_layers += 1
    layer = DropoutLayer(source,
                         name="dropout%i" % self.n_dropout_layers,
                         **kwargs)
    self.all_layers += (layer, )
def main():
    """Train a dropout MLP on MNIST and print test accuracy plus stats."""
    train, test, validation = load_mnist_simple()
    # Earlier configurations tried here:
    #   Layer(30, LQ) + Layer(10, LCE), eta=0.05            -> ~96%
    #   Layer(30, LQ) + Layer(10, SM),  eta=0.001           -> ~68%
    #   Layer(100, LQ) + Layer(10, LCE), eta=0.05, lmbda=5  -> ~98%
    #   DropoutLayer(100, LQ) + Layer(10, LCE), eta=0.05    -> ~97.5%
    with timing(f""):
        net = DNN(input=28 * 28,
                  layers=[DropoutLayer(160, LQ), Layer(10, LCE)],
                  eta=0.05,
                  lmbda=3)
        net.initialize_rand()
        net.learn(train, epochs=30, test=validation, batch_size=29)
        print('test:', net.test(test))
        print(net.stats())
def __init__(self, num_input=256, num_hidden=[512,512], num_output=256, clip_at=0.0, scale_norm=0.0):
    # Build an LSTM sequence model: input -> (LSTM -> dropout, only when
    # num_hidden is a plain int) -> fully-connected softmax output, then
    # compile Theano train/predict functions.
    # NOTE(review): mutable default `num_hidden=[512,512]` is shared across
    # calls; `types.IntType` is Python 2 only.
    X = T.fmatrix()            # input batch
    Y = T.imatrix()            # integer targets
    lr = T.fscalar()           # learning rate
    alpha = T.fscalar()        # NOTE(review): passed to train_func but never used in the graph
    reg = T.fscalar()          # regularization strength
    dropout_prob = T.fscalar() # dropout probability (fed at call time)
    self.num_input = num_input
    self.num_hidden = num_hidden
    self.num_output = num_output
    self.clip_at = clip_at
    self.scale_norm = scale_norm
    inputs = InputLayer(X, name='inputs')
    num_prev = num_input
    prev_layer = inputs
    self.layers = [inputs]
    # Only the scalar num_hidden case builds a hidden stack here; the list
    # case (the default!) falls straight through to the output layer.
    # TODO confirm whether a list-of-sizes branch is missing.
    if type(num_hidden) is types.IntType:
        lstm = LSTMLayer(num_prev, num_hidden, input_layers=[prev_layer], name="lstm", go_backwards=False)
        num_prev = num_hidden
        prev_layer = lstm
        self.layers.append(prev_layer)
        prev_layer = DropoutLayer(prev_layer, dropout_prob=dropout_prob)
        self.layers.append(prev_layer)
    FC = FullyConnectedLayer(num_prev, num_output, input_layers=[prev_layer], name="yhat")
    self.layers.append(FC)
    Y_hat = FC.output()
    # convert logits to probabilities
    Y_hat = T.nnet.softmax(Y_hat)
    params = get_params(self.layers)
    caches = make_caches(params)  # NOTE(review): unused below
    # NOTE(review): `loss` is never defined in this constructor — this line
    # raises NameError as written; the loss expression (presumably built
    # from Y_hat and Y) appears to be missing.
    updates, grads = momentum(loss, params, lr, reg)
    self.train_func = theano.function([X, Y, lr, reg, dropout_prob, alpha], loss, updates=updates, allow_input_downcast=True)
    self.predict_sequence_func = theano.function([X, dropout_prob], [Y_hat], allow_input_downcast=True)
def main2():
    """Train the dropout network over the expanded-MNIST pickle shards."""
    net = DNN(input=28 * 28,
              layers=[DropoutLayer(160, LQ), Layer(10, LCE)],
              eta=0.05,
              lmbda=1)  # reaches ~98%
    net.initialize_rand()
    train, test, validation = load_mnist_simple()
    shard_names = [f'mnist_expaned_k0{i}.pkl.gz' for i in range(50)]
    shuffle(shard_names)
    for shard in shard_names:
        print(shard)
        with timing("load"):
            raw = load_data(shard)
        with timing("shuffle"):
            shuffle(raw)
        with timing("reshape"):
            samples = [(x.reshape((784, 1)), y)
                       for x, y in islice(raw, 100000)]
            del raw
        with timing("learn"):
            net.learn(samples)
            del samples
        print('TEST:', net.test(test))
def __init__(self, config):
    # Build an AlexNet-style Theano graph: optional data-augmentation layer,
    # five conv/pool stages, two dropout+FC stages, and a 1000-way softmax.
    # Collects layer params and weight types for the training loop.
    self.config = config
    batch_size = config['batch_size']
    flag_datalayer = config['use_data_layer']
    lib_conv = config['lib_conv']
    # ##################### BUILD NETWORK ##########################
    # allocate symbolic variables for the data
    # 'rand' is a random array used for random cropping/mirroring of data
    x = T.ftensor4('x')
    y = T.ivector('y')
    rand = T.fvector('rand')
    print '... building the model'
    self.layers = []
    params = []
    weight_types = []
    if flag_datalayer:
        # Random 227x227 crop + mirror of the 256x256 input (c01b layout).
        data_layer = DataLayer(input=x, image_shape=(3, 256, 256, batch_size),
                               cropsize=227, rand=rand, mirror=True,
                               flag_rand=config['rand_crop'])
        layer1_input = data_layer.output
    else:
        layer1_input = x
    # Conv1: 96 filters of 11x11, stride 4, followed by 3x3/2 max-pool + LRN.
    convpool_layer1 = ConvPoolLayer(input=layer1_input,
                                    image_shape=(3, 227, 227, batch_size),
                                    filter_shape=(3, 11, 11, 96),
                                    convstride=4, padsize=0, group=1,
                                    poolsize=3, poolstride=2,
                                    bias_init=0.0, lrn=True,
                                    lib_conv=lib_conv,
                                    )
    self.layers.append(convpool_layer1)
    params += convpool_layer1.params
    weight_types += convpool_layer1.weight_type
    # Conv2: 256 filters of 5x5, grouped (2), pool + LRN.
    convpool_layer2 = ConvPoolLayer(input=convpool_layer1.output,
                                    image_shape=(96, 27, 27, batch_size),
                                    filter_shape=(96, 5, 5, 256),
                                    convstride=1, padsize=2, group=2,
                                    poolsize=3, poolstride=2,
                                    bias_init=0.1, lrn=True,
                                    lib_conv=lib_conv,
                                    )
    self.layers.append(convpool_layer2)
    params += convpool_layer2.params
    weight_types += convpool_layer2.weight_type
    # Conv3: 384 filters of 3x3, no pooling (poolsize=1), no LRN.
    convpool_layer3 = ConvPoolLayer(input=convpool_layer2.output,
                                    image_shape=(256, 13, 13, batch_size),
                                    filter_shape=(256, 3, 3, 384),
                                    convstride=1, padsize=1, group=1,
                                    poolsize=1, poolstride=0,
                                    bias_init=0.0, lrn=False,
                                    lib_conv=lib_conv,
                                    )
    self.layers.append(convpool_layer3)
    params += convpool_layer3.params
    weight_types += convpool_layer3.weight_type
    # Conv4: 384 filters of 3x3, grouped (2), no pooling.
    convpool_layer4 = ConvPoolLayer(input=convpool_layer3.output,
                                    image_shape=(384, 13, 13, batch_size),
                                    filter_shape=(384, 3, 3, 384),
                                    convstride=1, padsize=1, group=2,
                                    poolsize=1, poolstride=0,
                                    bias_init=0.1, lrn=False,
                                    lib_conv=lib_conv,
                                    )
    self.layers.append(convpool_layer4)
    params += convpool_layer4.params
    weight_types += convpool_layer4.weight_type
    # Conv5: 256 filters of 3x3, grouped (2), final 3x3/2 max-pool.
    convpool_layer5 = ConvPoolLayer(input=convpool_layer4.output,
                                    image_shape=(384, 13, 13, batch_size),
                                    filter_shape=(384, 3, 3, 256),
                                    convstride=1, padsize=1, group=2,
                                    poolsize=3, poolstride=2,
                                    bias_init=0.0, lrn=False,
                                    lib_conv=lib_conv,
                                    )
    self.layers.append(convpool_layer5)
    params += convpool_layer5.params
    weight_types += convpool_layer5.weight_type
    # Reorder c01b -> bc01 and flatten to (batch, 9216) for the FC stack.
    fc_layer6_input = T.flatten(
        convpool_layer5.output.dimshuffle(3, 0, 1, 2), 2)
    fc_layer6 = FCLayer(input=fc_layer6_input, n_in=9216, n_out=4096)
    self.layers.append(fc_layer6)
    params += fc_layer6.params
    weight_types += fc_layer6.weight_type
    # NOTE(review): the dropout layers are deliberately not appended to
    # self.layers — presumably they hold no trainable params; confirm.
    dropout_layer6 = DropoutLayer(fc_layer6.output, n_in=4096, n_out=4096)
    fc_layer7 = FCLayer(input=dropout_layer6.output, n_in=4096, n_out=4096)
    self.layers.append(fc_layer7)
    params += fc_layer7.params
    weight_types += fc_layer7.weight_type
    dropout_layer7 = DropoutLayer(fc_layer7.output, n_in=4096, n_out=4096)
    softmax_layer8 = SoftmaxLayer(
        input=dropout_layer7.output, n_in=4096, n_out=1000)
    self.layers.append(softmax_layer8)
    params += softmax_layer8.params
    weight_types += softmax_layer8.weight_type
    # #################### NETWORK BUILT #######################
    self.cost = softmax_layer8.negative_log_likelihood(y)
    self.errors = softmax_layer8.errors(y)
    self.errors_top_5 = softmax_layer8.errors_top_x(y, 5)
    self.params = params
    self.x = x
    self.y = y
    self.rand = rand
    self.weight_types = weight_types
    self.batch_size = batch_size
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    # Build a 4-conv-stage CNN regressor over 144x176 single-channel frames,
    # ending in a 1024-unit hidden layer and a regression output, then
    # compile L2-regularized train/predict Theano functions.
    lr = params["lr"]
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]  # minibatch)
    X = T.matrix(name="input", dtype=dtype)  # batch of sequence of vector
    Y = T.matrix(name="output", dtype=dtype)  # batch of sequence of vector
    is_train = T.iscalar(
        'is_train'
    )  # pseudo boolean for switching between training and prediction
    #CNN global parameters.
    subsample = (1, 1)
    p_1 = 0.5              # dropout probability after the first pool
    border_mode = "same"
    cnn_batch_size = batch_size
    pool_size = (2, 2)
    #Layer1: conv2+pool+drop
    filter_shape = (128, 1, 10, 10)
    input_shape = (cnn_batch_size, 1, 144, 176
                   )  #input_shape= (samples, channels, rows, cols)
    input = X.reshape(input_shape)
    c1 = ConvLayer(rng, input, filter_shape, input_shape, border_mode,
                   subsample, activation=nn.relu)
    p1 = PoolLayer(c1.output, pool_size=pool_size,
                   input_shape=c1.output_shape)
    dl1 = DropoutLayer(rng, input=p1.output, prob=p_1, is_train=is_train)
    #Layer2: conv2+pool
    subsample = (1, 1)
    filter_shape = (256, p1.output_shape[1], 3, 3)
    c2 = ConvLayer(rng, dl1.output, filter_shape, p1.output_shape,
                   border_mode, subsample, activation=nn.relu)
    p2 = PoolLayer(c2.output, pool_size=pool_size,
                   input_shape=c2.output_shape)
    #Layer3: conv2+pool
    filter_shape = (256, p2.output_shape[1], 3, 3)
    c3 = ConvLayer(rng, p2.output, filter_shape, p2.output_shape,
                   border_mode, subsample, activation=nn.relu)
    p3 = PoolLayer(c3.output, pool_size=pool_size,
                   input_shape=c3.output_shape)
    #Layer4: conv2+pool
    filter_shape = (128, p3.output_shape[1], 3, 3)
    c4 = ConvLayer(rng, p3.output, filter_shape, p3.output_shape,
                   border_mode, subsample, activation=nn.relu)
    p4 = PoolLayer(c4.output, pool_size=pool_size,
                   input_shape=c4.output_shape)
    #Layer5: hidden
    # Flatten the final feature maps to (batch, features).
    n_in = reduce(lambda x, y: x * y, p4.output_shape[1:])
    x_flat = p4.output.flatten(2)
    h1 = HiddenLayer(rng, x_flat, n_in, 1024, activation=nn.relu)
    #Layer6: hidden
    lreg = LogisticRegression(rng, h1.output, 1024, params['n_output'])
    self.output = lreg.y_pred
    self.params = c1.params + c2.params + c3.params + c4.params + h1.params + lreg.params
    cost = get_err_fn(self, cost_function, Y)
    L2_reg = 0.0001
    L2_sqr = theano.shared(0.)
    # NOTE(review): this sums only param[0] and param[1] — the first two
    # rows/slices of each parameter tensor — rather than the whole tensor.
    # The sibling model below uses T.sum(param ** 2); confirm whether this
    # indexing is intentional or a leftover from (W, b)-pair params.
    for param in self.params:
        L2_sqr += (T.sum(param[0]**2) + T.sum(param[1]**2))
    cost += L2_reg * L2_sqr
    _optimizer = optimizer(cost, self.params, lr=lr)
    self.train = theano.function(inputs=[X, Y, is_train],
                                 outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train],
                                       outputs=self.output,
                                       allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, config):
    # AlexNet variant parameterized by conv backend: shapes follow
    # cuda-convnet's c01b layout when lib_conv == 'cudaconvnet', bc01
    # otherwise.  Grouping and LRN are toggled from config.
    self.config = config
    batch_size = config.batch_size
    lib_conv = config.lib_conv
    group = (2 if config.grouping else 1)
    LRN = (True if config.LRN else False)
    print 'LRN, group', LRN, group
    # ##################### BUILD NETWORK ##########################
    # allocate symbolic variables for the data
    x = T.ftensor4('x')
    y = T.lvector('y')
    print '... building the model with ConvLib %s, LRN %s, grouping %i ' \
        % (lib_conv, LRN, group)
    self.layers = []
    params = []
    weight_types = []
    layer1_input = x
    # Conv1: 96 x 11x11, stride 4.  NOTE(review): cudaconvnet branch uses a
    # 224x224 input with pad 0 while the other branch uses 227x227 / pad 3
    # — confirm both yield the expected 27x27 maps downstream.
    convpool_layer1 = ConvPoolLayer(
        input=layer1_input,
        image_shape=((3, 224, 224, batch_size) if lib_conv == 'cudaconvnet'
                     else (batch_size, 3, 227, 227)),
        filter_shape=((3, 11, 11, 96) if lib_conv == 'cudaconvnet'
                      else (96, 3, 11, 11)),
        convstride=4,
        padsize=(0 if lib_conv == 'cudaconvnet' else 3),
        group=1,
        poolsize=3,
        poolstride=2,
        bias_init=0.0,
        lrn=LRN,
        lib_conv=lib_conv)
    self.layers.append(convpool_layer1)
    params += convpool_layer1.params
    weight_types += convpool_layer1.weight_type
    # Conv2: 256 x 5x5, optional grouping.
    convpool_layer2 = ConvPoolLayer(
        input=convpool_layer1.output,
        image_shape=((96, 27, 27, batch_size) if lib_conv == 'cudaconvnet'
                     else (batch_size, 96, 27, 27)),
        filter_shape=((96, 5, 5, 256) if lib_conv == 'cudaconvnet'
                      else (256, 96, 5, 5)),
        convstride=1,
        padsize=2,
        group=group,
        poolsize=3,
        poolstride=2,
        bias_init=0.1,
        lrn=LRN,
        lib_conv=lib_conv,
        )
    self.layers.append(convpool_layer2)
    params += convpool_layer2.params
    weight_types += convpool_layer2.weight_type
    # Conv3: 384 x 3x3, no pooling, no LRN.
    convpool_layer3 = ConvPoolLayer(
        input=convpool_layer2.output,
        image_shape=((256, 13, 13, batch_size) if lib_conv == 'cudaconvnet'
                     else (batch_size, 256, 13, 13)),
        filter_shape=((256, 3, 3, 384) if lib_conv == 'cudaconvnet'
                      else (384, 256, 3, 3)),
        convstride=1,
        padsize=1,
        group=1,
        poolsize=1,
        poolstride=0,
        bias_init=0.0,
        lrn=False,
        lib_conv=lib_conv,
        )
    self.layers.append(convpool_layer3)
    params += convpool_layer3.params
    weight_types += convpool_layer3.weight_type
    # Conv4: 384 x 3x3, optional grouping, no pooling.
    convpool_layer4 = ConvPoolLayer(
        input=convpool_layer3.output,
        image_shape=((384, 13, 13, batch_size) if lib_conv == 'cudaconvnet'
                     else (batch_size, 384, 13, 13)),
        filter_shape=((384, 3, 3, 384) if lib_conv == 'cudaconvnet'
                      else (384, 384, 3, 3)),
        convstride=1,
        padsize=1,
        group=group,
        poolsize=1,
        poolstride=0,
        bias_init=0.1,
        lrn=False,
        lib_conv=lib_conv,
        )
    self.layers.append(convpool_layer4)
    params += convpool_layer4.params
    weight_types += convpool_layer4.weight_type
    # Conv5: 256 x 3x3, optional grouping, final max-pool.
    convpool_layer5 = ConvPoolLayer(
        input=convpool_layer4.output,
        image_shape=((384, 13, 13, batch_size) if lib_conv == 'cudaconvnet'
                     else (batch_size, 384, 13, 13)),
        filter_shape=((384, 3, 3, 256) if lib_conv == 'cudaconvnet'
                      else (256, 384, 3, 3)),
        convstride=1,
        padsize=1,
        group=group,
        poolsize=3,
        poolstride=2,
        bias_init=0.0,
        lrn=False,
        lib_conv=lib_conv,
        )
    self.layers.append(convpool_layer5)
    params += convpool_layer5.params
    weight_types += convpool_layer5.weight_type
    # Flatten to (batch, 9216); layout fix-up needed only for c01b.
    if lib_conv == 'cudaconvnet':
        fc_layer6_input = T.flatten(
            convpool_layer5.output.dimshuffle(3, 0, 1, 2), 2)
    else:
        fc_layer6_input = convpool_layer5.output.flatten(2)
    fc_layer6 = FCLayer(input=fc_layer6_input, n_in=9216, n_out=4096)
    self.layers.append(fc_layer6)
    params += fc_layer6.params
    weight_types += fc_layer6.weight_type
    dropout_layer6 = DropoutLayer(fc_layer6.output)
    fc_layer7 = FCLayer(input=dropout_layer6.output, n_in=4096, n_out=4096)
    self.layers.append(fc_layer7)
    params += fc_layer7.params
    weight_types += fc_layer7.weight_type
    dropout_layer7 = DropoutLayer(fc_layer7.output)
    softmax_layer8 = SoftmaxLayer(input=dropout_layer7.output,
                                  n_in=4096,
                                  n_out=1000)
    self.layers.append(softmax_layer8)
    params += softmax_layer8.params
    weight_types += softmax_layer8.weight_type
    # #################### NETWORK BUILT #######################
    self.cost = softmax_layer8.negative_log_likelihood(y)
    self.errors = softmax_layer8.errors(y)
    self.errors_top_5 = softmax_layer8.errors_top_x(y, 5)
    self.params = params
    self.x = x
    self.y = y
    # self.rand = rand
    self.weight_types = weight_types
    self.batch_size = batch_size
def __init__(self,rng,params,cost_function='mse',optimizer = RMSprop):
    # CNN front-end (3 conv/pool stages over 120x60 frames) feeding a
    # single-layer LSTM; the initial hidden/cell states H, C are inputs so
    # state can be carried across minibatches by the caller.
    lr=params["lr"]
    n_lstm=params['n_hidden']
    n_out=params['n_output']
    batch_size=params["batch_size"]
    sequence_length=params["seq_length"]
    X = T.tensor3() # batch of sequence of vector
    Y = T.tensor3() # batch of sequence of vector
    is_train = T.iscalar('is_train') # pseudo boolean for switching between training and prediction
    #CNN global parameters.
    subsample=(1,1)
    p_1=0.5              # dropout probability after the first pool
    border_mode="valid"
    # Frames of all sequences are folded into one CNN batch.
    cnn_batch_size=batch_size*sequence_length
    pool_size=(2,2)
    #Layer1: conv2+pool+drop
    filter_shape=(64,1,9,9)
    input_shape=(cnn_batch_size,1,120,60) #input_shape= (samples, channels, rows, cols)
    input= X.reshape(input_shape)
    c1=ConvLayer(rng, input,filter_shape, input_shape,border_mode,subsample, activation=nn.relu)
    p1=PoolLayer(c1.output,pool_size=pool_size,input_shape=c1.output_shape)
    dl1=DropoutLayer(rng,input=p1.output,prob=p_1,is_train=is_train)
    #Layer2: conv2+pool
    filter_shape=(128,p1.output_shape[1],3,3)
    c2=ConvLayer(rng, dl1.output, filter_shape,p1.output_shape,border_mode,subsample, activation=nn.relu)
    p2=PoolLayer(c2.output,pool_size=pool_size,input_shape=c2.output_shape)
    #Layer3: conv2+pool
    filter_shape=(128,p2.output_shape[1],3,3)
    c3=ConvLayer(rng, p2.output,filter_shape,p2.output_shape,border_mode,subsample, activation=nn.relu)
    p3=PoolLayer(c3.output,pool_size=pool_size,input_shape=c3.output_shape)
    #Layer4: hidden
    n_in= reduce(lambda x, y: x*y, p3.output_shape[1:])
    x_flat = p3.output.flatten(2)
    h1=HiddenLayer(rng,x_flat,n_in,1024,activation=nn.relu)
    n_in=1024
    # Unfold frames back into (batch, seq, features) for the recurrence.
    rnn_input = h1.output.reshape((batch_size,sequence_length, n_in))
    #Layer5: LSTM
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng,name='W_hy', sample= 'glorot')
    self.b_y = init_bias(self.n_out,rng=rng, sample='zero')
    layer1=LSTMLayer(rng,0,self.n_in,self.n_lstm)
    self.params = layer1.params
    self.params.append(self.W_hy)
    self.params.append(self.b_y)
    def step_lstm(x_t,h_tm1,c_tm1):
        # One recurrence step: LSTM cell plus linear readout.
        [h_t,c_t,y_t]=layer1.run(x_t,h_tm1,c_tm1)
        y = T.dot(y_t, self.W_hy) + self.b_y
        return [h_t,c_t,y]
    H = T.matrix(name="H",dtype=dtype) # initial hidden state
    C = T.matrix(name="C",dtype=dtype) # initial cell state
    # Scan over time: dimshuffle puts the sequence axis first.
    [h_t,c_t,y_vals], _ = theano.scan(fn=step_lstm,
                                      sequences=[rnn_input.dimshuffle(1,0,2)],
                                      outputs_info=[H, C, None])
    self.output = y_vals.dimshuffle(1,0,2)
    self.params =c1.params+c2.params+c3.params+h1.params+self.params
    cost=get_err_fn(self,cost_function,Y)
    # L2 weight decay over every parameter tensor.
    L2_reg=0.0001
    L2_sqr = theano.shared(0.)
    for param in self.params:
        L2_sqr += (T.sum(param ** 2))
    cost += L2_reg*L2_sqr
    _optimizer = optimizer(cost, self.params, lr=lr)
    # Train/predict also return the final hidden/cell state so the caller
    # can feed them back in as H, C.
    self.train = theano.function(inputs=[X,Y,is_train,H,C],outputs=[cost,h_t[-1],c_t[-1]],updates=_optimizer.getUpdates(),allow_input_downcast=True)
    self.predictions = theano.function(inputs = [X,is_train,H,C], outputs = [self.output,h_t[-1],c_t[-1]],allow_input_downcast=True)
    self.n_param=count_params(self.params)
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    # CNN front-end (3 conv/pool stages over 120x60 frames) feeding a
    # 3-layer LSTM stack with a pre-sampled dropout mask between LSTM
    # layers 1 and 2.  Hidden/cell states are internal shared zeros.
    lr = params["lr"]
    n_lstm = params['n_hidden']
    n_out = params['n_output']
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]  # minibatch)
    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    is_train = T.iscalar(
        'is_train'
    )  # pseudo boolean for switching between training and prediction
    #CNN global parameters.
    subsample = (1, 1)
    p_1 = 0.5              # dropout probability after the first pool
    border_mode = "valid"
    # Frames of all sequences are folded into one CNN batch.
    cnn_batch_size = batch_size * sequence_length
    pool_size = (2, 2)
    #Layer1: conv2+pool+drop
    filter_shape = (64, 1, 9, 9)
    input_shape = (cnn_batch_size, 1, 120, 60
                   )  #input_shape= (samples, channels, rows, cols)
    input = X.reshape(input_shape)
    c1 = ConvLayer(rng, input, filter_shape, input_shape, border_mode,
                   subsample, activation=nn.relu)
    p1 = PoolLayer(c1.output, pool_size=pool_size,
                   input_shape=c1.output_shape)
    dl1 = DropoutLayer(rng, input=p1.output, prob=p_1, is_train=is_train)
    # Inverted-dropout switch done by hand: at test time scale the
    # activations by the retain probability instead of dropping.
    retain_prob = 1. - p_1
    test_output = p1.output * retain_prob
    d1_output = T.switch(T.neq(is_train, 0), dl1.output, test_output)
    #Layer2: conv2+pool
    filter_shape = (128, p1.output_shape[1], 3, 3)
    c2 = ConvLayer(rng, d1_output, filter_shape, p1.output_shape,
                   border_mode, subsample, activation=nn.relu)
    p2 = PoolLayer(c2.output, pool_size=pool_size,
                   input_shape=c2.output_shape)
    #Layer3: conv2+pool
    filter_shape = (128, p2.output_shape[1], 3, 3)
    c3 = ConvLayer(rng, p2.output, filter_shape, p2.output_shape,
                   border_mode, subsample, activation=nn.relu)
    p3 = PoolLayer(c3.output, pool_size=pool_size,
                   input_shape=c3.output_shape)
    #Layer4: hidden
    n_in = reduce(lambda x, y: x * y, p3.output_shape[1:])
    x_flat = p3.output.flatten(2)
    h1 = HiddenLayer(rng, x_flat, n_in, 1024, activation=nn.relu)
    n_in = 1024
    # Unfold frames back into (batch, seq, features) for the recurrence.
    rnn_input = h1.output.reshape((batch_size, sequence_length, n_in))
    #Layer5: gru
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')
    layer1 = LSTMLayer(rng, 0, self.n_in, self.n_lstm)
    layer2 = LSTMLayer(rng, 1, self.n_lstm, self.n_lstm)
    layer3 = LSTMLayer(rng, 2, self.n_lstm, self.n_lstm)
    self.params = layer1.params + layer2.params + layer3.params
    self.params.append(self.W_hy)
    self.params.append(self.b_y)
    def step_lstm(x_t, mask, h_tm1_1, c_tm1_1, h_tm1_2, c_tm1_2, h_tm1_3,
                  c_tm1_3):
        # One step through the 3-layer stack; `mask` is this timestep's
        # slice of the pre-sampled dropout mask applied between L1 and L2.
        [h_t_1, c_t_1, y_t_1] = layer1.run(x_t, h_tm1_1, c_tm1_1)
        dl1 = DropoutLayer(rng, input=y_t_1, prob=0.5, is_train=is_train,
                           mask=mask)
        [h_t_2, c_t_2, y_t_2] = layer2.run(dl1.output, h_tm1_2, c_tm1_2)
        [h_t_3, c_t_3, y_t_3] = layer3.run(y_t_2, h_tm1_3, c_tm1_3)
        y = T.dot(y_t_3, self.W_hy) + self.b_y
        return [h_t_1, c_t_1, h_t_2, c_t_2, h_t_3, c_t_3, y]
    h0_1 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                           dtype=dtype))  # initial hidden state
    c0_1 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                           dtype=dtype))  # initial cell state
    h0_2 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                           dtype=dtype))  # initial hidden state
    c0_2 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                           dtype=dtype))  # initial cell state
    h0_3 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                           dtype=dtype))  # initial hidden state
    c0_3 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                           dtype=dtype))  # initial cell state
    # One dropout mask per timestep, sampled once per compiled graph.
    mask_shape = (sequence_length, batch_size, self.n_lstm)
    p_1 = 0.5
    mask = rng.binomial(size=mask_shape, p=p_1, dtype=X.dtype)
    #(1, 0, 2) -> AxBxC to BxAxC
    #(batch_size,sequence_length, n_in) >> (sequence_length, batch_size ,n_in)
    #T.dot(x_t, self.W_xi)x_t=(sequence_length, batch_size ,n_in), W_xi= [self.n_in, self.n_lstm]
    [h_t_1, c_t_1, h_t_2, c_t_2, h_t_3, c_t_3, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=[rnn_input.dimshuffle(1, 0, 2), mask],
        outputs_info=[h0_1, c0_1, h0_2, c0_2, h0_3, c0_3, None])
    self.output = y_vals.dimshuffle(1, 0, 2)
    self.params = c1.params + c2.params + c3.params + h1.params + self.params
    cost = get_err_fn(self, cost_function,
                      Y)
    _optimizer = optimizer(cost, self.params, lr=lr)
    self.train = theano.function(inputs=[X, Y, is_train],
                                 outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train],
                                       outputs=self.output,
                                       allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, config, testMode): self.config = config batch_size = config['batch_size'] lib_conv = config['lib_conv'] useLayers = config['useLayers'] #imgWidth = config['imgWidth'] #imgHeight = config['imgHeight'] initWeights = config['initWeights'] #if we wish to initialize alexnet with some weights. #need to make changes in layers.py to accept initilizing weights if initWeights: weightsDir = config['weightsDir'] weightFileTag = config['weightFileTag'] prob_drop = config['prob_drop'] # ##################### BUILD NETWORK ########################## x = T.ftensor4('x') mean = T.ftensor4('mean') #y = T.lvector('y') print '... building the model' self.layers = [] params = [] weight_types = [] if useLayers >= 1: convpool_layer1 = ConvPoolLayer(input=x-mean, image_shape=(3, None, None, batch_size), filter_shape=(3, 11, 11, 96), convstride=4, padsize=0, group=1, poolsize=3, poolstride=2, bias_init=0.0, lrn=True, lib_conv=lib_conv, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W_0'+weightFileTag, 'b_0'+weightFileTag] ) self.layers.append(convpool_layer1) params += convpool_layer1.params weight_types += convpool_layer1.weight_type if useLayers >= 2: convpool_layer2 = ConvPoolLayer(input=convpool_layer1.output, image_shape=(96, None, None, batch_size), #change from 27 to appropriate value sbased on conv1's output filter_shape=(96, 5, 5, 256), convstride=1, padsize=2, group=2, poolsize=3, poolstride=2, bias_init=0.1, lrn=True, lib_conv=lib_conv, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W0_1'+weightFileTag, 'W1_1'+weightFileTag, 'b0_1'+weightFileTag, 'b1_1'+weightFileTag] ) self.layers.append(convpool_layer2) params += convpool_layer2.params weight_types += convpool_layer2.weight_type if useLayers >= 3: convpool_layer3 = ConvPoolLayer(input=convpool_layer2.output, image_shape=(256, None, None, batch_size), filter_shape=(256, 3, 3, 384), convstride=1, padsize=1, group=1, poolsize=1, poolstride=0, bias_init=0.0, lrn=False, 
lib_conv=lib_conv, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W_2'+weightFileTag, 'b_2'+weightFileTag] ) self.layers.append(convpool_layer3) params += convpool_layer3.params weight_types += convpool_layer3.weight_type if useLayers >= 4: convpool_layer4 = ConvPoolLayer(input=convpool_layer3.output, image_shape=(384, None, None, batch_size), filter_shape=(384, 3, 3, 384), convstride=1, padsize=1, group=2, poolsize=1, poolstride=0, bias_init=0.1, lrn=False, lib_conv=lib_conv, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W0_3'+weightFileTag, 'W1_3'+weightFileTag, 'b0_3'+weightFileTag, 'b1_3'+weightFileTag] ) self.layers.append(convpool_layer4) params += convpool_layer4.params weight_types += convpool_layer4.weight_type if useLayers >= 5: convpool_layer5 = ConvPoolLayer(input=convpool_layer4.output, image_shape=(384, None, None, batch_size), filter_shape=(384, 3, 3, 256), convstride=1, padsize=1, group=2, poolsize=3, poolstride=2, bias_init=0.0, lrn=False, lib_conv=lib_conv, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W0_4'+weightFileTag, 'W1_4'+weightFileTag, 'b0_4'+weightFileTag, 'b1_4'+weightFileTag] ) self.layers.append(convpool_layer5) params += convpool_layer5.params weight_types += convpool_layer5.weight_type if useLayers >= 6: fc_layer6_input = T.flatten(convpool_layer5.output.dimshuffle(3, 0, 1, 2), 2) fc_layer6 = FCLayer(input=fc_layer6_input, n_in=9216, n_out=4096, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W_5'+weightFileTag, 'b_5'+weightFileTag]) self.layers.append(fc_layer6) params += fc_layer6.params weight_types += fc_layer6.weight_type if testMode: dropout_layer6 = fc_layer6 else: dropout_layer6 = DropoutLayer(fc_layer6.output, n_in=4096, n_out=4096, prob_drop=prob_drop) if useLayers >= 7: fc_layer7 = FCLayer(input=dropout_layer6.output, n_in=4096, n_out=4096, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W_6'+weightFileTag, 'b_6'+weightFileTag]) 
self.layers.append(fc_layer7) params += fc_layer7.params weight_types += fc_layer7.weight_type if testMode: dropout_layer6 = fc_layer7 else: dropout_layer7 = DropoutLayer(fc_layer7.output, n_in=4096, n_out=4096, prob_drop=prob_drop) if useLayers >= 8: softmax_layer8 = SoftmaxLayer(input=dropout_layer7.output, n_in=4096, n_out=1000, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W_7'+weightFileTag, 'b_7'+weightFileTag]) self.layers.append(softmax_layer8) params += softmax_layer8.params weight_types += softmax_layer8.weight_type # #################### NETWORK BUILT ####################### self.output = self.layers[useLayers-1] self.params = params self.x = x self.mean = mean self.weight_types = weight_types self.batch_size = batch_size self.useLayers = useLayers self.outLayer = self.layers[useLayers-1] meanVal = np.load(config['mean_file']) meanVal = meanVal[:, :, :, np.newaxis].astype('float32') #x is 4d, with 'batch' number of images. meanVal has only '1' in the 'batch' dimension. subtraction wont work. meanVal = np.tile(meanVal,(1,1,1,batch_size)) self.meanVal = meanVal #meanVal = np.zeros([3,imgHeight,imgWidth,2], dtype='float32') if useLayers >= 8: #if last layer is softmax, then its output is y_pred finalOut = self.outLayer.y_pred else: finalOut = self.outLayer.output self.forwardFunction = theano.function([self.x, In(self.mean, value=meanVal)], [finalOut])
BranchedLayer([None, ConvLayer(64, [1, 7])]), BranchedLayer([None, ConvLayer(96, 3, padding='valid')]), MergeLayer(axis=3), BranchedLayer([ ConvLayer(192, 3, strides=2, padding='valid'), MaxPoolLayer(3, strides=2, padding='valid') ]), MergeLayer(axis=3), *([inception_a] * args.na), # x4 ConvLayer(1024, 3, strides=2), # reduction_a *([inception_b] * args.nb), # x7 ConvLayer(1536, 3, strides=2), # reduction_b *([inception_c] * args.nc), # x3 GlobalAvgPoolLayer(), FlattenLayer(), DropoutLayer(rate=args.drop_prob) ] data_params = { 'na': args.na, 'nb': args.nb, 'nc': args.nc, 'batch_norm': batch_norm, 'drop_prob': args.drop_prob, 'augmentation': True } cnn = CNN(layers, n_classes=n_classes, batch_size=128, l2_lambda=args.l2_lambda,
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, rad_dim=30, emb=None, ng_embs=None, pixels=None, con_width=None, filters=None, pooling_size=None):
    # Build the tagging graph: character (+ optional radical / n-gram /
    # glyph-pixel) embeddings -> bidirectional RNN -> per-tag linear layer,
    # one compiled sub-graph per sentence-length bucket.  When
    # trained_model is given, the hyper-parameters are pickled there first.
    if trained_model is not None:
        # Persist hyper-parameters so the model can be reloaded later.
        param_dic = {}
        param_dic['nums_chars'] = self.nums_chars
        param_dic['nums_tags'] = self.nums_tags
        param_dic['tag_scheme'] = self.tag_scheme
        param_dic['graphic'] = self.graphic
        param_dic['pic_size'] = self.pic_size
        param_dic['word_vec'] = self.word_vec
        param_dic['radical'] = self.radical
        param_dic['crf'] = self.crf
        param_dic['emb_dim'] = emb_dim
        param_dic['gru'] = gru
        param_dic['rnn_dim'] = rnn_dim
        param_dic['rnn_num'] = rnn_num
        param_dic['drop_out'] = drop_out
        param_dic['filter_size'] = con_width
        param_dic['filters'] = filters
        param_dic['pooling_size'] = pooling_size
        param_dic['font'] = self.font
        param_dic['buckets_char'] = self.buckets_char
        param_dic['ngram'] = self.ngram
        #print param_dic
        f_model = open(trained_model, 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()
    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out
    if self.word_vec:
        # +500 slots of headroom above the known character vocabulary.
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb, name='emb_layer')
    if self.radical:
        # 216 radical classes.
        self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')
    if self.ngram is not None:
        if ng_embs is not None:
            assert len(ng_embs) == len(self.ngram)
        else:
            ng_embs = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ng_embs[i], name=str(i + 2) + 'gram_layer'))
    wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = None, None, None, None, None, None
    if self.graphic:
        # Glyph branch: two conv+maxpool stages over square pixel images,
        # then a dense projection to 100 dims with dropout; each stage is
        # TimeDistributed so it applies per character position.
        self.input_p = []
        assert pixels is not None and filters is not None and pooling_size is not None and con_width is not None
        self.pixels = pixels
        pixel_dim = int(math.sqrt(len(pixels[0])))
        wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'), name='wrapper_c1')
        wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'), name='wrapper_p1')
        p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)
        wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'), name='wrapper_c2')
        wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'), name='wrapper_p2')
        p_size_2 = toolbox.down_pool(p_size_1, pooling_size)
        wrapper_dense = TimeDistributed(HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh', name='conv_dense'), name='wrapper_3')
        wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')
    with tf.variable_scope('BiRNN'):
        # GRU vs LSTM cells, optionally stacked rnn_num deep.
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell]*rnn_num, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell]*rnn_num, state_is_tuple=True)
    # Linear projection from the concatenated fw+bw states to tag scores.
    output_wrapper = TimeDistributed(HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='hidden'), name='wrapper')
    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # All buckets after the first reuse the same variables.
            scope.reuse_variables()
        t1 = time()
        input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_v])
        emb_set = []
        if self.word_vec:
            word_out = self.emb_layer(input_v)
            emb_set.append(word_out)
        if self.radical:
            input_r = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))
            self.input_v[-1].append(input_r)
            radical_out = self.radical_layer(input_r)
            emb_set.append(radical_out)
        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)
        if self.graphic:
            # Run the glyph CNN per time step and append its 100-dim output.
            input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim*pixel_dim])
            self.input_p.append(input_p)
            pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])
            pix_out = tf.unpack(pix_out, axis=1)
            conv_out_1 = wrapper_conv_1(pix_out)
            pooling_out_1 = wrapper_mp_1(conv_out_1)
            conv_out_2 = wrapper_conv_2(pooling_out_1)
            pooling_out_2 = wrapper_mp_2(conv_out_2)
            assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
            pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
            pooling_out = tf.unpack(pooling_out, axis=1)
            graphic_out = wrapper_dense(pooling_out)
            graphic_out = wrapper_dr(graphic_out)
            emb_set.append(graphic_out)
        # Concatenate all feature streams along the embedding axis.
        if len(emb_set) > 1:
            emb_out = tf.concat(2, emb_set)
            emb_out = tf.unpack(emb_out)
        else:
            emb_out = emb_set[0]
        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr, name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)
        output = output_wrapper(rnn_out)
        output_c = tf.pack(output, axis=1)
        self.output.append([output_c])
        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
        self.bucket_dit[bucket] = idx
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)
    assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) and len(self.output) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
def __init__(self, config): self.config = config batch_size = config['batch_size'] batch_size = config['batch_size'] flag_datalayer = config['use_data_layer'] lib_conv = config['lib_conv'] layers = [] params = [] weight_types = [] # ##################### BUILD NETWORK ########################## # allocate symbolic variables for the data # 'rand' is a random array used for random cropping/mirroring of data x1 = T.ftensor4('x1') x2 = T.ftensor4('x2') y = T.lvector('y') # The ground truth to be compared with will go here rand1 = T.fvector('rand1') rand2 = T.fvector('rand2') print '... building the model' if flag_datalayer: data_layerA = DataLayer(input=x1, image_shape=(3, 256, 256, batch_size), cropsize=227, rand=rand, mirror=True, flag_rand=config['rand_crop']) layer1A_input = data_layerA.output else: layer1A_input = x1 if flag_datalayer: data_layerB = DataLayer(input=x2, image_shape=(3, 256, 256, batch_size), cropsize=227, rand=rand, mirror=True, flag_rand=config['rand_crop']) layer1B_input = data_layerB.output else: layer1B_input = x2 fc_layer2_input = T.concatenate( (T.flatten(layer1A_input.dimshuffle(3, 0, 1, 2), 2), T.flatten(layer1B_input.dimshuffle(3, 0, 1, 2), 2)), axis=1) fc_layer2 = FCLayer(input=fc_layer2_input, n_in=154587 * 2, n_out=4096) layers.append(fc_layer2) params += fc_layer2.params weight_types += fc_layer2.weight_type dropout_layer2 = DropoutLayer(fc_layer2.output, n_in=4096, n_out=4096) fc_layer3 = FCLayer(input=dropout_layer2.output, n_in=4096, n_out=4096) layers.append(fc_layer3) params += fc_layer3.params weight_types += fc_layer3.weight_type dropout_layer3 = DropoutLayer(fc_layer3.output, n_in=4096, n_out=4096) # Final softmax layer softmax_layer3 = SoftmaxLayer( input=dropout_layer3.output, n_in=4096, n_out=2) # Only a single binary output is required! 
layers.append(softmax_layer3) params += softmax_layer3.params weight_types += softmax_layer3.weight_type # #################### NETWORK BUILT ####################### self.cost = softmax_layer3.negative_log_likelihood(y) self.errors = softmax_layer3.errors(y) self.errors_top_5 = softmax_layer3.errors_top_x(y, 5) self.x1 = x1 self.x2 = x2 self.y = y self.rand1 = rand1 self.rand2 = rand2 self.layers = layers self.params = params self.weight_types = weight_types self.batch_size = batch_size
def __init__(self, input_size=(1, 28, 28), activation_type=ActivationType.ReLU, hidden_size=50, output_size=10):
    """Build a small VGG-like CNN: six convolution(+activation) layers with
    pooling after layers 2/4/6, then two fully connected layers (dropout
    after layer 7), topped by a softmax-with-loss layer.

    :param input_size: (channels, height, width) of one input sample
    :param activation_type: activation used after conv/hidden layers
    :param hidden_size: width of the first fully connected layer
    :param output_size: number of output classes
    """
    # basic parameters
    self.__activation_type__ = activation_type
    self.params = {}
    self.layers = []

    # set layers
    channel_num = input_size[0]
    for i, param in enumerate([
            {'filter_num': 16, 'filter_size': 3, 'pad': 1, 'stride': 1},
            {'filter_num': 16, 'filter_size': 3, 'pad': 1, 'stride': 1},
            {'filter_num': 32, 'filter_size': 3, 'pad': 1, 'stride': 1},
            {'filter_num': 32, 'filter_size': 3, 'pad': 2, 'stride': 1},
            {'filter_num': 64, 'filter_size': 3, 'pad': 1, 'stride': 1},
            {'filter_num': 64, 'filter_size': 3, 'pad': 1, 'stride': 1},
            {'pre_node_num': 64 * 4 * 4, 'next_node_num': hidden_size},
            {'pre_node_num': hidden_size, 'next_node_num': output_size},
    ]):
        # layer 1 ~ 6: Convolution Layer & activation layer
        if i + 1 in range(1, 7):
            # create convolution layer
            convolution_layer = ConvolutionLayer(
                index=i + 1,
                activation_type=self.__activation_type__,
                filter_num=param['filter_num'],
                channel_num=channel_num,
                filter_size=param['filter_size'],
                stride=param['stride'],
                padding=param['pad'])
            self.layers.append(convolution_layer)
            self.layers.append(
                self.activationLayerFromType(activation_type, index=i + 1))

            # layer 2, 4, 6: Pooling Layer
            if i + 1 in (2, 4, 6):
                self.layers.append(
                    PoolingLayer(index=i + 1, pool_h=2, pool_w=2, stride=2))

            # update next channel num
            channel_num = convolution_layer.filter_num
            layer = convolution_layer

        # layer 7, 8: Hidden Layer (& activation + Dropout after layer 7)
        if i + 1 in (7, 8):
            hidden_layer = HiddenLayer(
                index=i + 1,
                activation_type=self.__activation_type__,
                pre_node_num=param['pre_node_num'],
                next_node_num=param['next_node_num'])
            self.layers.append(hidden_layer)
            if i + 1 == 7:
                self.layers.append(
                    self.activationLayerFromType(activation_type, index=i + 1))
                self.layers.append(DropoutLayer(index=i + 1, dropout_ratio=0.5))
            layer = hidden_layer

        # set W,b
        self.params['W{}'.format(i + 1)] = layer.W
        self.params['b{}'.format(i + 1)] = layer.b
        print('layer {} created'.format(i + 1))
        if Config.IS_DEBUG:
            print('W{} shape : {}'.format(
                i + 1, self.params['W{}'.format(i + 1)].shape))
            # BUG FIX: this line previously looked up 'W{}' again, so it
            # printed the weight matrix's shape instead of the bias shape.
            print('b{} shape : {}'.format(
                i + 1, self.params['b{}'.format(i + 1)].shape))

    # output created layer structures
    for layer in self.layers:
        print(layer.name)

    # keep weight required layer indexes
    self.weight_layer_indexes = []
    for j, layer in enumerate(self.layers):
        if isinstance(layer, (ConvolutionLayer, HiddenLayer)):
            self.weight_layer_indexes.append(j)
    self.debug('weight_layer_indexes {}'.format(self.weight_layer_indexes))

    print('{} layers created'.format(len(self.layers)))

    # last layer: SoftmaxWithLoss Layer
    self.lastLayer = SoftmaxWithLossLayer()
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, rad_dim=30, emb=None,
               ngram_embedding=None, pixels=None, con_width=None, filters=None, pooling_size=None):
    """Build the per-bucket BiRNN tagging graph plus an auxiliary bi-directional
    language model; all resulting tensors are stored on self.

    :param trained_model: path where the model hyper-parameters are pickled (None = do not save)
    :param scope: enclosing tf.variable_scope; variables are reused from the second bucket on
    :param emb_dim: character embedding dimension
    :param gru: True -> GRU cells, False -> LSTM cells
    :param rnn_dim: RNN hidden size
    :param rnn_num: number of stacked RNN layers
    :param drop_out: dropout probability
    :param rad_dim: radical embedding dimension
    :param emb: optional pre-trained character embeddings
    :param ngram_embedding: pre-trained ngram embedding file
    :param pixels: flattened glyph pixel arrays (only used when self.graphic)
    :param con_width: convolution width of the glyph CNN
    :param filters: number of glyph CNN filters
    :param pooling_size: glyph CNN pooling size
    :return: None
    """
    # trained_model: model storage path
    if trained_model is not None:
        # hyper-parameters needed to rebuild this graph at load time
        param_dic = {'nums_chars': self.nums_chars, 'nums_tags': self.nums_tags, 'tag_scheme': self.tag_scheme,
                     'graphic': self.graphic, 'pic_size': self.pic_size, 'word_vec': self.word_vec,
                     'radical': self.radical, 'crf': self.crf, 'emb_dim': emb_dim, 'gru': gru,
                     'rnn_dim': rnn_dim, 'rnn_num': rnn_num, 'drop_out': drop_out, 'filter_size': con_width,
                     'filters': filters, 'pooling_size': pooling_size, 'font': self.font,
                     'buckets_char': self.buckets_char, 'ngram': self.ngram}
        print "RNN dimension is %d" % rnn_dim
        print "RNN number is %d" % rnn_num
        print "Character embedding size is %d" % emb_dim
        print "Ngram embedding dimension is %d" % emb_dim
        # persist the model hyper-parameters
        if self.metric == 'All':
            # rindex() returns the last position of the substring in the
            # string; used here to split directory from file name
            pindex = trained_model.rindex('/') + 1
            for m in self.all_metrics:
                f_model = open(trained_model[:pindex] + m + '_' + trained_model[pindex:], 'w')
                pickle.dump(param_dic, f_model)
                f_model.close()
        else:
            f_model = open(trained_model, 'w')
            pickle.dump(param_dic, f_model)
            f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    # character embedding layer
    # NOTE(review): why nums_chars + 500? presumably head-room for unseen
    # characters -- TODO confirm
    # emb_dim is the per-character feature dimension (command-line settable);
    # weights holds optional pre-trained character vectors
    if self.word_vec:
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb, name='emb_layer')

    # radical embeddings.
    # The Kangxi dictionary lists 214 radicals; only radicals of common
    # characters are used, while rare characters and non-Chinese symbols
    # share two special entries -- 216 entries in total.
    if self.radical:
        self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')

    if self.ngram is not None:
        if ngram_embedding is not None:
            assert len(ngram_embedding) == len(self.ngram)
        else:
            ngram_embedding = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ngram_embedding[i],
                                                   name=str(i + 2) + 'gram_layer'))

    wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = \
        None, None, None, None, None, None

    if self.graphic:
        # glyph (image) features require a small CNN
        self.input_p = []
        assert pixels is not None and filters is not None and pooling_size is not None and con_width is not None

        self.pixels = pixels
        pixel_dim = int(math.sqrt(len(pixels[0])))  # glyphs are square bitmaps

        wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'), name='wrapper_c1')
        wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'), name='wrapper_p1')

        p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)

        wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'), name='wrapper_c2')
        wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'), name='wrapper_p2')

        p_size_2 = toolbox.down_pool(p_size_1, pooling_size)

        wrapper_dense = TimeDistributed(
            HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh', name='conv_dense'),
            name='wrapper_3')
        wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')

    with tf.variable_scope('BiRNN'):
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)

        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num, state_is_tuple=True)

    # hidden layer: the input is the forward RNN output concatenated with
    # the backward RNN output, hence rnn_dim * 2; the output dimension is
    # the number of tags
    output_wrapper = TimeDistributed(
        HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='hidden'), name='wrapper')

    # define model for each bucket
    # sentences in different buckets have different lengths, so each
    # bucket needs its own (weight-shared) graph
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # scope is tf.variable_scope("tagger", reuse=None, initializer=initializer);
            # reuse only needs to be switched on once -- every later bucket reuses too
            scope.reuse_variables()
        t1 = time()

        # input sentences as id sequences, shape = (batch_size, sentence length)
        input_sentences = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_sentences])

        emb_set = []

        if self.word_vec:
            # look up the character embedding for each id;
            # word_out: shape = (batch_size, sentence length, emb_dim)
            word_out = self.emb_layer(input_sentences)
            emb_set.append(word_out)

        if self.radical:
            # radical ids, shape = (batch_size, sentence length)
            input_radicals = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))
            self.input_v[-1].append(input_radicals)
            radical_out = self.radical_layer(input_radicals)
            emb_set.append(radical_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        if self.graphic:
            input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim * pixel_dim])
            self.input_p.append(input_p)

            pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])

            conv_out_1 = wrapper_conv_1(pix_out)
            pooling_out_1 = wrapper_mp_1(conv_out_1)

            conv_out_2 = wrapper_conv_2(pooling_out_1)
            pooling_out_2 = wrapper_mp_2(conv_out_2)

            assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]

            pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
            pooling_out = tf.unstack(pooling_out, axis=1)

            graphic_out = wrapper_dense(pooling_out)
            graphic_out = wrapper_dr(graphic_out)

            emb_set.append(graphic_out)

        if self.window_size > 1:
            # multi-width convolutional window features over characters,
            # combined with the plain embeddings through a highway layer.
            # NOTE(review): uses word_out, i.e. assumes self.word_vec is
            # set whenever window_size > 1 -- TODO confirm
            padding_size = int(np.floor(self.window_size / 2))
            word_padded = tf.pad(word_out, [[0, 0], [padding_size, padding_size], [0, 0]], 'CONSTANT')

            Ws = []
            for q in range(1, self.window_size + 1):
                Ws.append(tf.get_variable("W_%d" % q, shape=[q * emb_dim, self.filters_number]))
            b = tf.get_variable("b", shape=[self.filters_number])

            z = [None for _ in range(0, bucket)]
            for q in range(1, self.window_size + 1):
                for i in range(padding_size, bucket + padding_size):
                    # window of width q centred at padded position i
                    low = i - int(np.floor((q - 1) / 2))
                    high = i + int(np.ceil((q + 1) / 2))
                    x = word_padded[:, low, :]
                    for j in range(low + 1, high):
                        x = tf.concat(values=[x, word_padded[:, j, :]], axis=1)
                    z_iq = tf.tanh(tf.nn.xw_plus_b(x, Ws[q - 1], b))
                    if z[i - padding_size] is None:
                        z[i - padding_size] = z_iq
                    else:
                        z[i - padding_size] = tf.concat([z[i - padding_size], z_iq], axis=1)
            z = tf.stack(z, axis=1)
            # k-max selection over the concatenated window features
            values, indices = tf.nn.top_k(z, sorted=False, k=emb_dim)

            # highway layer
            X = tf.unstack(word_out, axis=1)
            Conv_X = tf.unstack(values, axis=1)
            X_hat = []
            W_t = tf.get_variable("W_t", shape=[emb_dim, emb_dim])
            b_t = tf.get_variable("b_t", shape=[emb_dim])
            for x, conv_x in zip(X, Conv_X):
                T_x = tf.sigmoid(tf.nn.xw_plus_b(x, W_t, b_t))  # transform gate
                X_hat.append(tf.multiply(conv_x, T_x) + tf.multiply(x, 1 - T_x))
            X_hat = tf.stack(X_hat, axis=1)

            emb_set.append(X_hat)

        if len(emb_set) > 1:
            # concatenate all feature embeddings along the feature axis
            # (characters, radicals, n-grams, glyph features, ...)
            emb_out = tf.concat(axis=2, values=emb_set)
        else:
            emb_out = emb_set[0]

        # rnn_out is the concatenation of the forward RNN output and the
        # backward RNN output
        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr, name='BiLSTM' + str(bucket),
                         scope='BiRNN')(self.highway(emb_out, "tag"), input_sentences)

        # apply the fully connected layer (Wx + b) to get the final scores
        output = output_wrapper(rnn_out)

        # NOTE(review): why append [output] rather than output? downstream
        # code appears to expect one list per bucket
        self.output.append([output])

        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])

        self.bucket_dit[bucket] = idx

        # language model
        lm_rnn_dim = rnn_dim
        with tf.variable_scope('LM-BiRNN'):
            if gru:
                lm_fw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
                lm_bw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
            else:
                lm_fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)
                lm_bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)
            if rnn_num > 1:
                lm_fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_fw_rnn_cell] * rnn_num, state_is_tuple=True)
                lm_bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_bw_rnn_cell] * rnn_num, state_is_tuple=True)
        lm_rnn_output = BiLSTM(lm_rnn_dim, fw_cell=lm_fw_rnn_cell, bw_cell=lm_bw_rnn_cell, p=dr,
                               name='LM-BiLSTM' + str(bucket), scope='LM-BiRNN')(self.highway(emb_set[0]),
                                                                                 input_sentences)
        # LM head predicts character ids; + 2 presumably for special
        # symbols -- TODO confirm
        lm_output_wrapper = TimeDistributed(
            HiddenLayer(lm_rnn_dim * 2, self.nums_chars + 2, activation='linear', name='lm_hidden'),
            name='lm_wrapper')
        lm_final_output = lm_output_wrapper(lm_rnn_output)
        self.lm_predictions.append([lm_final_output])
        self.lm_groundtruthes.append([tf.placeholder(tf.int32, [None, bucket], name='lm_targets' + str(bucket))])

        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    # sanity check: exactly one entry per bucket in every parallel list
    assert \
        len(self.input_v) == len(self.output) and \
        len(self.output) == len(self.output_) and \
        len(self.lm_predictions) == len(self.lm_groundtruthes) and \
        len(self.output) == len(self.counts)

    self.params = tf.trainable_variables()

    self.saver = tf.train.Saver()
def image_repr(self, x, rand, config):
    """Build an AlexNet-style convolutional tower for one image and return
    its 4096-d maxout representation.

    :param x: symbolic image batch; layout appears to be
        (channels, rows, cols, batch) -- TODO confirm against DataLayer
    :param rand: random vector driving crop/mirror in the DataLayer
    :param config: dict with 'batch_size', 'use_data_layer', 'lib_conv',
        'rand_crop'
    :return: tuple (fc_layer7, layers, params, weight_types)
    """
    batch_size = config['batch_size']
    flag_datalayer = config['use_data_layer']
    lib_conv = config['lib_conv']

    layers = []
    params = []
    weight_types = []

    # optional random crop (256 -> 227) and mirroring
    if flag_datalayer:
        data_layer = DataLayer(input=x, image_shape=(3, 256, 256, batch_size),
                               cropsize=227, rand=rand, mirror=True,
                               flag_rand=config['rand_crop'])
        layer1_input = data_layer.output
    else:
        layer1_input = x

    # conv1: 96 filters 11x11, stride 4, pool 3/2, LRN
    convpool_layer1 = ConvPoolLayer(
        input=layer1_input,
        image_shape=(3, 227, 227, batch_size),
        filter_shape=(3, 11, 11, 96),
        convstride=4,
        padsize=0,
        group=1,
        poolsize=3,
        poolstride=2,
        bias_init=0.0,
        lrn=True,
        lib_conv=lib_conv,
    )
    layers.append(convpool_layer1)
    params += convpool_layer1.params
    weight_types += convpool_layer1.weight_type

    # conv2: 256 filters 5x5 in 2 groups, pool 3/2, LRN
    convpool_layer2 = ConvPoolLayer(
        input=convpool_layer1.output,
        image_shape=(96, 27, 27, batch_size),
        filter_shape=(96, 5, 5, 256),
        convstride=1,
        padsize=2,
        group=2,
        poolsize=3,
        poolstride=2,
        bias_init=0.1,
        lrn=True,
        lib_conv=lib_conv,
    )
    layers.append(convpool_layer2)
    params += convpool_layer2.params
    weight_types += convpool_layer2.weight_type

    # conv3: 384 filters 3x3, no pooling
    convpool_layer3 = ConvPoolLayer(
        input=convpool_layer2.output,
        image_shape=(256, 13, 13, batch_size),
        filter_shape=(256, 3, 3, 384),
        convstride=1,
        padsize=1,
        group=1,
        poolsize=1,
        poolstride=0,
        bias_init=0.0,
        lrn=False,
        lib_conv=lib_conv,
    )
    layers.append(convpool_layer3)
    params += convpool_layer3.params
    weight_types += convpool_layer3.weight_type

    # conv4: 384 filters 3x3 in 2 groups, no pooling
    convpool_layer4 = ConvPoolLayer(
        input=convpool_layer3.output,
        image_shape=(384, 13, 13, batch_size),
        filter_shape=(384, 3, 3, 384),
        convstride=1,
        padsize=1,
        group=2,
        poolsize=1,
        poolstride=0,
        bias_init=0.1,
        lrn=False,
        lib_conv=lib_conv,
    )
    layers.append(convpool_layer4)
    params += convpool_layer4.params
    weight_types += convpool_layer4.weight_type

    # conv5: 256 filters 3x3 in 2 groups, pool 3/2
    convpool_layer5 = ConvPoolLayer(
        input=convpool_layer4.output,
        image_shape=(384, 13, 13, batch_size),
        filter_shape=(384, 3, 3, 256),
        convstride=1,
        padsize=1,
        group=2,
        poolsize=3,
        poolstride=2,
        bias_init=0.0,
        lrn=False,
        lib_conv=lib_conv,
    )
    layers.append(convpool_layer5)
    params += convpool_layer5.params
    weight_types += convpool_layer5.weight_type

    # flatten to (batch, 9216) = (batch, 256 * 6 * 6) for the FC stage
    fc_layer6_input = T.flatten(
        convpool_layer5.output.dimshuffle(3, 0, 1, 2), 2)
    fc_layer6 = MaxoutLayer(input=fc_layer6_input, n_in=9216, n_out=4096)
    layers.append(fc_layer6)
    params += fc_layer6.params
    weight_types += fc_layer6.weight_type

    dropout_layer6 = DropoutLayer(fc_layer6.output, n_in=4096, n_out=4096)

    fc_layer7 = MaxoutLayer(input=dropout_layer6.output, n_in=4096, n_out=4096)
    layers.append(fc_layer7)
    params += fc_layer7.params
    weight_types += fc_layer7.weight_type

    #dropout_layer7 = DropoutLayer(fc_layer7.output, n_in=4096, n_out=4096)

    # Rename weight types so that weights can be shared
    # ('W' -> 'W0', 'W1', ...; 'b' -> 'b0', 'b1', ...)
    new_weight_types = []
    counter_W = 0
    counter_b = 0
    for w in weight_types:
        if w == 'W':
            new_weight_types.append('W' + str(counter_W))
            counter_W += 1
        elif w == 'b':
            new_weight_types.append('b' + str(counter_b))
            counter_b += 1
    weight_types = new_weight_types

    return fc_layer7, layers, params, weight_types
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    """Build a CNN front-end (3 conv/pool stages + hidden layer) feeding a
    recurrent layer, and compile Theano train/predict functions.

    :param rng: numpy RandomState used for weight initialisation
    :param params: dict with 'lr', 'n_hidden', 'n_output', 'batch_size',
        'seq_length'
    :param cost_function: key understood by get_err_fn (default 'mse')
    :param optimizer: optimizer class (default RMSprop)
    """
    lr = params["lr"]
    n_lstm = params['n_hidden']
    n_out = params['n_output']
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]

    # minibatch)
    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    is_train = T.iscalar(
        'is_train'
    )  # pseudo boolean for switching between training and prediction

    #CNN global parameters.
    subsample = (1, 1)
    p_1 = 0.5  # dropout probability
    border_mode = "valid"
    # all frames of all sequences are pushed through the CNN as one batch
    cnn_batch_size = batch_size * sequence_length
    pool_size = (2, 2)

    #Layer1: conv2+pool+drop
    filter_shape = (64, 1, 9, 9)
    input_shape = (cnn_batch_size, 1, 120, 60
                   )  #input_shape= (samples, channels, rows, cols)
    input = X.reshape(input_shape)
    c1 = ConvLayer(rng,
                   input,
                   filter_shape,
                   input_shape,
                   border_mode,
                   subsample,
                   activation=nn.relu)
    p1 = PoolLayer(c1.output,
                   pool_size=pool_size,
                   input_shape=c1.output_shape)
    dl1 = DropoutLayer(rng, input=p1.output, prob=p_1)
    # at test time dropout is replaced by scaling with the retain probability
    retain_prob = 1. - p_1
    test_output = p1.output * retain_prob
    d1_output = T.switch(T.neq(is_train, 0), dl1.output, test_output)

    #Layer2: conv2+pool
    filter_shape = (128, p1.output_shape[1], 3, 3)
    c2 = ConvLayer(rng,
                   d1_output,
                   filter_shape,
                   p1.output_shape,
                   border_mode,
                   subsample,
                   activation=nn.relu)
    p2 = PoolLayer(c2.output,
                   pool_size=pool_size,
                   input_shape=c2.output_shape)

    #Layer3: conv2+pool
    filter_shape = (128, p2.output_shape[1], 3, 3)
    c3 = ConvLayer(rng,
                   p2.output,
                   filter_shape,
                   p2.output_shape,
                   border_mode,
                   subsample,
                   activation=nn.relu)
    p3 = PoolLayer(c3.output,
                   pool_size=pool_size,
                   input_shape=c3.output_shape)

    #Layer4: hidden
    n_in = reduce(lambda x, y: x * y, p3.output_shape[1:])
    x_flat = p3.output.flatten(2)
    h1 = HiddenLayer(rng, x_flat, n_in, 1024, activation=nn.relu)
    n_in = 1024
    # regroup CNN features per sequence for the recurrent layer
    rnn_input = h1.output.reshape((batch_size, sequence_length, n_in))

    #Layer5: gru
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    # reset gate weights
    # NOTE(review): W_xr is created under the name 'W_xi' -- looks like a
    # copy-paste slip in the variable name; confirm before renaming
    self.W_xr = init_weight((self.n_in, self.n_lstm),
                            rng=rng,
                            name='W_xi',
                            sample='glorot')
    self.W_hr = init_weight((self.n_lstm, self.n_lstm),
                            rng=rng,
                            name='W_hr',
                            sample='glorot')
    self.b_r = init_bias(self.n_lstm, rng=rng, sample='zero')
    # update gate weights
    self.W_xz = init_weight((self.n_in, self.n_lstm),
                            rng=rng,
                            name='W_xz',
                            sample='glorot')
    self.W_hz = init_weight((self.n_lstm, self.n_lstm),
                            rng=rng,
                            name='W_hz',
                            sample='glorot')
    self.b_z = init_bias(self.n_lstm, rng=rng, sample='zero')
    # candidate state weights
    self.W_xh = init_weight((self.n_in, self.n_lstm),
                            rng=rng,
                            name='W_xh',
                            sample='glorot')
    self.W_hh = init_weight((self.n_lstm, self.n_lstm),
                            rng=rng,
                            name='W_hh',
                            sample='glorot')
    self.b_h = init_bias(self.n_lstm, rng=rng, sample='zero')
    # output projection
    self.W_hy = init_weight((self.n_lstm, self.n_out),
                            rng=rng,
                            name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    self.params = [
        self.W_xr, self.W_hr, self.b_r, self.W_xz, self.W_hz, self.b_z,
        self.W_xh, self.W_hh, self.b_h, self.W_hy, self.b_y
    ]

    def step_lstm(x_t, h_tm1):
        # One recurrence step. Despite the name, this implements GRU
        # equations (reset gate r_t, update gate z_t, candidate h_t).
        r_t = T.nnet.sigmoid(
            T.dot(x_t, self.W_xr) + T.dot(h_tm1, self.W_hr) + self.b_r)
        z_t = T.nnet.sigmoid(
            T.dot(x_t, self.W_xz) + T.dot(h_tm1, self.W_hz) + self.b_z)
        h_t = T.tanh(
            T.dot(x_t, self.W_xh) + T.dot((r_t * h_tm1), self.W_hh) +
            self.b_h)
        hh_t = z_t * h_t + (1 - z_t) * h_tm1
        y_t = T.dot(hh_t, self.W_hy) + self.b_y
        return [hh_t, y_t]

    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                         dtype=dtype))  # initial hidden state

    #(1, 0, 2) -> AxBxC to BxAxC
    #(batch_size,sequence_length, n_in) >> (sequence_length, batch_size ,n_in)
    #T.dot(x_t, self.W_xi)x_t=(sequence_length, batch_size ,n_in), W_xi= [self.n_in, self.n_lstm]
    [h_vals, y_vals], _ = theano.scan(fn=step_lstm,
                                      sequences=rnn_input.dimshuffle(1, 0, 2),
                                      outputs_info=[h0, None])

    # back to (batch, time, features)
    self.output = y_vals.dimshuffle(1, 0, 2)

    self.params = c1.params + c2.params + c3.params + h1.params + self.params

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)
    self.train = theano.function(inputs=[X, Y, is_train],
                                 outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train],
                                       outputs=self.output,
                                       allow_input_downcast=True)
    self.n_param = count_params(self.params)
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, emb=None):
    """Build the bucketed BiRNN tagging graph (simpler variant: characters
    plus optional n-grams only; no radicals or glyph CNN).

    :param trained_model: path where the model hyper-parameters are pickled (None = do not save)
    :param scope: enclosing tf.variable_scope; variables are reused from the second bucket on
    :param emb_dim: character embedding dimension
    :param gru: True -> GRU cells, False -> LSTM cells
    :param rnn_dim: RNN hidden size
    :param rnn_num: number of stacked RNN layers
    :param drop_out: dropout probability
    :param emb: optional pre-trained character embeddings
    :return: None
    """
    if trained_model is not None:
        # hyper-parameters needed to rebuild this graph at load time
        param_dic = {'nums_chars': self.nums_chars, 'nums_tags': self.nums_tags, 'crf': self.crf,
                     'emb_dim': emb_dim, 'gru': gru, 'rnn_dim': rnn_dim, 'rnn_num': rnn_num,
                     'drop_out': drop_out, 'buckets_char': self.buckets_char, 'ngram': self.ngram,
                     'is_space': self.is_space, 'sent_seg': self.sent_seg, 'emb_path': self.emb_path,
                     'tag_scheme': self.tag_scheme}
        #print param_dic
        f_model = open(trained_model, 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    # character embedding table; + 20 presumably head-room for unseen
    # characters -- TODO confirm
    self.emb_layer = EmbeddingLayer(self.nums_chars + 20, emb_dim, weights=emb, name='emb_layer')

    if self.ngram is not None:
        # NOTE(review): ng_embs is always all-None here, so n-gram layers
        # are never initialised from pre-trained weights in this variant
        ng_embs = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(EmbeddingLayer(n_gram + 5000 * (i + 2), emb_dim, weights=ng_embs[i],
                                                   name=str(i + 2) + 'gram_layer'))

    with tf.variable_scope('BiRNN'):
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)

        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell]*rnn_num, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell]*rnn_num, state_is_tuple=True)

    # fw + bw outputs are concatenated, hence rnn_dim * 2.
    # NOTE(review): other variants in this codebase pass
    # self.nums_tags[0] here; this one passes self.nums_tags directly --
    # presumably nums_tags is a plain int in this model; confirm
    output_wrapper = TimeDistributed(HiddenLayer(rnn_dim * 2, self.nums_tags, activation='linear',
                                                 name='hidden'), name='wrapper')

    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # reuse only needs to be switched on once
            scope.reuse_variables()
        t1 = time()

        # input sentences as id sequences, shape = (batch_size, bucket length)
        input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_v])

        emb_set = []

        word_out = self.emb_layer(input_v)
        emb_set.append(word_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        if len(emb_set) > 1:
            # concatenate character and n-gram embeddings on the feature axis
            emb_out = tf.concat(2, emb_set)
        else:
            emb_out = emb_set[0]

        # dropout on the embeddings, then split into per-timestep tensors
        emb_out = DropoutLayer(dr)(emb_out)
        emb_out = tf.unpack(emb_out)

        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                         name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)

        # fully connected layer producing per-timestep tag scores
        output = output_wrapper(rnn_out)

        output_c = tf.pack(output, axis=1)

        self.output.append([output_c])

        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])

        self.bucket_dit[bucket] = idx

        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    # sanity check: one entry per bucket in every parallel list
    assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) and len(self.output) == len(self.counts)

    self.params = tf.trainable_variables()

    self.saver = tf.train.Saver()
# Assemble the CNN classifier layer by layer, then train it.
# conv block 1: 32 3x3 filters over the raw input, ReLU + batch norm
model.add_layer(
    Convolution(32, (3, 3),
                input_shape=(batch_size, X_tr.shape[1], X_tr.shape[2],
                             X_tr.shape[3]),
                weight_initializer=NormalInitializer(std)))
model.add_layer(ReLuActivation())
model.add_layer(BatchNormalization())
# conv block 2: same-padded 32-filter conv, ReLU, then 2x2 max pooling
model.add_layer(
    Convolution(32, (3, 3),
                weight_initializer=NormalInitializer(std),
                padding='same'))
model.add_layer(ReLuActivation())
model.add_layer(MaxPool((2, 2)))
model.add_layer(Flatten())
# classifier head: 100-unit hidden layer with dropout, then class logits
model.add_layer(
    Affine(100, weight_initializer=NormalInitializer(std), reg=reg))
model.add_layer(ReLuActivation())
model.add_layer(DropoutLayer(drop_rate=0.3))
model.add_layer(
    Affine(n_classes, weight_initializer=NormalInitializer(std), reg=reg))
model.initialize(loss=CrossEntropyLoss(),
                 optimizer=Adam(learning_rate=0.001,
                                decay_fst_mom=0.9,
                                decay_sec_mom=0.999))
# alternatively, resume from a pickled checkpoint instead of training fresh:
# with open('model_90_49.14262959724404', 'rb') as file:
#     model = pickle.load(file)
model.fit(batch_size, X_tr, y_tr, n_epochs=100, metric=accuracy_metric)
def __init__(self, config):
    """Build the AlexNet graph (5 conv/pool stages, 2 dropout-regularised
    FC layers, softmax) and the shared Theano buffers used for training.

    :param config: dict with keys 'verbose', 'batch_size',
        'use_data_layer', 'lib_conv', 'n_softmax_out', 'rand_crop',
        'input_width', 'input_height', 'file_batch_size',
        'learning_rate', 'momentum', 'weight_decay'.
    """
    ModelBase.__init__(self)

    self.config = config
    self.verbose = self.config['verbose']
    self.name = 'alexnet'
    batch_size = config['batch_size']
    flag_datalayer = config['use_data_layer']
    lib_conv = config['lib_conv']
    n_softmax_out = config['n_softmax_out']

    # ##################### BUILD NETWORK ##########################
    # allocate symbolic variables for the data
    # 'rand' is a random array used for random cropping/mirroring of data
    x = T.ftensor4('x')
    y = T.lvector('y')
    rand = T.fvector('rand')
    lr = T.scalar('lr')

    if self.verbose:
        print 'AlexNet 2/16'

    self.layers = []
    params = []
    weight_types = []

    # optional random crop (256 -> 227) and mirroring
    if flag_datalayer:
        data_layer = DataLayer(input=x,
                               image_shape=(3, 256, 256, batch_size),
                               cropsize=227,
                               rand=rand,
                               mirror=True,
                               flag_rand=config['rand_crop'])
        layer1_input = data_layer.output
    else:
        layer1_input = x

    # conv1: 96 filters 11x11, stride 4, pool 3/2, LRN
    convpool_layer1 = ConvPoolLayer(input=layer1_input,
                                    image_shape=(3, 227, 227, batch_size),
                                    filter_shape=(3, 11, 11, 96),
                                    convstride=4,
                                    padsize=0,
                                    group=1,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.0,
                                    lrn=True,
                                    lib_conv=lib_conv,
                                    verbose=self.verbose)
    self.layers.append(convpool_layer1)
    params += convpool_layer1.params
    weight_types += convpool_layer1.weight_type

    # conv2: 256 filters 5x5 in 2 groups, pool 3/2, LRN
    convpool_layer2 = ConvPoolLayer(input=convpool_layer1.output,
                                    image_shape=(96, 27, 27, batch_size),
                                    filter_shape=(96, 5, 5, 256),
                                    convstride=1,
                                    padsize=2,
                                    group=2,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.1,
                                    lrn=True,
                                    lib_conv=lib_conv,
                                    verbose=self.verbose)
    self.layers.append(convpool_layer2)
    params += convpool_layer2.params
    weight_types += convpool_layer2.weight_type

    # conv3: 384 filters 3x3, no pooling
    convpool_layer3 = ConvPoolLayer(input=convpool_layer2.output,
                                    image_shape=(256, 13, 13, batch_size),
                                    filter_shape=(256, 3, 3, 384),
                                    convstride=1,
                                    padsize=1,
                                    group=1,
                                    poolsize=1,
                                    poolstride=0,
                                    bias_init=0.0,
                                    lrn=False,
                                    lib_conv=lib_conv,
                                    verbose=self.verbose)
    self.layers.append(convpool_layer3)
    params += convpool_layer3.params
    weight_types += convpool_layer3.weight_type

    # conv4: 384 filters 3x3 in 2 groups, no pooling
    convpool_layer4 = ConvPoolLayer(input=convpool_layer3.output,
                                    image_shape=(384, 13, 13, batch_size),
                                    filter_shape=(384, 3, 3, 384),
                                    convstride=1,
                                    padsize=1,
                                    group=2,
                                    poolsize=1,
                                    poolstride=0,
                                    bias_init=0.1,
                                    lrn=False,
                                    lib_conv=lib_conv,
                                    verbose=self.verbose)
    self.layers.append(convpool_layer4)
    params += convpool_layer4.params
    weight_types += convpool_layer4.weight_type

    # conv5: 256 filters 3x3 in 2 groups, pool 3/2
    convpool_layer5 = ConvPoolLayer(input=convpool_layer4.output,
                                    image_shape=(384, 13, 13, batch_size),
                                    filter_shape=(384, 3, 3, 256),
                                    convstride=1,
                                    padsize=1,
                                    group=2,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.0,
                                    lrn=False,
                                    lib_conv=lib_conv,
                                    verbose=self.verbose)
    self.layers.append(convpool_layer5)
    params += convpool_layer5.params
    weight_types += convpool_layer5.weight_type

    # flatten to (batch, 9216) for the fully connected stage
    fc_layer6_input = T.flatten(
        convpool_layer5.output.dimshuffle(3, 0, 1, 2), 2)
    fc_layer6 = FCLayer(input=fc_layer6_input,
                        n_in=9216,
                        n_out=4096,
                        verbose=self.verbose)
    self.layers.append(fc_layer6)
    params += fc_layer6.params
    weight_types += fc_layer6.weight_type

    dropout_layer6 = DropoutLayer(fc_layer6.output,
                                  n_in=4096,
                                  n_out=4096,
                                  verbose=self.verbose)

    fc_layer7 = FCLayer(input=dropout_layer6.output,
                        n_in=4096,
                        n_out=4096,
                        verbose=self.verbose)
    self.layers.append(fc_layer7)
    params += fc_layer7.params
    weight_types += fc_layer7.weight_type

    dropout_layer7 = DropoutLayer(fc_layer7.output,
                                  n_in=4096,
                                  n_out=4096,
                                  verbose=self.verbose)

    softmax_layer8 = SoftmaxLayer(input=dropout_layer7.output,
                                  n_in=4096,
                                  n_out=n_softmax_out,
                                  verbose=self.verbose)
    self.layers.append(softmax_layer8)
    params += softmax_layer8.params
    weight_types += softmax_layer8.weight_type

    # #################### NETWORK BUILT #######################
    self.p_y_given_x = softmax_layer8.p_y_given_x
    self.y_pred = softmax_layer8.y_pred
    self.output = self.p_y_given_x

    self.cost = softmax_layer8.negative_log_likelihood(y)
    self.error = softmax_layer8.errors(y)
    # top-5 error is undefined when fewer than 5 classes exist
    if n_softmax_out < 5:
        self.error_top_5 = softmax_layer8.errors_top_x(y, n_softmax_out)
    else:
        self.error_top_5 = softmax_layer8.errors_top_x(y, 5)
    self.params = params

    # inputs
    self.x = x
    self.y = y
    self.rand = rand
    self.lr = lr
    # shared buffers sized for loading one large file batch at a time
    self.shared_x = theano.shared(
        np.zeros(
            (3, config['input_width'], config['input_height'],
             config['file_batch_size']),  # for loading large batch
            dtype=theano.config.floatX),
        borrow=True)

    self.shared_y = theano.shared(np.zeros((config['file_batch_size'], ),
                                           dtype=int),
                                  borrow=True)
    self.shared_lr = theano.shared(np.float32(config['learning_rate']))

    # training related
    self.base_lr = np.float32(config['learning_rate'])
    self.step_idx = 0
    self.mu = config['momentum']  # def: 0.9 # momentum
    self.eta = config['weight_decay']  #0.0002 # weight decay

    self.weight_types = weight_types
    self.batch_size = batch_size

    self.grads = T.grad(self.cost, self.params)

    subb_ind = T.iscalar('subb')  # sub batch index
    #print self.shared_x[:,:,:,subb_ind*self.batch_size:(subb_ind+1)*self.batch_size].shape.eval()
    self.subb_ind = subb_ind
    # views into the large file batch: one training sub-batch each
    self.shared_x_slice = self.shared_x[:, :, :, subb_ind *
                                        self.batch_size:(subb_ind + 1) *
                                        self.batch_size]
    self.shared_y_slice = self.shared_y[subb_ind *
                                        self.batch_size:(subb_ind + 1) *
                                        self.batch_size]
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num,
               fnn_dim, window_size, drop_out=0.5, rad_dim=30, emb=None,
               ng_embs=None, pixels=None, con_width=None, filters=None,
               pooling_size=None):
    """Build the TensorFlow tagging graph, one sub-graph per sentence bucket.

    Assembles embedding channels (characters, optionally radicals, n-grams
    and a small conv-net over glyph pixels), concatenates them, and feeds
    them either to a bidirectional RNN (``self.mode == 'RNN'``) or to a
    windowed 1-layer convolutional FNN, followed by a linear projection to
    tag scores. One copy of the graph is created for every bucket length in
    ``self.buckets_char``; variables are shared across buckets via
    ``scope.reuse_variables()``.

    :param trained_model: path to dump the hyper-parameter dict to (pickle);
        ``None`` skips the dump.
    :param scope: tf variable scope whose variables are reused from the
        second bucket onwards.
    :param emb_dim: character / n-gram embedding dimensionality.
    :param gru: truthy -> GRU cells, falsy -> LSTM cells (RNN mode only).
    :param rnn_dim: RNN hidden size; rnn_num: number of stacked RNN layers.
    :param fnn_dim, window_size: conv-FNN width and half-window (FNN mode).
    :param drop_out: dropout probability stored for later feeding.
    :param rad_dim: radical embedding size; emb/ng_embs: optional pretrained
        weights; pixels/con_width/filters/pooling_size: glyph conv-net
        settings (required when ``self.graphic`` is set).
    """
    if trained_model is not None:
        # Serialize every hyper-parameter needed to rebuild this graph.
        param_dic = {}
        param_dic['nums_chars'] = self.nums_chars
        param_dic['nums_tags'] = self.nums_tags
        param_dic['tag_scheme'] = self.tag_scheme
        param_dic['graphic'] = self.graphic
        param_dic['pic_size'] = self.pic_size
        param_dic['word_vec'] = self.word_vec
        param_dic['radical'] = self.radical
        param_dic['crf'] = self.crf
        param_dic['emb_dim'] = emb_dim
        param_dic['gru'] = gru
        param_dic['rnn_dim'] = rnn_dim
        param_dic['rnn_num'] = rnn_num
        param_dic['fnn_dim'] = fnn_dim
        param_dic['window_size'] = window_size
        param_dic['drop_out'] = drop_out
        param_dic['filter_size'] = con_width
        param_dic['filters'] = filters
        param_dic['pooling_size'] = pooling_size
        param_dic['font'] = self.font
        param_dic['buckets_char'] = self.buckets_char
        param_dic['ngram'] = self.ngram
        param_dic['mode'] = self.mode
        #print param_dic
        if self.metric == 'All':
            # One parameter file per metric: insert the metric name after
            # the last '/' of the model path.
            pindex = trained_model.rindex('/') + 1
            for m in self.all_metrics:
                # NOTE(review): text-mode 'w' works for pickle protocol 0 on
                # Python 2 only — confirm if this file is ever run on Py3.
                f_model = open(
                    trained_model[:pindex] + m + '_' + trained_model[pindex:],
                    'w')
                pickle.dump(param_dic, f_model)
                f_model.close()
        else:
            f_model = open(trained_model, 'w')
            pickle.dump(param_dic, f_model)
            f_model.close()

    # define shared weights and variables
    # Dropout probability is fed at run time through this placeholder.
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    #concat_emb_dim = emb_dim * 2
    # Total width of the concatenated embedding channels; grown below as
    # each optional channel is enabled.
    concat_emb_dim = 0

    if self.word_vec:
        # +500 slots of head-room over the known character vocabulary.
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500,
                                        emb_dim,
                                        weights=emb,
                                        name='emb_layer')
        concat_emb_dim += emb_dim

    if self.radical:
        # 216 = fixed radical inventory size used by this table.
        self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')
        concat_emb_dim += rad_dim

    if self.ngram is not None:
        if ng_embs is not None:
            assert len(ng_embs) == len(self.ngram)
        else:
            ng_embs = [None for _ in range(len(self.ngram))]
        # One embedding table per n-gram order (2-gram, 3-gram, ...);
        # vocabulary padded by 1000 * order for unseen n-grams.
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 1000 * (i + 2),
                               emb_dim,
                               weights=ng_embs[i],
                               name=str(i + 2) + 'gram_layer'))
            concat_emb_dim += emb_dim

    # Glyph conv-net wrappers (built once, applied inside each bucket).
    wrapper_conv_1, wrapper_mp_1, wrapper_conv_2 = None, None, None
    wrapper_mp_2, wrapper_dense, wrapper_dr = None, None, None
    if self.graphic:
        self.input_p = []
        assert pixels is not None and filters is not None and pooling_size is not None and con_width is not None
        self.pixels = pixels
        # Glyph bitmaps are square; recover the side length.
        pixel_dim = int(math.sqrt(len(pixels[0])))
        wrapper_conv_1 = Convolution(con_width, 1, filters, name='conv_1')
        wrapper_mp_1 = Maxpooling(pooling_size, pooling_size, name='pooling_1')
        p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)
        wrapper_conv_2 = Convolution(con_width, filters, filters, name='conv_2')
        wrapper_mp_2 = Maxpooling(pooling_size, pooling_size, name='pooling_2')
        p_size_2 = toolbox.down_pool(p_size_1, pooling_size)
        # Dense projection of the pooled feature map to a fixed 100-dim
        # channel appended to the embedding concat.
        wrapper_dense = HiddenLayer(p_size_2 * p_size_2 * filters,
                                    100,
                                    activation='tanh',
                                    name='conv_dense')
        wrapper_dr = DropoutLayer(self.drop_out)
        concat_emb_dim += 100

    fw_rnn_cell, bw_rnn_cell = None, None
    if self.mode == 'RNN':
        with tf.variable_scope('BiRNN'):
            if gru:
                fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
                bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            else:
                fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim,
                                                      state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim,
                                                      state_is_tuple=True)
            if rnn_num > 1:
                # Stack the same cell rnn_num times in each direction.
                fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(
                    [fw_rnn_cell] * rnn_num, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(
                    [bw_rnn_cell] * rnn_num, state_is_tuple=True)
        # rnn_dim * 2: forward and backward states are concatenated.
        output_wrapper = HiddenLayer(rnn_dim * 2,
                                     self.nums_tags[0],
                                     activation='linear',
                                     name='out_wrapper')
        fnn_weights, fnn_bias = None, None
    else:
        with tf.variable_scope('FNN'):
            # Conv filter spans the full embedding width over a
            # (2 * window_size + 1)-character window.
            fnn_weights = tf.get_variable(
                'conv_w', [2 * window_size + 1, concat_emb_dim, 1, fnn_dim])
            fnn_bias = tf.get_variable(
                'conv_b', [fnn_dim],
                initializer=tf.constant_initializer(0.1))
        output_wrapper = HiddenLayer(fnn_dim,
                                     self.nums_tags[0],
                                     activation='linear',
                                     name='out_wrapper')

    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # All buckets after the first share the first bucket's weights.
            scope.reuse_variables()
        t1 = time()
        input_v = tf.placeholder(tf.int32, [None, bucket],
                                 name='input_' + str(bucket))
        self.input_v.append([input_v])

        # Collect the enabled embedding channels for this bucket.
        emb_set = []
        if self.word_vec:
            word_out = self.emb_layer(input_v)
            emb_set.append(word_out)

        if self.radical:
            input_r = tf.placeholder(tf.int32, [None, bucket],
                                     name='input_r' + str(bucket))
            self.input_v[-1].append(input_r)
            radical_out = self.radical_layer(input_r)
            emb_set.append(radical_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        if self.graphic:
            input_p = tf.placeholder(tf.float32,
                                     [None, bucket, pixel_dim * pixel_dim])
            self.input_p.append(input_p)
            # Run every character glyph through the shared conv-net.
            pix_out = tf.reshape(input_p, [-1, pixel_dim, pixel_dim, 1])
            conv_out_1 = wrapper_conv_1(pix_out)
            pooling_out_1 = wrapper_mp_1(conv_out_1)
            conv_out_2 = wrapper_conv_2(pooling_out_1)
            pooling_out_2 = wrapper_mp_2(conv_out_2)
            assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
            pooling_out = tf.reshape(
                pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
            graphic_out = wrapper_dense(pooling_out)
            graphic_out = wrapper_dr(graphic_out)
            emb_set.append(graphic_out)

        if len(emb_set) > 1:
            # Concatenate the channels along the feature axis.
            emb_out = tf.concat(axis=2, values=emb_set)
        else:
            emb_out = emb_set[0]

        if self.mode == 'RNN':
            rnn_out = BiLSTM(rnn_dim,
                             fw_cell=fw_rnn_cell,
                             bw_cell=bw_rnn_cell,
                             p=dr,
                             name='BiLSTM' + str(bucket),
                             scope='BiRNN')(emb_out, input_v)
            output = output_wrapper(rnn_out)
        else:
            # Pad window_size characters on each side so the VALID conv
            # yields exactly `bucket` positions.
            emb_out = tf.pad(emb_out,
                             [[0, 0], [window_size, window_size], [0, 0]])
            emb_out = tf.reshape(
                emb_out, [-1, bucket + 2 * window_size, concat_emb_dim, 1])
            conv_out = tf.nn.conv2d(emb_out, fnn_weights, [1, 1, 1, 1],
                                    padding='VALID') + fnn_bias
            fnn_out = tf.nn.tanh(conv_out)
            fnn_out = tf.reshape(fnn_out, [-1, bucket, fnn_dim])
            output = output_wrapper(fnn_out)

        self.output.append([output])
        # Gold-tag placeholder matching this bucket's length.
        self.output_.append([
            tf.placeholder(tf.int32, [None, bucket],
                           name='tags' + str(bucket))
        ])
        self.bucket_dit[bucket] = idx
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) \
           and len(self.output) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
def main_graph(self, trained_model, scope, emb_dim, cell, rnn_dim, rnn_num,
               drop_out=0.5, emb=None):
    """Build a multi-GPU sentence-pair BiLSTM tagging graph, per bucket.

    For every bucket length a pair of int32 placeholders (two input
    sentences) is created; the batch is sliced into ``self.num_gpus``
    shards along axis 0, and each shard is embedded, dropout-regularized
    and run through a shared BiLSTM + linear tag projection on its own
    GPU. Variables are shared across buckets via
    ``scope.reuse_variables()``.

    :param trained_model: path to pickle the hyper-parameter dict to;
        ``None`` skips the dump.
    :param scope: tf variable scope reused from the second bucket onwards.
    :param emb_dim: character embedding dimensionality.
    :param cell: 'gru' selects GRU cells, anything else selects LSTM.
    :param rnn_dim: RNN hidden size; rnn_num: stacked RNN layer count.
    :param drop_out: dropout probability stored for later feeding.
    :param emb: optional pretrained embedding weights.
    """
    if trained_model is not None:
        # Everything needed to rebuild this graph at load time.
        param_dic = {
            'nums_chars': self.nums_chars,
            'nums_tags': self.nums_tags,
            'crf': self.crf,
            'emb_dim': emb_dim,
            'cell': cell,
            'rnn_dim': rnn_dim,
            'rnn_num': rnn_num,
            'drop_out': drop_out,
            'buckets_char': self.buckets_char,
            'ngram': self.ngram,
            'is_space': self.is_space,
            'sent_seg': self.sent_seg,
            'emb_path': self.emb_path,
            'tag_scheme': self.tag_scheme
        }
        #print param_dic
        # NOTE(review): text-mode 'w' works for pickle protocol 0 on
        # Python 2 only — confirm if this is ever run on Py3.
        f_model = open(trained_model, 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()

    # define shared weights and variables
    # Per-GPU shard size and dropout probability are fed at run time.
    batch_size_h = tf.placeholder(tf.int32, [], name='batch_size_holder')
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.batch_size_h = batch_size_h
    self.drop_out = dr
    self.drop_out_v = drop_out
    # pdb.set_trace()

    # +20 slots of head-room over the known character vocabulary.
    self.emb_layer = EmbeddingLayer(self.nums_chars + 20,
                                    emb_dim,
                                    weights=emb,
                                    name='emb_layer')

    if self.ngram is not None:
        # One randomly-initialized embedding table per n-gram order;
        # vocabulary padded by 5000 * order for unseen n-grams.
        ng_embs = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 5000 * (i + 2),
                               emb_dim,
                               weights=ng_embs[i],
                               name=str(i + 2) + 'gram_layer'))

    with tf.variable_scope('BiRNN'):
        if cell == 'gru':
            fw_rnn_cell = tf.contrib.rnn.GRUCell(rnn_dim)  #forward
            bw_rnn_cell = tf.contrib.rnn.GRUCell(rnn_dim)  #backward
        else:
            fw_rnn_cell = tf.contrib.rnn.LSTMCell(rnn_dim,
                                                  state_is_tuple=True)
            bw_rnn_cell = tf.contrib.rnn.LSTMCell(rnn_dim,
                                                  state_is_tuple=True)
        if rnn_num > 1:
            # Stack the same cell rnn_num times in each direction.
            fw_rnn_cell = tf.contrib.rnn.MultiRNNCell([fw_rnn_cell] * rnn_num,
                                                      state_is_tuple=True)
            bw_rnn_cell = tf.contrib.rnn.MultiRNNCell([bw_rnn_cell] * rnn_num,
                                                      state_is_tuple=True)

    # rnn_dim * 2: forward and backward states are concatenated.
    output_wrapper = HiddenLayer(rnn_dim * 2,
                                 self.nums_tags,
                                 activation='linear',
                                 name='hidden')

    #define model for each bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # All buckets after the first share the first bucket's weights.
            scope.reuse_variables()
        t1 = time()
        # NOTE(review): batch_size appears unused below — the per-GPU
        # slicing uses the batch_size_h placeholder instead; confirm.
        batch_size = self.real_batches[idx]
        input_v1 = tf.placeholder(tf.int32, [None, bucket],
                                  name='input_1' + str(bucket))
        input_v2 = tf.placeholder(tf.int32, [None, bucket],
                                  name='input_2' + str(bucket))
        self.input_v1.append([input_v1])
        self.input_v2.append([input_v2])

        #output = None
        output = []
        for i in range(self.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                # Shard i of the batch: rows [i*bs, (i+1)*bs).
                input_1 = input_v1[i * batch_size_h:(i + 1) * batch_size_h]
                input_2 = input_v2[i * batch_size_h:(i + 1) * batch_size_h]
                emb_set1 = []
                emb_set2 = []
                word_out1 = self.emb_layer(input_1)
                word_out2 = self.emb_layer(input_2)
                emb_set1.append(word_out1)
                emb_set2.append(word_out2)

                # if self.ngram is not None:
                #     for i in range(len(self.ngram)):
                #         input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                #         self.input_v[-1].append(input_g)
                #         gram_out = self.gram_layers[i](input_g)
                #         emb_set.append(gram_out)

                if len(emb_set1) > 1:
                    emb_out1 = tf.concat(axis=2, values=emb_set1)
                    emb_out2 = tf.concat(axis=2, values=emb_set2)
                else:
                    emb_out1 = emb_set1[0]
                    emb_out2 = emb_set2[0]

                # Dropout on both sentence embeddings before the RNN.
                emb_out1 = DropoutLayer(dr)(emb_out1)
                emb_out2 = DropoutLayer(dr)(emb_out2)

                rnn_out = BiLSTM(rnn_dim,
                                 fw_cell=fw_rnn_cell,
                                 bw_cell=bw_rnn_cell,
                                 p=dr,
                                 name='BiLSTM' + str(bucket),
                                 scope='BiRNN')(emb_out1, emb_out2, input_v1)
                output_g = output_wrapper(rnn_out)
                # if output == None:
                #     output = output_g
                # else:
                #     output = tf.concat([output,output_g],axis = 0)
                #pdb.set_trace()
                output.append(output_g)

        self.output.append([output])
        # Gold tags: one per boundary, hence bucket - 1 positions.
        self.output_.append([
            tf.placeholder(tf.int32, [None, bucket - 1],
                           name='tags' + str(bucket))
        ])
        self.bucket_dit[bucket] = idx
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t1)

    assert len(self.input_v1) == len(self.output)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
def __init__(self,
             input,
             n_in=28**2,
             n_hidden_1=1024,
             n_hidden_2=1024,
             n_hidden_3=1024,
             n_hidden_4=1024,
             n_out=10,
             W_hidden_1=None,
             W_hidden_2=None,
             W_hidden_3=None,
             W_hidden_4=None,
             W_out=None,
             dropout=0.0,
             seed=None):
    """Build a 4-hidden-layer MLP classifier with dropout before each layer.

    Pipeline: dropout -> hidden_1 -> dropout -> hidden_2 -> dropout ->
    hidden_3 -> dropout -> hidden_4 -> dropout -> linear -> softmax.
    Hidden layers use a leaky rectifier (negative slope 0.1); the final
    projection is linear (identity activation) followed by softmax.

    :param input: symbolic input minibatch with n_in features per row
    :param n_in: input dimensionality (default 28*28)
    :param n_hidden_1..n_hidden_4: widths of the four hidden layers
    :param n_out: number of output classes
    :param W_hidden_1..W_hidden_4, W_out: optional pre-set weight matrices
    :param dropout: dropout probability shared by every DropoutLayer
    :param seed: base RNG seed; layers use seed, seed+1, ..., seed+9 so a
        fixed seed makes the whole initialization reproducible. A random
        seed is drawn when None.
    """
    # Leaky ReLU with slope 0.1 on the negative side.
    def leaky_relu(x):
        return T.nnet.relu(x, 0.1)

    if seed is None:
        seed = np.random.randint(int(1e5))

    # Build the four (dropout, hidden) pairs; each pair consumes the
    # previous activation and two consecutive derived seeds.
    widths = (n_in, n_hidden_1, n_hidden_2, n_hidden_3, n_hidden_4)
    preset_weights = (W_hidden_1, W_hidden_2, W_hidden_3, W_hidden_4)
    drop_layers = []
    hidden_layers = []
    flowing = input
    for k in range(4):
        drop = DropoutLayer(input=flowing,
                            seed=seed + 2 * k,
                            dropout=dropout)
        hidden = HiddenLayer(seed=seed + 2 * k + 1,
                             input=drop.output,
                             n_in=widths[k],
                             n_out=widths[k + 1],
                             activation=leaky_relu,
                             W=preset_weights[k])
        drop_layers.append(drop)
        hidden_layers.append(hidden)
        flowing = hidden.output

    (self.dropout_layer_1, self.dropout_layer_2,
     self.dropout_layer_3, self.dropout_layer_4) = drop_layers
    (self.hidden_1, self.hidden_2,
     self.hidden_3, self.hidden_4) = hidden_layers

    # Final dropout, linear projection to class scores, then softmax.
    self.dropout_layer_5 = DropoutLayer(input=flowing,
                                        seed=seed + 8,
                                        dropout=dropout)
    self.linear_layer = HiddenLayer(seed=seed + 9,
                                    input=self.dropout_layer_5.output,
                                    n_in=n_hidden_4,
                                    n_out=n_out,
                                    activation=identity_map,
                                    W=W_out)
    self.softmax_layer = SoftmaxLayer(input=self.linear_layer.output)

    # keep track of model input
    self.input = input

    self.p_y_given_x = self.softmax_layer.p_y_given_x
    self.y_pred = self.softmax_layer.y_pred

    # L1 and squared-L2 norms over every weight matrix (regularizers).
    self.L1 = (abs(self.hidden_1.W).sum() +
               abs(self.hidden_2.W).sum() +
               abs(self.hidden_3.W).sum() +
               abs(self.hidden_4.W).sum() +
               abs(self.linear_layer.W).sum())
    self.L2_sqr = (T.sum(self.hidden_1.W ** 2) +
                   T.sum(self.hidden_2.W ** 2) +
                   T.sum(self.hidden_3.W ** 2) +
                   T.sum(self.hidden_4.W ** 2) +
                   T.sum(self.linear_layer.W ** 2))

    self.mean_log_likelihood = (self.softmax_layer.mean_log_likelihood)
    self.errors = self.softmax_layer.errors

    # All trainable parameters (dropout layers contribute none).
    self.params = (self.hidden_1.params + self.hidden_2.params +
                   self.hidden_3.params + self.hidden_4.params +
                   self.linear_layer.params)