def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Build an NTM training function (cross-entropy cost, no weight decay).

    Args:
        input_size:   dimensionality of each input vector.
        output_size:  dimensionality of each output vector.
        mem_size:     number of memory slots.
        mem_width:    width of each memory slot.
        hidden_sizes: controller hidden-layer sizes; defaults to [100].

    Returns:
        (P, train) where P is the Parameters container and train is a
        compiled theano function (input_seq, output_seq) -> summed
        cross-entropy, with RMSProp updates applied as a side effect.
    """
    # Avoid the shared mutable-default-argument pitfall.
    if hidden_sizes is None:
        hidden_sizes = [100]

    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    # Squash predictions into (5e-6, 1 - 5e-6) so binary_crossentropy
    # never evaluates log(0).
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq), axis=1)
    cost = T.sum(cross_entropy)  # + 1e-3 * l2

    params = P.values()
    # Element-wise clip keeps exploding gradients in check.
    grads = [T.clip(g, -10, 10) for g in T.grad(cost, wrt=params)]

    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=T.sum(cross_entropy),
        updates=updates.rmsprop(params, grads))
    return P, train
def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Build an NTM training function with L2 weight decay.

    Args:
        input_size:   dimensionality of each input vector.
        output_size:  dimensionality of each output vector.
        mem_size:     number of memory slots.
        mem_width:    width of each memory slot.
        hidden_sizes: controller hidden-layer sizes; defaults to [100].

    Returns:
        (P, train) where P is the Parameters container and train is a
        compiled theano function (input_seq, output_seq) -> regularised
        cost, with RMSProp updates applied as a side effect.
    """
    # Avoid the shared mutable-default-argument pitfall.
    if hidden_sizes is None:
        hidden_sizes = [100]

    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    # Squash predictions into (5e-6, 1 - 5e-6) so binary_crossentropy
    # never evaluates log(0).
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq), axis=1)

    params = P.values()
    # Symbolic L2 penalty over all parameters.
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cross_entropy) + 1e-4 * l2

    # Element-wise clip keeps exploding gradients in check.
    grads = [T.clip(g, -10, 10) for g in T.grad(cost, wrt=params)]

    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        # updates=updates.adadelta(params,grads)
        updates=updates.rmsprop(params, grads, learning_rate=1e-5)
    )
    return P, train
def make_batch_train(P, cost, end_id): batch = T.imatrix('batch') costs, disp_costs = cost(batch, P) batch_cost = T.mean(costs) print "Calculating gradient..." params = P.values() grads = T.grad(batch_cost, wrt=params) grads_norms = [T.sqrt(T.sum(g**2)) for g in grads] deltas = [T.switch(T.gt(n, 5), 5 * g / n, g) for n, g in zip(grads_norms, grads)] print "Compiling function..." _train = theano.function( inputs=[batch], outputs=T.mean(disp_costs), updates=updates.rmsprop(params, deltas) ) def train(batch): max_length = max(len(l) for l in batch) batch_array = end_id * np.ones((len(batch), max_length), dtype=np.int32) for i, l in enumerate(batch): batch_array[i, :len(l)] = l return _train(batch_array) print "Done." return train
def turing_updates(cost, lr):
    """Return RMSProp update pairs for the parameter container.

    NOTE(review): this reads a module-level Parameters container `P`
    rather than receiving it as an argument — confirm that is intended.
    """
    params = P.values()
    # whether add P weight decay
    weight_decay = T.sum(0)
    for p in params:
        weight_decay = weight_decay + (p ** 2).sum()
    regularised_cost = cost + 1e-3 * weight_decay
    # Element-wise gradient clipping to [-100, 100].
    clipped_grads = [T.clip(g, -100, 100)
                     for g in T.grad(regularised_cost, wrt=params)]
    return updates.rmsprop(params, clipped_grads, learning_rate=lr)
def make_functions( input_size, output_size, mem_size, mem_width, hidden_sizes=[100]): start_time = time.time() input_seqs = T.btensor3('input_sequences') output_seqs = T.btensor3('output_sequences') P = Parameters() process = model.build(P, input_size, output_size, mem_size, mem_width, hidden_sizes[0]) outputs = process(T.cast(input_seqs,'float32')) output_length = (input_seqs.shape[1] - 2) // 2 Y = output_seqs[:,-output_length:,:-2] Y_hat = T.nnet.sigmoid(outputs[:,-output_length:,:-2]) cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat,Y)) bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2) params = P.values() cost = cross_entropy # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params) print "Computing gradients", grads = T.grad(cost, wrt=params) grads = updates.clip_deltas(grads, np.float32(clip_length)) print "Done. (%0.3f s)"%(time.time() - start_time) start_time = time.time() print "Compiling function", P_learn = Parameters() update_pairs = updates.rmsprop( params, grads, learning_rate=1e-4, P=P_learn ) train = theano.function( inputs=[input_seqs, output_seqs], outputs=cross_entropy, updates=update_pairs, ) test = theano.function( inputs=[input_seqs, output_seqs], outputs=bits_loss ) print "Done. (%0.3f s)"%(time.time() - start_time) print P.parameter_count() return P, P_learn, train, test
def make_functions(input_size, output_size, mem_size, mem_width, hidden_sizes=[100]): start_time = time.time() input_seqs = T.btensor3('input_sequences') output_seqs = T.btensor3('output_sequences') P = Parameters() process = model.build(P, input_size, output_size, mem_size, mem_width, hidden_sizes[0]) outputs = process(T.cast(input_seqs, 'float32')) output_length = (input_seqs.shape[1] - 2) // 2 Y = output_seqs[:, -output_length:, :-2] Y_hat = T.nnet.sigmoid(outputs[:, -output_length:, :-2]) cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat, Y)) bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2) params = P.values() cost = cross_entropy # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params) print "Computing gradients", grads = T.grad(cost, wrt=params) grads = updates.clip_deltas(grads, np.float32(clip_length)) print "Done. (%0.3f s)" % (time.time() - start_time) start_time = time.time() print "Compiling function", P_learn = Parameters() update_pairs = updates.rmsprop(params, grads, learning_rate=1e-4, P=P_learn) train = theano.function( inputs=[input_seqs, output_seqs], outputs=cross_entropy, updates=update_pairs, ) test = theano.function(inputs=[input_seqs, output_seqs], outputs=bits_loss) print "Done. (%0.3f s)" % (time.time() - start_time) print P.parameter_count() return P, P_learn, train, test
def make_train(image_size, word_size, first_hidden_size, proj_size, reg_lambda):
    """Build train/valid/projection/test functions for image-triplet ranking.

    Projects an image vector into the triplet-encoding space and scores
    it against [E, R, E] triplets; training minimises a margin (hinge)
    loss between the correct triplet and a batch of negatives, plus an
    L2 penalty weighted by reg_lambda.

    Returns:
        (P, train, valid, image_project_fun, test)
    """
    # initialize model
    P = Parameters()
    image_projecting = image_project.build(P, image_size, proj_size)
    batched_triplet_encoding, vector_triplet_encoding = \
        triplet_encoding.build(P, word_size, first_hidden_size, proj_size)

    image_vector = T.vector()

    # training inputs: one correct triplet and a batch of negatives
    correct_triplet = [T.vector(dtype='float32'),
                       T.vector(dtype='float32'),
                       T.vector(dtype='float32')]  # [E,R,E]
    negative_triplet = [T.matrix(dtype='float32'),
                        T.matrix(dtype='float32'),
                        T.matrix(dtype='float32')]

    image_projection_vector = image_projecting(image_vector)
    # Broadcast the projected image across the negative batch.
    image_projection_matrix = repeat(
        image_projection_vector.dimshuffle(('x', 0)),
        negative_triplet[0].shape[0], axis=0)
    correct_triplet_encoding_vector = vector_triplet_encoding(
        correct_triplet[0], correct_triplet[1], correct_triplet[2])
    negative_triplet_encoding_matrix = batched_triplet_encoding(
        negative_triplet[0], negative_triplet[1], negative_triplet[2])
    correct_cross_dot_scalar = T.dot(image_projection_vector,
                                     correct_triplet_encoding_vector)
    negative_cross_dot_vector = T.batched_dot(
        image_projection_matrix, negative_triplet_encoding_matrix)

    # margin cost: hinge on (1 - s_correct + s_negative)
    zero_cost = T.zeros_like(negative_cross_dot_vector)
    margin_cost = 1 - correct_cross_dot_scalar + negative_cross_dot_vector
    cost_vector = T.switch(T.gt(zero_cost, margin_cost),
                           zero_cost, margin_cost)

    # regularizer cost
    params = P.values()
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cost_vector) / T.shape(negative_triplet[0])[0] \
        + reg_lambda * l2
    # assume word vector has been put into P
    # unsolved
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    lr = T.scalar(name='learning rate', dtype='float32')
    train = theano.function(
        inputs=[image_vector,
                correct_triplet[0], correct_triplet[1], correct_triplet[2],
                negative_triplet[0], negative_triplet[1], negative_triplet[2],
                lr],
        outputs=cost,
        updates=updates.rmsprop(params, grads, learning_rate=lr),
        allow_input_downcast=True
    )

    # valid: same cost, no parameter updates
    valid = theano.function(
        inputs=[image_vector,
                correct_triplet[0], correct_triplet[1], correct_triplet[2],
                negative_triplet[0], negative_triplet[1], negative_triplet[2]],
        outputs=cost,
        allow_input_downcast=True
    )

    # visualize: expose the raw image projection
    image_project_fun = theano.function(
        inputs=[image_vector],
        outputs=image_projection_vector,
        allow_input_downcast=True
    )

    # testing: score the image against every candidate triplet
    all_triplet = [T.matrix(dtype='float32'),
                   T.matrix(dtype='float32'),
                   T.matrix(dtype='float32')]
    image_projection_matrix_test = repeat(
        image_projection_vector.dimshuffle(('x', 0)),
        all_triplet[0].shape[0], axis=0)
    all_triplet_encoding_matrix = batched_triplet_encoding(
        all_triplet[0], all_triplet[1], all_triplet[2])
    all_cross_dot_vector = T.batched_dot(
        image_projection_matrix_test, all_triplet_encoding_matrix)
    test = theano.function(
        inputs=[image_vector,
                all_triplet[0], all_triplet[1], all_triplet[2]],
        outputs=all_cross_dot_vector,
        allow_input_downcast=True
    )

    return P, train, valid, image_project_fun, test
def make_train(image_size, word_size, first_hidden_size, proj_size, reg_lambda):
    """Build ranking functions plus a "default triplet" training branch.

    In addition to the usual train/valid/test functions over [E, R, E]
    triplets, this builds P_default — learnable default left/right/
    relation vectors — and compiles train_default/test_default which
    substitute the defaults for the two missing triplet slots.

    Returns:
        (P, P_default, train, valid, test, train_default, test_default)
    """
    # initialize model
    P = Parameters()
    image_projecting = image_project.build(P, image_size, proj_size)
    batched_triplet_encoding, vector_triplet_encoding = \
        triplet_encoding.build(P, word_size, first_hidden_size, proj_size)

    image_vector = T.vector()

    # training inputs: one correct triplet and a batch of negatives
    correct_triplet = [T.vector(dtype='float32'),
                       T.vector(dtype='float32'),
                       T.vector(dtype='float32')]  # [E,R,E]
    negative_triplet = [T.matrix(dtype='float32'),
                        T.matrix(dtype='float32'),
                        T.matrix(dtype='float32')]

    image_projection_vector = image_projecting(image_vector)
    # Broadcast the projected image across the negative batch.
    image_projection_matrix = repeat(
        image_projection_vector.dimshuffle(('x', 0)),
        negative_triplet[0].shape[0], axis=0)
    correct_triplet_encoding_vector = vector_triplet_encoding(
        correct_triplet[0], correct_triplet[1], correct_triplet[2])
    negative_triplet_encoding_matrix = batched_triplet_encoding(
        negative_triplet[0], negative_triplet[1], negative_triplet[2])
    correct_cross_dot_scalar = T.dot(image_projection_vector,
                                     correct_triplet_encoding_vector)
    negative_cross_dot_vector = T.batched_dot(
        image_projection_matrix, negative_triplet_encoding_matrix)

    # margin cost: hinge on (1 - s_correct + s_negative)
    zero_cost = T.zeros_like(negative_cross_dot_vector)
    margin_cost = 1 - correct_cross_dot_scalar + negative_cross_dot_vector
    cost_vector = T.switch(T.gt(zero_cost, margin_cost),
                           zero_cost, margin_cost)

    # regularizer cost
    params = P.values()
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cost_vector) / T.shape(negative_triplet[0])[0] \
        + reg_lambda * l2
    # assume word vector has been put into P
    # unsolved
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    lr = T.scalar(name='learning rate', dtype='float32')
    train = theano.function(
        inputs=[image_vector,
                correct_triplet[0], correct_triplet[1], correct_triplet[2],
                negative_triplet[0], negative_triplet[1], negative_triplet[2],
                lr],
        outputs=cost,
        updates=updates.rmsprop(params, grads, learning_rate=lr),
        allow_input_downcast=True
    )

    # valid: same cost, no parameter updates
    valid = theano.function(
        inputs=[image_vector,
                correct_triplet[0], correct_triplet[1], correct_triplet[2],
                negative_triplet[0], negative_triplet[1], negative_triplet[2]],
        outputs=cost,
        allow_input_downcast=True
    )

    # testing: score the image against every candidate triplet
    all_triplet = [T.matrix(dtype='float32'),
                   T.matrix(dtype='float32'),
                   T.matrix(dtype='float32')]
    image_projection_matrix_test = repeat(
        image_projection_vector.dimshuffle(('x', 0)),
        all_triplet[0].shape[0], axis=0)
    all_triplet_encoding_matrix = batched_triplet_encoding(
        all_triplet[0], all_triplet[1], all_triplet[2])
    all_cross_dot_vector = T.batched_dot(
        image_projection_matrix_test, all_triplet_encoding_matrix)
    test = theano.function(
        inputs=[image_vector,
                all_triplet[0], all_triplet[1], all_triplet[2]],
        outputs=all_cross_dot_vector,
        allow_input_downcast=True
    )

    # default: learnable fallback vectors for missing triplet slots
    P_default = Parameters()
    P_default['left'] = 2 * (np.random.rand(word_size) - 0.5)
    P_default['right'] = 2 * (np.random.rand(word_size) - 0.5)
    P_default['relation'] = 2 * (np.random.rand(word_size) - 0.5)

    correct_triplet_d = [T.vector(dtype='float32'),
                         T.vector(dtype='float32'),
                         T.vector(dtype='float32')]  # [E,R,E]
    negative_triplet_d = [T.matrix(dtype='float32'),
                          T.matrix(dtype='float32'),
                          T.matrix(dtype='float32')]
    correct_triplet_d_train = [correct_triplet_d,
                               correct_triplet_d,
                               correct_triplet_d]
    negative_triplet_d_train = [negative_triplet_d,
                                negative_triplet_d,
                                negative_triplet_d]

    cost = 0
    # i selects which slot (0=left entity, 1=relation, 2=right entity)
    # is the real input; the other two use the learnable defaults.
    for i in range(3):
        if i == 0:
            correct_triplet_d_train[0] = [
                correct_triplet_d[0],
                P_default['relation'],
                P_default['right']]
            negative_triplet_d_train[0] = [
                negative_triplet_d[0],
                repeat(P_default['relation'].dimshuffle(('x', 0)),
                       negative_triplet_d[0].shape[0], axis=0),
                repeat(P_default['right'].dimshuffle(('x', 0)),
                       negative_triplet_d[0].shape[0], axis=0)]
        elif i == 1:
            correct_triplet_d_train[1] = [
                P_default['left'],
                correct_triplet_d[1],
                P_default['right']]
            negative_triplet_d_train[1] = [
                repeat(P_default['left'].dimshuffle(('x', 0)),
                       negative_triplet_d[1].shape[0], axis=0),
                negative_triplet_d[1],
                repeat(P_default['right'].dimshuffle(('x', 0)),
                       negative_triplet_d[1].shape[0], axis=0)]
        elif i == 2:
            correct_triplet_d_train[2] = [
                P_default['left'],
                P_default['relation'],
                correct_triplet_d[2]]
            negative_triplet_d_train[2] = [
                repeat(P_default['left'].dimshuffle(('x', 0)),
                       negative_triplet_d[2].shape[0], axis=0),
                repeat(P_default['relation'].dimshuffle(('x', 0)),
                       negative_triplet_d[2].shape[0], axis=0),
                negative_triplet_d[2]]

        image_projection_matrix_d = repeat(
            image_projection_vector.dimshuffle(('x', 0)),
            negative_triplet_d[i].shape[0], axis=0)
        correct_triplet_encoding_vector_d = vector_triplet_encoding(
            correct_triplet_d_train[i][0],
            correct_triplet_d_train[i][1],
            correct_triplet_d_train[i][2])
        negative_triplet_encoding_matrix_d = batched_triplet_encoding(
            negative_triplet_d_train[i][0],
            negative_triplet_d_train[i][1],
            negative_triplet_d_train[i][2])
        correct_cross_dot_scalar_d = T.dot(
            image_projection_vector, correct_triplet_encoding_vector_d)
        negative_cross_dot_vector_d = T.batched_dot(
            image_projection_matrix_d, negative_triplet_encoding_matrix_d)

        # margin cost
        zero_cost_d = T.zeros_like(negative_cross_dot_vector_d)
        margin_cost_d = (1 - correct_cross_dot_scalar_d
                         + negative_cross_dot_vector_d)
        cost_vector_d = T.switch(T.gt(zero_cost_d, margin_cost_d),
                                 zero_cost_d, margin_cost_d)
        # BUG FIX: originally divided by T.shape(negative_triplet[i])[0],
        # a symbolic input of the main graph that is NOT an input of
        # train_default, so theano.function would fail with a
        # missing-input error. Use the default-branch batch instead.
        cost = cost + T.sum(cost_vector_d) \
            / T.shape(negative_triplet_d[i])[0]

    params_d = P_default.values()
    l2 = T.sum(0)
    for p in params_d:
        l2 = l2 + (p ** 2).sum()
    cost = cost + 0.01 * l2
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params_d)]

    train_default = theano.function(
        inputs=[image_vector,
                correct_triplet_d[0], correct_triplet_d[1],
                correct_triplet_d[2],
                negative_triplet_d[0], negative_triplet_d[1],
                negative_triplet_d[2],
                lr],
        outputs=cost,
        updates=updates.rmsprop(params_d, grads, learning_rate=lr),
        allow_input_downcast=True
    )

    all_triplet_d = [T.matrix(dtype='float32'),
                     T.matrix(dtype='float32'),
                     T.matrix(dtype='float32')]
    all_triplet_d_test = [all_triplet_d, all_triplet_d, all_triplet_d]
    result = [[], [], []]
    for i in range(3):
        # BUG FIX: originally sized by all_triplet[i].shape[0], which is
        # not an input of test_default (missing-input error). Use the
        # default-branch candidate batch instead.
        image_projection_matrix_test_d = repeat(
            image_projection_vector.dimshuffle(('x', 0)),
            all_triplet_d[i].shape[0], axis=0)
        if i == 0:
            all_triplet_d_test[0] = [
                all_triplet_d[0],
                repeat(P_default['relation'].dimshuffle(('x', 0)),
                       all_triplet_d[0].shape[0], axis=0),
                repeat(P_default['right'].dimshuffle(('x', 0)),
                       all_triplet_d[0].shape[0], axis=0)]
        elif i == 1:
            all_triplet_d_test[1] = [
                repeat(P_default['left'].dimshuffle(('x', 0)),
                       all_triplet_d[1].shape[0], axis=0),
                all_triplet_d[1],
                repeat(P_default['right'].dimshuffle(('x', 0)),
                       all_triplet_d[1].shape[0], axis=0)]
        elif i == 2:
            all_triplet_d_test[2] = [
                repeat(P_default['left'].dimshuffle(('x', 0)),
                       all_triplet_d[2].shape[0], axis=0),
                repeat(P_default['relation'].dimshuffle(('x', 0)),
                       all_triplet_d[2].shape[0], axis=0),
                all_triplet_d[2]]
        all_triplet_encoding_matrix_d = batched_triplet_encoding(
            all_triplet_d_test[i][0],
            all_triplet_d_test[i][1],
            all_triplet_d_test[i][2])
        result[i] = T.batched_dot(image_projection_matrix_test_d,
                                  all_triplet_encoding_matrix_d)

    test_default = theano.function(
        inputs=[image_vector,
                all_triplet_d[0], all_triplet_d[1], all_triplet_d[2]],
        outputs=result,
        allow_input_downcast=True
    )

    return P, P_default, train, valid, test, train_default, test_default
def __init__(self, input_size, output_size, mem_size, mem_width,
             hidden_sizes, num_heads, max_epochs, momentum,
             learning_rate, grad_clip, l2_norm):
    """Construct the NTM graph and compile train/predict functions.

    grad_clip is a (low, high) pair applied element-wise to gradients;
    l2_norm is the L2 weight-decay coefficient.
    """
    # Hyper-parameters.
    self.input_size = input_size
    self.output_size = output_size
    self.mem_size = mem_size
    self.mem_width = mem_width
    self.hidden_sizes = hidden_sizes
    self.num_heads = num_heads
    self.max_epochs = max_epochs
    self.momentum = momentum
    self.learning_rate = learning_rate
    self.grad_clip = grad_clip
    self.l2_norm = l2_norm

    # Best costs seen so far and the training history.
    self.best_train_cost = np.inf
    self.best_valid_cost = np.inf
    self.train_his = []

    P = Parameters()
    ctrl = controller.build(
        P, self.input_size, self.output_size,
        self.mem_size, self.mem_width, self.hidden_sizes)
    predict = model.build(
        P, self.mem_size, self.mem_width,
        self.hidden_sizes[-1], ctrl, self.num_heads)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    [M_curr, weights, output] = predict(input_seq)

    # Clamp predictions into (5e-6, 1 - 5e-6) so binary_crossentropy
    # never evaluates log(0).
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(
            5e-6 + (1 - 2 * 5e-6) * output, output_seq),
        axis=1)

    self.params = P.values()
    # Symbolic L2 penalty over all parameters.
    penalty = T.sum(0)
    for p in self.params:
        penalty = penalty + (p ** 2).sum()
    cost = T.sum(cross_entropy) + self.l2_norm * penalty

    # Element-wise gradient clipping to the configured range.
    grads = [T.clip(g, grad_clip[0], grad_clip[1])
             for g in T.grad(cost, wrt=self.params)]

    self.train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        updates=updates.rmsprop(self.params, grads,
                                momentum=self.momentum,
                                learning_rate=self.learning_rate))
    self.predict_cost = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost)
    self.predict = theano.function(
        inputs=[input_seq],
        outputs=[weights, output])
def __init__(self, input_size, output_size, mem_size, mem_width,
             hidden_sizes, num_heads, max_epochs, momentum, learning_rate,
             grad_clip, l2_norm):
    """Build the NTM computation graph and compile its theano functions.

    grad_clip is a (low, high) pair used to clip each gradient
    element-wise; l2_norm scales the L2 weight-decay term in the cost.
    """
    # Store configuration on the instance.
    self.input_size = input_size
    self.output_size = output_size
    self.mem_size = mem_size
    self.mem_width = mem_width
    self.hidden_sizes = hidden_sizes
    self.num_heads = num_heads
    self.max_epochs = max_epochs
    self.momentum = momentum
    self.learning_rate = learning_rate
    self.grad_clip = grad_clip
    self.l2_norm = l2_norm

    # Tracking state updated as training progresses.
    self.best_train_cost = np.inf
    self.best_valid_cost = np.inf
    self.train_his = []

    P = Parameters()
    ctrl = controller.build(P, self.input_size, self.output_size,
                            self.mem_size, self.mem_width,
                            self.hidden_sizes)
    predict = model.build(P, self.mem_size, self.mem_width,
                          self.hidden_sizes[-1], ctrl, self.num_heads)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    [M_curr, weights, output] = predict(input_seq)

    # Keep predictions strictly inside (0, 1) before the cross-entropy.
    clamped = 5e-6 + (1 - 2 * 5e-6) * output
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(clamped, output_seq), axis=1)

    self.params = P.values()
    # Accumulate the L2 penalty symbolically over every parameter.
    reg = T.sum(0)
    for p in self.params:
        reg = reg + (p ** 2).sum()
    cost = T.sum(cross_entropy) + self.l2_norm * reg

    # Clip each gradient element into [grad_clip[0], grad_clip[1]].
    grads = [T.clip(g, grad_clip[0], grad_clip[1])
             for g in T.grad(cost, wrt=self.params)]

    self.train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        updates=updates.rmsprop(self.params, grads,
                                momentum=self.momentum,
                                learning_rate=self.learning_rate))
    self.predict_cost = theano.function(
        inputs=[input_seq, output_seq], outputs=cost)
    self.predict = theano.function(
        inputs=[input_seq], outputs=[weights, output])