def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Compile an adadelta training function for the NTM.

    input_size/output_size -- external input/output dimensions.
    mem_size/mem_width     -- memory matrix dimensions.
    hidden_sizes           -- controller hidden-layer sizes (default [100]).

    Returns (P, train) where P holds the parameters and train(input_seq,
    output_seq) performs one update and returns the summed cross-entropy.
    """
    # Avoid the mutable-default-argument pitfall; default to one 100-unit layer.
    if hidden_sizes is None:
        hidden_sizes = [100]
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    # Squash predictions into (5e-6, 1 - 5e-6) so the log in the
    # cross-entropy never sees an exact 0 or 1.
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(
            5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq),
        axis=1)
    cost = T.sum(cross_entropy)  # + 1e-3 * l2

    params = P.values()
    # Element-wise gradient clipping for stability.
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,  # same graph as T.sum(cross_entropy)
        updates=updates.adadelta(params, grads))
    return P, train
def make_train(input_size, output_size, mem_size, mem_width, hidden_size=100):
    """Compile an adadelta training function for the NTM.

    The controller maps the external input plus the memory read to the
    network output and its hidden state; the model wraps it into a
    sequence-to-sequence predictor. Returns (P, train).
    """
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_size)
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)

    in_seq = T.matrix('input_sequence')
    tgt_seq = T.matrix('output_sequence')
    M, weights, predicted = predict(in_seq)

    # Keep predictions strictly inside (0, 1) before the log.
    per_step_xent = T.sum(
        T.nnet.binary_crossentropy(5e-6 + (1 - 2 * 5e-6) * predicted, tgt_seq),
        axis=1)

    # Cost = summed cross-entropy plus an L2 penalty over all parameters.
    params = P.values()
    weight_norm = T.sum(0)
    for p in params:
        weight_norm = weight_norm + (p ** 2).sum()
    cost = T.sum(per_step_xent) + 1e-3 * weight_norm

    # Element-wise gradient clipping for stability.
    clipped = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]
    train = theano.function(inputs=[in_seq, tgt_seq],
                            outputs=cost,
                            updates=updates.adadelta(params, clipped))
    return P, train
def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Compile an rmsprop training function for the NTM (L2-regularized).

    Returns (P, train); train(input_seq, output_seq) performs one rmsprop
    step and returns the regularized cost.
    """
    # Avoid the mutable-default-argument pitfall; default to one 100-unit layer.
    if hidden_sizes is None:
        hidden_sizes = [100]
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    # Squash predictions away from exact 0/1 so the log stays finite.
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(
            5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq),
        axis=1)

    params = P.values()
    # L2 penalty over every parameter tensor.
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cross_entropy) + 1e-4 * l2

    grads = [T.clip(g, -10, 10) for g in T.grad(cost, wrt=params)]
    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        # updates=updates.adadelta(params,grads)
        updates=updates.rmsprop(params, grads, learning_rate=1e-5))
    return P, train
def make_train(input_size, output_size, mem_size, mem_width, hidden_size=100):
    """Build the NTM graph and compile its adadelta training function.

    Returns (P, train).
    """
    P = Parameters()
    # Controller: external input + memory read -> output, hidden state.
    controller_net = controller.build(P, input_size, output_size,
                                      mem_size, mem_width, hidden_size)
    # Full model: input sequence -> memory states, head weights, prediction.
    predict = model.build(P, mem_size, mem_width, hidden_size, controller_net)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    [M, weights, output_seq_pred] = predict(input_seq)

    squashed = 5e-6 + (1 - 2 * 5e-6) * output_seq_pred  # keep log() finite
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(squashed, output_seq), axis=1)

    params = P.values()
    l2 = T.sum(0)
    for p in params:
        l2 += (p ** 2).sum()
    cost = T.sum(cross_entropy) + 1e-3 * l2

    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]
    train = theano.function(inputs=[input_seq, output_seq],
                            outputs=cost,
                            updates=updates.adadelta(params, grads))
    return P, train
def prepare_functions(input_size, hidden_size, latent_size, step_count, batch_size, train_X, valid_X): P = Parameters() encode_decode = model.build(P, input_size=input_size, hidden_size=hidden_size, latent_size=latent_size) P.W_decoder_input_0.set_value(P.W_decoder_input_0.get_value() * 10) X = T.matrix('X') step_count = 10 parameters = P.values() cost_symbs = [] for s in xrange(step_count): Z_means, Z_stds, alphas, \ X_mean, log_pi_samples = encode_decode(X, step_count=s + 1) batch_recon_loss, log_p = model.recon_loss(X, X_mean, log_pi_samples) recon_loss = T.mean(batch_recon_loss, axis=0) reg_loss = T.mean(model.reg_loss(Z_means, Z_stds, alphas), axis=0) vlb = recon_loss + reg_loss corr = T.mean(T.eq(T.argmax(log_p, axis=0), T.argmax(log_pi_samples, axis=0)), axis=0) cost = cost_symbs.append(vlb) avg_cost = sum(cost_symbs) / step_count cost = avg_cost + 1e-3 * sum(T.sum(T.sqr(w)) for w in parameters) gradients = updates.clip_deltas(T.grad(cost, wrt=parameters), 5) print "Updated parameters:" pprint(parameters) idx = T.iscalar('idx') train = theano.function( inputs=[idx], outputs=[ vlb, recon_loss, reg_loss, T.max(T.argmax(log_pi_samples, axis=0)), corr ], updates=updates.adam(parameters, gradients, learning_rate=1e-4), givens={X: train_X[idx * batch_size:(idx + 1) * batch_size]}) validate = theano.function(inputs=[], outputs=vlb, givens={X: valid_X}) sample = theano.function(inputs=[], outputs=[ X, X_mean, T.argmax(log_pi_samples, axis=0), T.exp(log_pi_samples) ], givens={X: valid_X[:10]}) return train, validate, sample
def make_functions( input_size, output_size, mem_size, mem_width, hidden_sizes=[100]): start_time = time.time() input_seqs = T.btensor3('input_sequences') output_seqs = T.btensor3('output_sequences') P = Parameters() process = model.build(P, input_size, output_size, mem_size, mem_width, hidden_sizes[0]) outputs = process(T.cast(input_seqs,'float32')) output_length = (input_seqs.shape[1] - 2) // 2 Y = output_seqs[:,-output_length:,:-2] Y_hat = T.nnet.sigmoid(outputs[:,-output_length:,:-2]) cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat,Y)) bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2) params = P.values() cost = cross_entropy # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params) print "Computing gradients", grads = T.grad(cost, wrt=params) grads = updates.clip_deltas(grads, np.float32(clip_length)) print "Done. (%0.3f s)"%(time.time() - start_time) start_time = time.time() print "Compiling function", P_learn = Parameters() update_pairs = updates.rmsprop( params, grads, learning_rate=1e-4, P=P_learn ) train = theano.function( inputs=[input_seqs, output_seqs], outputs=cross_entropy, updates=update_pairs, ) test = theano.function( inputs=[input_seqs, output_seqs], outputs=bits_loss ) print "Done. (%0.3f s)"%(time.time() - start_time) print P.parameter_count() return P, P_learn, train, test
def make_functions(input_size, output_size, mem_size, mem_width, hidden_sizes=[100]): start_time = time.time() input_seqs = T.btensor3('input_sequences') output_seqs = T.btensor3('output_sequences') P = Parameters() process = model.build(P, input_size, output_size, mem_size, mem_width, hidden_sizes[0]) outputs = process(T.cast(input_seqs, 'float32')) output_length = (input_seqs.shape[1] - 2) // 2 Y = output_seqs[:, -output_length:, :-2] Y_hat = T.nnet.sigmoid(outputs[:, -output_length:, :-2]) cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat, Y)) bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2) params = P.values() cost = cross_entropy # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params) print "Computing gradients", grads = T.grad(cost, wrt=params) grads = updates.clip_deltas(grads, np.float32(clip_length)) print "Done. (%0.3f s)" % (time.time() - start_time) start_time = time.time() print "Compiling function", P_learn = Parameters() update_pairs = updates.rmsprop(params, grads, learning_rate=1e-4, P=P_learn) train = theano.function( inputs=[input_seqs, output_seqs], outputs=cross_entropy, updates=update_pairs, ) test = theano.function(inputs=[input_seqs, output_seqs], outputs=bits_loss) print "Done. (%0.3f s)" % (time.time() - start_time) print P.parameter_count() return P, P_learn, train, test
def make_train_functions():
    """Build gradient-accumulation training functions for the stack RNN.

    Returns (acc, update, test):
      acc    -- accumulates gradients (and a call count) for one (X, Y) pair,
      update -- applies the averaged, clipped gradients, then clears them,
      test   -- greedy predictions for the second half of the input.
    """
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}
    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )
    output = predict(X, aux=aux)
    # NLL of the target symbols; (128 + 1 + Y) % (128 + 1) folds the
    # end-of-sequence marker into the valid index range.
    error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))])
    # Only the response half of the sequence is scored.
    error = error[-(Y.shape[0] / 2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error), wrt=parameters)

    shapes = [p.get_value().shape for p in parameters]
    count = theano.shared(np.float32(0))
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]
    acc_update = [(a, a + g) for a, g in zip(acc_grads, gradients)] + \
                 [(count, count + np.float32(1))]
    # (fix) clear the counter with float32 so the shared variable keeps
    # its dtype (it was reset with np.int32(0)).
    acc_clear = [(a, np.float32(0) * a) for a in acc_grads] + \
                [(count, np.float32(0))]
    # (bug fix) the original built (g / count) and then immediately
    # overwrote it with clip(g, 1) over the *unaveraged* accumulators;
    # average first, then clip.
    avg_grads = [clip(g / count, 1) for g in acc_grads]

    acc = theano.function(
        inputs=[X, Y],
        outputs=T.mean(error),
        updates=acc_update,
    )
    update = theano.function(
        inputs=[],
        updates=updates.adadelta(parameters, avg_grads, learning_rate=1e-8)
        + acc_clear
    )
    test = theano.function(
        inputs=[X],
        outputs=T.argmax(output, axis=1)[-(X.shape[0] / 2):],
    )
    return acc, update, test
def build_network(input_size, hidden_size, constraint_adj=False):
    """Build a tied-weight autoencoder graph.

    The decoder reuses the encoder weight matrix transposed. Returns
    (X, output, cost, P).
    """
    P = Parameters()
    X = T.bmatrix('X')
    P.W_input_hidden = U.initial_weights(input_size, hidden_size)
    P.b_hidden = U.initial_weights(hidden_size)
    P.b_output = U.initial_weights(input_size)

    hidden_lin = T.dot(X, P.W_input_hidden) + P.b_hidden
    hidden = T.nnet.sigmoid(hidden_lin)
    # Tied weights: decode with the encoder matrix transposed.
    output = T.nnet.softmax(T.dot(hidden, P.W_input_hidden.T) + P.b_output)

    # (fix) dropped the unused `parameters = P.values()` local.
    cost = build_error(X, output, P)
    if constraint_adj:
        # Adjacency constraint currently disabled.
        pass
        #cost = cost + adjacency_constraint(hidden_lin)
    return X, output, cost, P
def build_network(input_size, hidden_size, constraint_adj=False):
    """Construct the tied-weight autoencoder graph.

    Returns (X, output, cost, P).
    """
    P = Parameters()
    X = T.bmatrix('X')
    P.W_input_hidden = U.initial_weights(input_size, hidden_size)
    P.b_hidden = U.initial_weights(hidden_size)
    P.b_output = U.initial_weights(input_size)

    pre_hidden = T.dot(X, P.W_input_hidden) + P.b_hidden
    hidden = T.nnet.sigmoid(pre_hidden)
    # Decoder shares the encoder weights, transposed.
    recon = T.nnet.softmax(T.dot(hidden, P.W_input_hidden.T) + P.b_output)

    parameters = P.values()
    cost = build_error(X, recon, P)
    if constraint_adj:
        pass
        #cost = cost + adjacency_constraint(pre_hidden)
    return X, recon, cost, P
def make_train_functions():
    """Build accumulate/update/test functions for the stack-RNN model.

    acc accumulates gradients across calls; update applies the averaged,
    clipped gradients with adadelta and resets the accumulators; test
    returns the argmax predictions for the second half of the input.
    """
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}
    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )
    output = predict(X, aux=aux)
    # Negative log-likelihood of the targets; the modulus maps the
    # end-of-sequence marker into the valid index range.
    error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))])
    # Score only the response half of the sequence.
    error = error[-(Y.shape[0] / 2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error), wrt=parameters)

    shapes = [p.get_value().shape for p in parameters]
    count = theano.shared(np.float32(0))
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]
    acc_update = [(a, a + g) for a, g in zip(acc_grads, gradients)] + \
                 [(count, count + np.float32(1))]
    # (fix) reset the counter with float32 to match its dtype.
    acc_clear = [(a, np.float32(0) * a) for a in acc_grads] + \
                [(count, np.float32(0))]
    # (bug fix) the original computed (g / count) and then overwrote it by
    # clipping the raw accumulated gradients; average, then clip.
    avg_grads = [clip(g / count, 1) for g in acc_grads]

    acc = theano.function(
        inputs=[X, Y],
        outputs=T.mean(error),
        updates=acc_update,
    )
    update = theano.function(
        inputs=[],
        updates=updates.adadelta(parameters, avg_grads, learning_rate=1e-8)
        + acc_clear)
    test = theano.function(
        inputs=[X],
        outputs=T.argmax(output, axis=1)[-(X.shape[0] / 2):],
    )
    return acc, update, test
def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Compile an adadelta training function scoring only the response half.

    Returns (P, train); train returns the mean per-step cross-entropy over
    the last half of the sequence.
    """
    # Avoid the mutable-default-argument pitfall; default to one 100-unit layer.
    if hidden_sizes is None:
        hidden_sizes = [100]
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    # Squash predictions away from exact 0/1 so the log stays finite.
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(
            5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq),
        axis=1)
    cost = T.sum(cross_entropy)  # + 1e-3 * l2

    params = P.values()
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    # The second half of the input sequence is the response window.
    response_length = input_seq.shape[0] / 2
    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=T.mean(cross_entropy[-response_length:]),
        updates=updates.adadelta(params, grads))
    return P, train
vocab_size = vocab_size, output_size = vocab_size, map_fun_size = 128, evidence_count = evidence_count ) output_evds,output_ans = attention(story,idxs,qstn) cross_entropy = -T.log(output_ans[ans_lbl]) \ + -T.log(output_evds[0][ans_evds[0]]) \ + -T.log(output_evds[1][ans_evds[1]]) #cost += -T.log(ordered_probs(output_evds,ans_e.vds)) print "Done." print "Parameter count:", P.parameter_count() print "Calculating gradient expression...", params = P.values() cost = cross_entropy grads = T.grad(cost,wrt=params) print "Done." inputs = [story,idxs,qstn,ans_lbl,ans_evds] outputs = cross_entropy pickle.dump( (inputs,outputs,params,grads), open("compute_tree.pkl","wb"),2 ) print "Compiling native...", lr = T.fscalar('lr') acc,update = make_functions(inputs,outputs,params,grads,lr) test = theano.function(
def __init__(self, input_size, output_size, mem_size, mem_width, hidden_sizes,
             num_heads, max_epochs, momentum, learning_rate, grad_clip,
             l2_norm):
    """Build the NTM graph and compile train/predict functions.

    input_size / output_size -- external input/output dimensions.
    mem_size / mem_width     -- memory matrix dimensions (slots x width).
    hidden_sizes             -- controller hidden-layer sizes (list).
    num_heads                -- number of heads passed to the model.
    max_epochs, momentum, learning_rate -- optimization hyper-parameters.
    grad_clip                -- (low, high) element-wise gradient bounds.
    l2_norm                  -- weight of the L2 penalty on all parameters.

    Compiles self.train, self.predict_cost and self.predict.
    """
    # Store hyper-parameters.
    self.input_size = input_size
    self.output_size = output_size
    self.mem_size = mem_size
    self.mem_width = mem_width
    self.hidden_sizes = hidden_sizes
    self.num_heads = num_heads
    self.max_epochs = max_epochs
    self.momentum = momentum
    self.learning_rate = learning_rate
    self.grad_clip = grad_clip
    self.l2_norm = l2_norm
    # Best costs seen so far and the training history.
    self.best_train_cost = np.inf
    self.best_valid_cost = np.inf
    #self.train = None
    #self.cost = None
    self.train_his = []

    P = Parameters()
    # Controller network and the full prediction graph.
    ctrl = controller.build(P, self.input_size, self.output_size,
                            self.mem_size, self.mem_width, self.hidden_sizes)
    predict = model.build(P, self.mem_size, self.mem_width,
                          self.hidden_sizes[-1], ctrl, self.num_heads)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    [M_curr, weights, output] = predict(input_seq)
    # output_seq_pred = seqs[-1]

    # Squash predictions away from exact 0/1 so the log stays finite.
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output, output_seq), axis=1)

    self.params = P.values()
    # L2 penalty over every parameter tensor.
    l2 = T.sum(0)
    for p in self.params:
        l2 = l2 + (p**2).sum()
    cost = T.sum(cross_entropy) + self.l2_norm * l2
    # cost = T.sum(cross_entropy) + 1e-3*l2

    # Element-wise gradient clipping to (grad_clip[0], grad_clip[1]).
    grads = [
        T.clip(g, grad_clip[0], grad_clip[1])
        for g in T.grad(cost, wrt=self.params)
    ]
    # grads = [ T.clip(g,-100,100) for g in T.grad(cost,wrt=params) ]
    # grads = [ T.clip(g,1e-9, 0.2) for g in T.grad(cost,wrt=params) ]

    self.train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        # updates=updates.adadelta(params,grads)
        updates=updates.rmsprop(self.params, grads,
                                momentum=self.momentum,
                                learning_rate=self.learning_rate))
    self.predict_cost = theano.function(inputs=[input_seq, output_seq],
                                        outputs=cost)
    self.predict = theano.function(inputs=[input_seq],
                                   outputs=[weights, output])
return T.nnet.categorical_crossentropy(outputs,Y) if __name__ == "__main__": config.parse_args() total_frames = sum(x.shape[0] for x,_ in frame_label_data.training_stream()) logging.info("Total frames: %d"%total_frames) P = Parameters() predict = model.build(P) X = T.matrix('X') Y = T.ivector('Y') _,outputs = predict(X) cross_entropy = T.mean(crossentropy(outputs,Y)) parameters = P.values() loss = cross_entropy + \ (0.5/total_frames) * sum(T.sum(T.sqr(w)) for w in parameters) gradients = T.grad(loss,wrt=parameters) logging.info("Parameters to tune:" + ', '.join(sorted(w.name for w in parameters))) update_vars = Parameters() logging.debug("Compiling functions...") chunk_trainer = chunk.build_trainer( inputs=[X,Y], updates = build_updates(parameters,gradients,update_vars) ) validate = validator.build( inputs=[X,Y],
# TODO: fix these magic numbers (especially the 800) def f(X): layer0 = X.reshape((X.shape[0], 1, 28, 28)) layer1 = _build_conv_pool(P, 1, layer0, 20, 1, 5, 2) layer2_= _build_conv_pool(P, 2, layer1, 50, 20, 5, 2) layer2 = layer2_.flatten(2) output = T.nnet.softmax(T.dot(layer2, P.W_hidden_output) + P.b_output) return output return f def cost(P, Y_hat, Y, l2 = 0): return (T.mean(T.nnet.categorical_crossentropy(Y_hat, Y)) + l2 * sum(T.mean(p**2) for p in P.values())) if __name__ == "__main__": import datasets x,y = datasets.mnist() x,y = x[0:1000],y[0:1000] P = Parameters() X = T.matrix('X') Y = T.ivector('Y') net = build(P, 784, 800, 10) Y_hat = net(X) f = theano.function(inputs = [X], outputs = Y_hat) J = cost(P, Y_hat, Y) grad = T.grad(J, wrt=P.values())
def __init__(self, input_size, output_size, mem_size, mem_width, hidden_sizes,
             num_heads, max_epochs, momentum, learning_rate, grad_clip,
             l2_norm):
    """Construct the NTM training graph and compile its functions.

    Stores all hyper-parameters on the instance, builds the controller and
    prediction graph, and compiles:
      self.train        -- one rmsprop step, returns the regularized cost,
      self.predict_cost -- cost without updates,
      self.predict      -- head weights and raw output for an input sequence.
    grad_clip is a (low, high) pair of element-wise gradient bounds;
    l2_norm scales the L2 penalty over all parameters.
    """
    self.input_size = input_size
    self.output_size = output_size
    self.mem_size = mem_size
    self.mem_width = mem_width
    self.hidden_sizes = hidden_sizes
    self.num_heads = num_heads
    self.max_epochs = max_epochs
    self.momentum = momentum
    self.learning_rate = learning_rate
    self.grad_clip = grad_clip
    self.l2_norm = l2_norm
    # Track the best costs observed and the per-epoch history.
    self.best_train_cost = np.inf
    self.best_valid_cost = np.inf
    #self.train = None
    #self.cost = None
    self.train_his = []

    P = Parameters()
    ctrl = controller.build(
        P, self.input_size, self.output_size,
        self.mem_size, self.mem_width, self.hidden_sizes)
    predict = model.build(
        P, self.mem_size, self.mem_width,
        self.hidden_sizes[-1], ctrl, self.num_heads)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    [M_curr, weights, output] = predict(input_seq)
    # output_seq_pred = seqs[-1]

    # Clamp predictions to (5e-6, 1 - 5e-6) before the cross-entropy log.
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output, output_seq), axis=1)

    self.params = P.values()
    l2 = T.sum(0)
    for p in self.params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cross_entropy) + self.l2_norm * l2
    # cost = T.sum(cross_entropy) + 1e-3*l2

    grads = [
        T.clip(g, grad_clip[0], grad_clip[1])
        for g in T.grad(cost, wrt=self.params)
    ]
    # grads = [ T.clip(g,-100,100) for g in T.grad(cost,wrt=params) ]
    # grads = [ T.clip(g,1e-9, 0.2) for g in T.grad(cost,wrt=params) ]

    self.train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        # updates=updates.adadelta(params,grads)
        updates=updates.rmsprop(self.params, grads,
                                momentum=self.momentum,
                                learning_rate=self.learning_rate))
    self.predict_cost = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost)
    self.predict = theano.function(
        inputs=[input_seq],
        outputs=[weights, output])
def make_train(image_size, word_size, first_hidden_size, proj_size, reg_lambda):
    """Build training/validation/test functions for image-triplet ranking.

    An image projection is scored against encoded (entity, relation,
    entity) triplets with a margin ranking loss plus an L2 penalty.
    "Default" variants replace two of the three triplet slots with learned
    default vectors (P_default['left'/'relation'/'right']).

    Returns (P, P_default, train, valid, test, train_default, test_default).
    """
    # -- model ------------------------------------------------------------
    P = Parameters()
    image_projecting = image_project.build(P, image_size, proj_size)
    batched_triplet_encoding, vector_triplet_encoding = triplet_encoding.build(
        P, word_size, first_hidden_size, proj_size)

    image_vector = T.vector()

    # -- training graph ---------------------------------------------------
    # One correct triplet [E, R, E] versus a batch of negative triplets.
    correct_triplet = [T.vector(dtype='float32'),
                       T.vector(dtype='float32'),
                       T.vector(dtype='float32')]  # [E,R,E]
    negative_triplet = [T.matrix(dtype='float32'),
                        T.matrix(dtype='float32'),
                        T.matrix(dtype='float32')]

    image_projection_vector = image_projecting(image_vector)
    # Broadcast the projected image across the negative batch.
    image_projection_matrix = repeat(
        image_projection_vector.dimshuffle(('x', 0)),
        negative_triplet[0].shape[0], axis=0)

    correct_triplet_encoding_vector = vector_triplet_encoding(
        correct_triplet[0], correct_triplet[1], correct_triplet[2])
    negative_triplet_encoding_matrix = batched_triplet_encoding(
        negative_triplet[0], negative_triplet[1], negative_triplet[2])

    correct_cross_dot_scalar = T.dot(image_projection_vector,
                                     correct_triplet_encoding_vector)
    negative_cross_dot_vector = T.batched_dot(
        image_projection_matrix, negative_triplet_encoding_matrix)

    # Margin ranking loss: max(0, 1 - s(correct) + s(negative)).
    zero_cost = T.zeros_like(negative_cross_dot_vector)
    margin_cost = 1 - correct_cross_dot_scalar + negative_cross_dot_vector
    cost_vector = T.switch(T.gt(zero_cost, margin_cost),
                           zero_cost, margin_cost)

    # L2 regularizer over all model parameters.
    params = P.values()
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    cost = (T.sum(cost_vector) / T.shape(negative_triplet[0])[0]
            + reg_lambda * l2)
    # assume word vector has been put into P  (unsolved)
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    lr = T.scalar(name='learning rate', dtype='float32')
    train = theano.function(
        inputs=[image_vector,
                correct_triplet[0], correct_triplet[1], correct_triplet[2],
                negative_triplet[0], negative_triplet[1], negative_triplet[2],
                lr],
        outputs=cost,
        updates=updates.rmsprop(params, grads, learning_rate=lr),
        allow_input_downcast=True)

    # -- validation: same cost, no updates --------------------------------
    valid = theano.function(
        inputs=[image_vector,
                correct_triplet[0], correct_triplet[1], correct_triplet[2],
                negative_triplet[0], negative_triplet[1], negative_triplet[2]],
        outputs=cost,
        allow_input_downcast=True)

    # -- testing: score the image against every candidate triplet ---------
    all_triplet = [T.matrix(dtype='float32'),
                   T.matrix(dtype='float32'),
                   T.matrix(dtype='float32')]
    image_projection_matrix_test = repeat(
        image_projection_vector.dimshuffle(('x', 0)),
        all_triplet[0].shape[0], axis=0)
    all_triplet_encoding_matrix = batched_triplet_encoding(
        all_triplet[0], all_triplet[1], all_triplet[2])
    all_cross_dot_vector = T.batched_dot(image_projection_matrix_test,
                                         all_triplet_encoding_matrix)
    test = theano.function(
        inputs=[image_vector, all_triplet[0], all_triplet[1], all_triplet[2]],
        outputs=all_cross_dot_vector,
        allow_input_downcast=True)

    # -- "default" model: fill two slots with learned default vectors -----
    P_default = Parameters()
    P_default['left'] = 2 * (np.random.rand(word_size) - 0.5)
    P_default['right'] = 2 * (np.random.rand(word_size) - 0.5)
    P_default['relation'] = 2 * (np.random.rand(word_size) - 0.5)

    correct_triplet_d = [T.vector(dtype='float32'),
                         T.vector(dtype='float32'),
                         T.vector(dtype='float32')]  # [E,R,E]
    negative_triplet_d = [T.matrix(dtype='float32'),
                          T.matrix(dtype='float32'),
                          T.matrix(dtype='float32')]
    correct_triplet_d_train = [correct_triplet_d, correct_triplet_d,
                               correct_triplet_d]
    negative_triplet_d_train = [negative_triplet_d, negative_triplet_d,
                                negative_triplet_d]

    cost = 0
    for i in range(3):
        # Slot i keeps its real input; the other two use the defaults.
        if i == 0:
            correct_triplet_d_train[0] = [
                correct_triplet_d[0],
                P_default['relation'],
                P_default['right']]
            negative_triplet_d_train[0] = [
                negative_triplet_d[0],
                repeat(P_default['relation'].dimshuffle(('x', 0)),
                       negative_triplet_d[0].shape[0], axis=0),
                repeat(P_default['right'].dimshuffle(('x', 0)),
                       negative_triplet_d[0].shape[0], axis=0)]
        elif i == 1:
            correct_triplet_d_train[1] = [
                P_default['left'],
                correct_triplet_d[1],
                P_default['right']]
            negative_triplet_d_train[1] = [
                repeat(P_default['left'].dimshuffle(('x', 0)),
                       negative_triplet_d[1].shape[0], axis=0),
                negative_triplet_d[1],
                repeat(P_default['right'].dimshuffle(('x', 0)),
                       negative_triplet_d[1].shape[0], axis=0)]
        elif i == 2:
            correct_triplet_d_train[2] = [
                P_default['left'],
                P_default['relation'],
                correct_triplet_d[2]]
            negative_triplet_d_train[2] = [
                repeat(P_default['left'].dimshuffle(('x', 0)),
                       negative_triplet_d[2].shape[0], axis=0),
                repeat(P_default['relation'].dimshuffle(('x', 0)),
                       negative_triplet_d[2].shape[0], axis=0),
                negative_triplet_d[2]]

        image_projection_matrix_d = repeat(
            image_projection_vector.dimshuffle(('x', 0)),
            negative_triplet_d[i].shape[0], axis=0)
        correct_triplet_encoding_vector_d = vector_triplet_encoding(
            correct_triplet_d_train[i][0],
            correct_triplet_d_train[i][1],
            correct_triplet_d_train[i][2])
        negative_triplet_encoding_matrix_d = batched_triplet_encoding(
            negative_triplet_d_train[i][0],
            negative_triplet_d_train[i][1],
            negative_triplet_d_train[i][2])
        correct_cross_dot_scalar_d = T.dot(
            image_projection_vector, correct_triplet_encoding_vector_d)
        negative_cross_dot_vector_d = T.batched_dot(
            image_projection_matrix_d, negative_triplet_encoding_matrix_d)

        # Margin cost, as above.
        zero_cost_d = T.zeros_like(negative_cross_dot_vector_d)
        margin_cost_d = (1 - correct_cross_dot_scalar_d
                         + negative_cross_dot_vector_d)
        cost_vector_d = T.switch(T.gt(zero_cost_d, margin_cost_d),
                                 zero_cost_d, margin_cost_d)
        # (bug fix) normalize by the *default* negative batch; the original
        # referenced negative_triplet[i], an input train_default never
        # receives (MissingInputError at compile time).
        cost = cost + T.sum(cost_vector_d) / T.shape(negative_triplet_d[i])[0]

    params_d = P_default.values()
    l2 = T.sum(0)
    for p in params_d:
        l2 = l2 + (p ** 2).sum()
    cost = cost + 0.01 * l2
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params_d)]
    train_default = theano.function(
        inputs=[image_vector,
                correct_triplet_d[0], correct_triplet_d[1],
                correct_triplet_d[2],
                negative_triplet_d[0], negative_triplet_d[1],
                negative_triplet_d[2],
                lr],
        outputs=cost,
        updates=updates.rmsprop(params_d, grads, learning_rate=lr),
        allow_input_downcast=True)

    # -- default-model testing --------------------------------------------
    all_triplet_d = [T.matrix(dtype='float32'),
                     T.matrix(dtype='float32'),
                     T.matrix(dtype='float32')]
    all_triplet_d_test = [all_triplet_d, all_triplet_d, all_triplet_d]
    result = [[], [], []]
    for i in range(3):
        # (bug fix) the batch size comes from the default test triplets;
        # the original used all_triplet[i], an input test_default never
        # receives.
        image_projection_matrix_test_d = repeat(
            image_projection_vector.dimshuffle(('x', 0)),
            all_triplet_d[i].shape[0], axis=0)
        if i == 0:
            all_triplet_d_test[0] = [
                all_triplet_d[0],
                repeat(P_default['relation'].dimshuffle(('x', 0)),
                       all_triplet_d[0].shape[0], axis=0),
                repeat(P_default['right'].dimshuffle(('x', 0)),
                       all_triplet_d[0].shape[0], axis=0)]
        elif i == 1:
            all_triplet_d_test[1] = [
                repeat(P_default['left'].dimshuffle(('x', 0)),
                       all_triplet_d[1].shape[0], axis=0),
                all_triplet_d[1],
                repeat(P_default['right'].dimshuffle(('x', 0)),
                       all_triplet_d[1].shape[0], axis=0)]
        elif i == 2:
            all_triplet_d_test[2] = [
                repeat(P_default['left'].dimshuffle(('x', 0)),
                       all_triplet_d[2].shape[0], axis=0),
                repeat(P_default['relation'].dimshuffle(('x', 0)),
                       all_triplet_d[2].shape[0], axis=0),
                all_triplet_d[2]]
        all_triplet_encoding_matrix_d = batched_triplet_encoding(
            all_triplet_d_test[i][0],
            all_triplet_d_test[i][1],
            all_triplet_d_test[i][2])
        result[i] = T.batched_dot(image_projection_matrix_test_d,
                                  all_triplet_encoding_matrix_d)
    test_default = theano.function(
        inputs=[image_vector,
                all_triplet_d[0], all_triplet_d[1], all_triplet_d[2]],
        outputs=result,
        allow_input_downcast=True)

    return P, P_default, train, valid, test, train_default, test_default
def label_seq(string): idxs = font.indexify(string) return idxs if __name__ == "__main__": P = Parameters() X = T.matrix('X') Y = T.ivector('Y') predict = build_model(P, 8, 512, len(font.chars) + 1) probs = predict(X) alpha = 0.5 params = P.values() cost = ctc.cost(probs, Y) #+ 1e-8 * sum(T.sum(T.sqr(w)) for w in params) gradients = T.grad(cost, wrt=params) gradient_acc = [theano.shared(0 * p.get_value()) for p in params] counter = theano.shared(np.float32(0.)) acc = theano.function(inputs=[X, Y], outputs=cost, updates=[(a, a + g) for a, g in zip(gradient_acc, gradients)] + [(counter, counter + np.float32(1.))]) update = theano.function( inputs=[],outputs=[], updates = updates.momentum(params,[ g / counter for g in gradient_acc ]) \ + [ (a, np.float32(0) * a) for a in gradient_acc ] \ + [ (counter,np.float32(0.)) ]
def make_train(image_size, word_size, first_hidden_size, proj_size, reg_lambda):
    """Build train/valid/visualize/test functions for image-triplet ranking.

    The image is projected into the triplet-encoding space and scored
    against encoded (entity, relation, entity) triplets with a margin
    ranking loss plus an L2 penalty weighted by reg_lambda.

    Returns (P, train, valid, image_project_fun, test).
    """
    # initialize model
    P = Parameters()
    image_projecting = image_project.build(P, image_size, proj_size)
    batched_triplet_encoding, vector_triplet_encoding = triplet_encoding.build(
        P, word_size, first_hidden_size, proj_size)

    image_vector = T.vector()

    # training inputs: one correct [E, R, E] triplet and a batch of negatives
    correct_triplet = [T.vector(dtype='float32'),
                       T.vector(dtype='float32'),
                       T.vector(dtype='float32')]  # [E,R,E]
    negative_triplet = [T.matrix(dtype='float32'),
                        T.matrix(dtype='float32'),
                        T.matrix(dtype='float32')]

    image_projection_vector = image_projecting(image_vector)
    # Broadcast the projected image across the negative batch.
    image_projection_matrix = repeat(
        image_projection_vector.dimshuffle(('x', 0)),
        negative_triplet[0].shape[0], axis=0)

    correct_triplet_encoding_vector = vector_triplet_encoding(
        correct_triplet[0], correct_triplet[1], correct_triplet[2])
    negative_triplet_encoding_matrix = batched_triplet_encoding(
        negative_triplet[0], negative_triplet[1], negative_triplet[2])

    correct_cross_dot_scalar = T.dot(image_projection_vector,
                                     correct_triplet_encoding_vector)
    negative_cross_dot_vector = T.batched_dot(
        image_projection_matrix, negative_triplet_encoding_matrix)

    # margin cost: max(0, 1 - s(correct) + s(negative)) per negative
    zero_cost = T.zeros_like(negative_cross_dot_vector)
    margin_cost = 1 - correct_cross_dot_scalar + negative_cross_dot_vector
    cost_vector = T.switch(T.gt(zero_cost, margin_cost),
                           zero_cost, margin_cost)

    # regularizer cost
    params = P.values()
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cost_vector) / T.shape(negative_triplet[0])[0] + reg_lambda * l2
    # assume word vector has been put into P  (unsolved)
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    lr = T.scalar(name='learning rate', dtype='float32')
    train = theano.function(
        inputs=[image_vector,
                correct_triplet[0], correct_triplet[1], correct_triplet[2],
                negative_triplet[0], negative_triplet[1], negative_triplet[2],
                lr],
        outputs=cost,
        updates=updates.rmsprop(params, grads, learning_rate=lr),
        allow_input_downcast=True)

    # valid: same cost, no parameter updates
    valid = theano.function(
        inputs=[image_vector,
                correct_triplet[0], correct_triplet[1], correct_triplet[2],
                negative_triplet[0], negative_triplet[1], negative_triplet[2]],
        outputs=cost,
        allow_input_downcast=True)

    # visualize: expose the raw image projection
    image_project_fun = theano.function(
        inputs=[image_vector],
        outputs=image_projection_vector,
        allow_input_downcast=True)

    # testing: score the image against every candidate triplet
    all_triplet = [T.matrix(dtype='float32'),
                   T.matrix(dtype='float32'),
                   T.matrix(dtype='float32')]
    image_projection_matrix_test = repeat(
        image_projection_vector.dimshuffle(('x', 0)),
        all_triplet[0].shape[0], axis=0)
    all_triplet_encoding_matrix = batched_triplet_encoding(
        all_triplet[0], all_triplet[1], all_triplet[2])
    all_cross_dot_vector = T.batched_dot(image_projection_matrix_test,
                                         all_triplet_encoding_matrix)
    test = theano.function(
        inputs=[image_vector, all_triplet[0], all_triplet[1], all_triplet[2]],
        outputs=all_cross_dot_vector,
        allow_input_downcast=True)

    return P, train, valid, image_project_fun, test
from theano_toolkit.parameters import Parameters
from theano_toolkit import updates
from pprint import pprint

floatX = theano.config.floatX

print 'building model'
z0 = T.matrix('z0')
P = Parameters()
# 8-layer MADE-parameterized, weight-normalized IAF with 64 units per
# hidden layer, ELU nonlinearity, no conditioning bias.
iaf, masks = iaf_made_wn(P, L=8, num_units=64,
                         num_hids=1, nonl=T.nnet.elu,
                         cond_bias=False)
# zT: transformed samples; ss: presumably the summed log-determinant of
# the flow's Jacobian (so logq below is the flow density) — TODO confirm
# against iaf_made_wn's definition.
zT, ss = iaf(z0, cond_bias=None)
parameters = P.values()
pprint(parameters)

# Stochastic objective: E[log q(zT) - log p(zT)], where U is presumably
# the log of the (unnormalized) target density — confirm at U's definition.
logp = U(zT)
logq = - ss
losses = logq - logp
loss = losses.mean()

# Clip gradient deltas to norm 5, then adam with its state kept in P_train.
gradients = updates.clip_deltas(T.grad(loss, wrt=parameters), 5)
P_train = Parameters()
fupdates = updates.adam(parameters, gradients, learning_rate=1e-3, P=P_train)
train = theano.function([z0], [loss, logq.mean(), logp.mean()],
                        updates=fupdates)
samples = theano.function([z0], zT)