def make_train(input_size, output_size, mem_size, mem_width, hidden_size=100):
    """Create the model parameters and compile one training step.

    The controller built by ``controller.build`` is handed to ``model.build``,
    which returns the sequence predictor.  The compiled function takes the
    ``input_sequence`` and ``output_sequence`` matrices, applies the adadelta
    updates and returns the regularised cost.

    Returns:
        ``(P, train)``: the ``Parameters`` container and the compiled
        ``theano.function``.
    """
    P = Parameters()

    # ctrl takes an external and a read input and returns the network output
    # together with its hidden layer (per the original author's note).
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_size)
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    # predict yields three results; only the last one is used for training.
    _, _, output_seq_pred = predict(input_seq)

    # Squash predictions away from exactly 0 and 1 before taking the
    # cross-entropy, then sum over axis 1.
    squashed_pred = 5e-6 + (1 - 2 * 5e-6) * output_seq_pred
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(squashed_pred, output_seq),
        axis=1)

    params = P.values()
    # Sum of squared parameter values, accumulated starting from T.sum(0).
    l2 = sum(((p**2).sum() for p in params), T.sum(0))
    cost = T.sum(cross_entropy) + 1e-3 * l2

    # Element-wise gradient clipping to [-100, 100].
    raw_grads = T.grad(cost, wrt=params)
    grads = [T.clip(g, -100, 100) for g in raw_grads]

    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        updates=updates.adadelta(params, grads))
    return P, train
def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Create the model parameters and compile one adadelta training step.

    Args:
        input_size, output_size, mem_size, mem_width: forwarded unchanged to
            ``controller.build`` / ``model.build``.
        hidden_sizes: list of controller hidden-layer sizes; the last entry
            is also passed to ``model.build``.  Defaults to ``[100]``.

    Returns:
        ``(P, train)``: the ``Parameters`` container and a compiled
        ``theano.function`` that takes the ``input_sequence`` and
        ``output_sequence`` matrices, applies the updates and returns the
        summed cross-entropy.
    """
    # A fresh list per call: a list literal as the default would be shared
    # between calls (and with whatever the builders do with it).
    if hidden_sizes is None:
        hidden_sizes = [100]

    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    # Only the last sequence returned by predict is the predicted output.
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    # Squash predictions away from exactly 0 and 1 before the cross-entropy.
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq), axis=1)
    # NOTE: no L2 penalty is added here (a "+ 1e-3 * l2" term was disabled).
    cost = T.sum(cross_entropy)

    params = P.values()
    # Element-wise gradient clipping to [-100, 100].
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,  # same expression the gradients are taken of
        updates=updates.adadelta(params, grads))
    return P, train
def create_model(ids, vocab2id, size):
    """Build the symbolic next-word prediction model over ``ids``.

    ``size`` is used both as the word-vector size and as the hidden-state
    size.  The softmax scores for every position but the last are compared
    with the ids shifted by one position via ``word_cost``.

    Returns:
        ``(scores, cost, obv_cost, P)`` where ``cost`` is the first value
        returned by ``word_cost``, ``obv_cost`` the second, and ``P`` the
        ``Parameters`` container.
    """
    word_vector_size = size
    hidden_state_size = size

    P = Parameters()
    P.V = create_vocab_vectors(P, vocab2id, word_vector_size)

    # The prediction weights have the transposed shape of the vocabulary
    # matrix; the bias has one entry per row of it.  Both start at zero.
    vocab_shape = P.V.get_value().shape
    P.W_predict = np.zeros(vocab_shape).T
    P.b_predict = np.zeros((vocab_shape[0],))

    X = P.V[ids]
    step = build_lstm_step(P, word_vector_size, hidden_state_size)
    (states, _), _ = theano.scan(
        step,
        sequences=[X],
        outputs_info=[P.init_h, P.init_c])

    scores = T.nnet.softmax(T.dot(states, P.W_predict) + P.b_predict)

    log_likelihood, cross_ent = word_cost(scores[:-1], ids[1:])
    # NOTE: an L1 penalty (1e-4 * sum of |w|) was disabled by the author.
    cost = log_likelihood
    obv_cost = cross_ent
    return scores, cost, obv_cost, P
def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Create the model parameters and compile one rmsprop training step.

    Args:
        input_size, output_size, mem_size, mem_width: forwarded unchanged to
            ``controller.build`` / ``model.build``.
        hidden_sizes: list of controller hidden-layer sizes; the last entry
            is also passed to ``model.build``.  Defaults to ``[100]``.

    Returns:
        ``(P, train)``: the ``Parameters`` container and a compiled
        ``theano.function`` that takes the ``input_sequence`` and
        ``output_sequence`` matrices, applies the updates and returns the
        L2-regularised cost.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if hidden_sizes is None:
        hidden_sizes = [100]

    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    # Only the last sequence returned by predict is the predicted output.
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    # Squash predictions away from exactly 0 and 1 before the cross-entropy.
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(
            5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq),
        axis=1)

    params = P.values()
    # Sum of squared parameter values.
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cross_entropy) + 1e-4 * l2

    # Element-wise gradient clipping to [-10, 10].
    grads = [T.clip(g, -10, 10) for g in T.grad(cost, wrt=params)]

    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        # rmsprop is used here; adadelta was the previously tried alternative.
        updates=updates.rmsprop(params, grads, learning_rate=1e-5))
    return P, train
def make_train(input_size, output_size, mem_size, mem_width, hidden_size=100):
    """Build the parameters and a compiled adadelta training function.

    Returns ``(P, train)``.  ``train`` takes the ``input_sequence`` and
    ``output_sequence`` matrices, performs one update and returns the cost
    (summed binary cross-entropy plus ``1e-3`` times the sum of squared
    parameters).
    """
    P = Parameters()

    # Controller: takes an external and a read input, returns the network
    # output and its hidden layer (per the original author's note).  It is
    # plugged into the model that predicts an output sequence from an input
    # sequence.
    predict = model.build(
        P, mem_size, mem_width, hidden_size,
        controller.build(P, input_size, output_size,
                         mem_size, mem_width, hidden_size))

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    # Three results come back; the third is the predicted output sequence.
    _, _, output_seq_pred = predict(input_seq)

    # Keep predictions strictly inside (0, 1) before taking the log.
    bounded_pred = 5e-6 + (1 - 2 * 5e-6) * output_seq_pred
    per_step_xent = T.nnet.binary_crossentropy(bounded_pred, output_seq)
    cross_entropy = T.sum(per_step_xent, axis=1)

    params = P.values()
    # Squared-parameter penalty, accumulated starting from T.sum(0).
    l2 = sum(((p ** 2).sum() for p in params), T.sum(0))
    cost = T.sum(cross_entropy) + 1e-3 * l2

    # Clip every gradient element to [-100, 100].
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        updates=updates.adadelta(params, grads))
    return P, train
def prepare_functions(input_size, hidden_size, latent_size, step_count, batch_size, train_X, valid_X): P = Parameters() encode_decode = model.build(P, input_size=input_size, hidden_size=hidden_size, latent_size=latent_size) P.W_decoder_input_0.set_value(P.W_decoder_input_0.get_value() * 10) X = T.matrix('X') step_count = 10 parameters = P.values() cost_symbs = [] for s in xrange(step_count): Z_means, Z_stds, alphas, \ X_mean, log_pi_samples = encode_decode(X, step_count=s + 1) batch_recon_loss, log_p = model.recon_loss(X, X_mean, log_pi_samples) recon_loss = T.mean(batch_recon_loss, axis=0) reg_loss = T.mean(model.reg_loss(Z_means, Z_stds, alphas), axis=0) vlb = recon_loss + reg_loss corr = T.mean(T.eq(T.argmax(log_p, axis=0), T.argmax(log_pi_samples, axis=0)), axis=0) cost = cost_symbs.append(vlb) avg_cost = sum(cost_symbs) / step_count cost = avg_cost + 1e-3 * sum(T.sum(T.sqr(w)) for w in parameters) gradients = updates.clip_deltas(T.grad(cost, wrt=parameters), 5) print "Updated parameters:" pprint(parameters) idx = T.iscalar('idx') train = theano.function( inputs=[idx], outputs=[ vlb, recon_loss, reg_loss, T.max(T.argmax(log_pi_samples, axis=0)), corr ], updates=updates.adam(parameters, gradients, learning_rate=1e-4), givens={X: train_X[idx * batch_size:(idx + 1) * batch_size]}) validate = theano.function(inputs=[], outputs=vlb, givens={X: valid_X}) sample = theano.function(inputs=[], outputs=[ X, X_mean, T.argmax(log_pi_samples, axis=0), T.exp(log_pi_samples) ], givens={X: valid_X[:10]}) return train, validate, sample
def make_functions( input_size, output_size, mem_size, mem_width, hidden_sizes=[100]): start_time = time.time() input_seqs = T.btensor3('input_sequences') output_seqs = T.btensor3('output_sequences') P = Parameters() process = model.build(P, input_size, output_size, mem_size, mem_width, hidden_sizes[0]) outputs = process(T.cast(input_seqs,'float32')) output_length = (input_seqs.shape[1] - 2) // 2 Y = output_seqs[:,-output_length:,:-2] Y_hat = T.nnet.sigmoid(outputs[:,-output_length:,:-2]) cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat,Y)) bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2) params = P.values() cost = cross_entropy # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params) print "Computing gradients", grads = T.grad(cost, wrt=params) grads = updates.clip_deltas(grads, np.float32(clip_length)) print "Done. (%0.3f s)"%(time.time() - start_time) start_time = time.time() print "Compiling function", P_learn = Parameters() update_pairs = updates.rmsprop( params, grads, learning_rate=1e-4, P=P_learn ) train = theano.function( inputs=[input_seqs, output_seqs], outputs=cross_entropy, updates=update_pairs, ) test = theano.function( inputs=[input_seqs, output_seqs], outputs=bits_loss ) print "Done. (%0.3f s)"%(time.time() - start_time) print P.parameter_count() return P, P_learn, train, test
def make_train_functions():
    """Build the model and compile gradient-accumulation training functions.

    Returns:
        ``(acc, update, test)``:

        * ``acc(X, Y)`` adds the gradients of the summed error to the
          accumulators, increments the counter and returns the mean error.
        * ``update()`` applies an adadelta step using the clipped average of
          the accumulated gradients, then resets accumulators and counter.
        * ``test(X)`` returns the argmax predictions for the second half of
          the sequence.
    """
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}

    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )
    output = predict(X, aux=aux)

    # (129 + Y) % 129 leaves labels 0..128 unchanged and maps -1 to 128.
    error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))])
    # Only the second half of the sequence contributes to the error.
    error = error[-(Y.shape[0] / 2):]

    parameters = P.values()
    gradients = T.grad(T.sum(error), wrt=parameters)

    shapes = [p.get_value().shape for p in parameters]
    count = theano.shared(np.float32(0))
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]

    acc_update = [(a, a + g) for a, g in zip(acc_grads, gradients)] + \
                 [(count, count + np.float32(1))]
    acc_clear = [(a, np.float32(0) * a) for a in acc_grads] + \
                [(count, np.int32(0))]

    # Clip the *averaged* gradients.  Previously the average was computed and
    # then thrown away because the clip was applied to the raw accumulators.
    # The maximum keeps update() harmless when nothing has been accumulated
    # yet (0 / 0 would otherwise produce NaN).
    safe_count = T.maximum(count, np.float32(1))
    avg_grads = [clip(g / safe_count, 1) for g in acc_grads]

    acc = theano.function(
        inputs=[X, Y],
        outputs=T.mean(error),
        updates=acc_update,
    )
    update = theano.function(
        inputs=[],
        updates=updates.adadelta(parameters, avg_grads, learning_rate=1e-8) +
        acc_clear
    )
    test = theano.function(
        inputs=[X],
        outputs=T.argmax(output, axis=1)[-(X.shape[0] / 2):],
    )
    return acc, update, test
def build_network(input_size, hidden_size, constraint_adj=False):
    """Build a one-hidden-layer network whose output layer reuses the
    transpose of the input-to-hidden weights.

    ``constraint_adj`` is accepted but currently has no effect: the
    adjacency constraint it would enable is disabled.

    Returns:
        ``(X, output, cost, P)``: the symbolic byte-matrix input, the softmax
        output, the cost from ``build_error`` and the ``Parameters``
        container.
    """
    P = Parameters()
    X = T.bmatrix('X')

    # Parameters are created in this fixed order.
    P.W_input_hidden = U.initial_weights(input_size, hidden_size)
    P.b_hidden = U.initial_weights(hidden_size)
    P.b_output = U.initial_weights(input_size)

    hidden_lin = T.dot(X, P.W_input_hidden) + P.b_hidden
    hidden = T.nnet.sigmoid(hidden_lin)
    output_lin = T.dot(hidden, P.W_input_hidden.T) + P.b_output
    output = T.nnet.softmax(output_lin)

    # Collected but not used any further in this function.
    parameters = P.values()
    cost = build_error(X, output, P)

    if constraint_adj:
        # Disabled: cost = cost + adjacency_constraint(hidden_lin)
        pass
    return X, output, cost, P
def make_train_functions():
    """Build the model and compile gradient-accumulation training functions.

    Returns:
        ``(acc, update, test)``:

        * ``acc(X, Y)`` accumulates the gradients of the summed error,
          increments the counter and returns the mean error.
        * ``update()`` applies an adadelta step using the clipped average of
          the accumulated gradients, then clears accumulators and counter.
        * ``test(X)`` returns the argmax predictions for the second half of
          the sequence.
    """
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}

    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )
    output = predict(X, aux=aux)

    # (129 + Y) % 129 leaves labels 0..128 unchanged and maps -1 to 128.
    error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))])
    # Only the second half of the sequence contributes to the error.
    error = error[-(Y.shape[0] / 2):]

    parameters = P.values()
    gradients = T.grad(T.sum(error), wrt=parameters)

    shapes = [p.get_value().shape for p in parameters]
    count = theano.shared(np.float32(0))
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]

    acc_update = [(a, a + g) for a, g in zip(acc_grads, gradients)] + \
                 [(count, count + np.float32(1))]
    acc_clear = [(a, np.float32(0) * a) for a in acc_grads] + \
                [(count, np.int32(0))]

    # Average first, then clip.  The original computed ``g / count`` and then
    # immediately replaced the result with a clip of the raw accumulators, so
    # the averaging never took effect.  Dividing by at least 1 keeps update()
    # safe when it is called before any acc() (no 0 / 0 NaN).
    safe_count = T.maximum(count, np.float32(1))
    avg_grads = [clip(g / safe_count, 1) for g in acc_grads]

    acc = theano.function(
        inputs=[X, Y],
        outputs=T.mean(error),
        updates=acc_update,
    )
    update = theano.function(
        inputs=[],
        updates=updates.adadelta(parameters, avg_grads, learning_rate=1e-8) +
        acc_clear)
    test = theano.function(
        inputs=[X],
        outputs=T.argmax(output, axis=1)[-(X.shape[0] / 2):],
    )
    return acc, update, test
def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Create the model parameters and compile one adadelta training step.

    The gradients are taken of the cross-entropy summed over the whole
    sequence, while the value reported by the compiled function is the mean
    cross-entropy over the last half of the sequence only.

    Args:
        input_size, output_size, mem_size, mem_width: forwarded unchanged to
            ``controller.build`` / ``model.build``.
        hidden_sizes: list of controller hidden-layer sizes; the last entry
            is also passed to ``model.build``.  Defaults to ``[100]``.

    Returns:
        ``(P, train)``: the ``Parameters`` container and the compiled
        ``theano.function`` taking the ``input_sequence`` and
        ``output_sequence`` matrices.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if hidden_sizes is None:
        hidden_sizes = [100]

    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    # Only the last sequence returned by predict is the predicted output.
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    # Squash predictions away from exactly 0 and 1 before the cross-entropy.
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(
            5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq),
        axis=1)
    # NOTE: no L2 penalty is added here (a "+ 1e-3 * l2" term was disabled).
    cost = T.sum(cross_entropy)

    params = P.values()
    # Element-wise gradient clipping to [-100, 100].
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    # Half the number of rows of the input; used to report only the tail.
    response_length = input_seq.shape[0] / 2
    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=T.mean(cross_entropy[-response_length:]),
        updates=updates.adadelta(params, grads))
    return P, train
def create_model(ids, vocab2id, size):
    """Construct the symbolic graph that scores the next word at each step.

    The same ``size`` serves as word-vector size and hidden-state size.

    Returns:
        A 4-tuple ``(scores, cost, obv_cost, P)``: the softmax scores, the
        two values returned by ``word_cost`` (in that order) and the
        ``Parameters`` container.
    """
    word_vector_size = hidden_state_size = size

    P = Parameters()
    P.V = create_vocab_vectors(P, vocab2id, word_vector_size)

    # Zero-initialised output projection (transposed shape of P.V) and a
    # zero bias with one entry per row of P.V.
    v_shape = P.V.get_value().shape
    P.W_predict = np.zeros(v_shape).T
    P.b_predict = np.zeros((v_shape[0], ))

    X = P.V[ids]
    step = build_lstm_step(P, word_vector_size, hidden_state_size)
    scan_outputs, _ = theano.scan(step,
                                  sequences=[X],
                                  outputs_info=[P.init_h, P.init_c])
    # Two sequences come back from the scan; only the first is used.
    states, _ = scan_outputs

    linear_scores = T.dot(states, P.W_predict) + P.b_predict
    scores = T.nnet.softmax(linear_scores)

    # Scores at position t are compared against the id at position t + 1.
    log_likelihood, cross_ent = word_cost(scores[:-1], ids[1:])
    # NOTE: an L1 penalty (1e-4 * sum of |w|) was disabled by the author.
    cost = log_likelihood
    obv_cost = cross_ent
    return scores, cost, obv_cost, P
def make_model(input_size=8, output_size=8, mem_size=128, mem_width=20,
               hidden_sizes=None):
    """Build the model and compile a forward-only function.

    Args:
        input_size, output_size, mem_size, mem_width: forwarded unchanged to
            ``controller.build`` / ``model.build``.
        hidden_sizes: list of controller hidden-layer sizes; the last entry
            is also passed to ``model.build``.  Defaults to ``[100]``.

    Returns:
        ``(P, test_fun)``: the ``Parameters`` container and a compiled
        ``theano.function`` mapping an ``input_sequence`` matrix to
        ``[weights, output]``.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if hidden_sizes is None:
        hidden_sizes = [100]

    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    # The first result of predict is not exposed by the compiled function.
    [M_curr, weights, output] = predict(input_seq)

    test_fun = theano.function(inputs=[input_seq], outputs=[weights, output])
    return P, test_fun
def __init__(self, hidden_size, input_size, vocab_size, entropy_reg = 0.001, key_entropy_reg = 0.001, stack_size=1, celltype=LSTM):
    """Assemble the stacked-cell model, the turing model and the derived functions.

    The statement order below matters: the symbolic inputs and the random
    stream are created before the prediction graphs, and the learning rates
    are assigned before ``create_training_function`` is called.

    Args:
        hidden_size: size of every stacked cell layer; also passed to
            ``turing_model.build``.
        input_size: output size of the embedding / input size of the stack.
        vocab_size: embedding input size and classifier output size.
        entropy_reg: stored as ``self.entropy_reg``.
        key_entropy_reg: stored as ``self.key_entropy_reg``.
        stack_size: number of stacked cell layers.
        celltype: cell class handed to ``StackedCells``.
    """
    # core layer in RNN/LSTM
    self.model = StackedCells(input_size, celltype=celltype, layers =[hidden_size] * stack_size)
    # add an embedding (inserted as the first layer)
    self.model.layers.insert(0, Embedding(vocab_size, input_size))
    # add a classifier (appended as the last layer):
    self.model.layers.append(Layer(hidden_size, vocab_size, activation = softmax))
    self.entropy_reg = entropy_reg
    self.key_entropy_reg = key_entropy_reg
    # separate parameter container for the turing machine model
    self.turing_params = Parameters()
    # init turing machine model
    self.turing_updates , self.turing_predict = turing_model.build(self.turing_params , hidden_size , vocab_size)
    self.hidden_size = hidden_size
    # inputs are matrices of indices,
    # each row is a sentence, each column a timestep
    self._stop_word = theano.shared(np.int32(999999999), name="stop word")
    self.for_how_long = T.ivector()
    self.mask_matrix = T.imatrix()
    self.input_mat = T.imatrix()
    self.priming_word = T.iscalar()
    # random stream seeded with a random integer in [0, 1024)
    self.srng = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))
    # create symbolic variables for prediction:
    # (changed by darong; open question from the author: what does "greedy" mean here?)
    self.lstm_predictions = self.create_lstm_prediction()
    self.final_predictions,self.entropy,self.key_entropy = self.create_final_prediction()
    # create symbolic variable for greedy search:
    self.greedy_predictions = self.create_lstm_prediction(greedy=True)
    # create gradient training functions:
    self.create_cost_fun()# author's note: creates 2 cost functions (lstm, final)
    # learning rates; set before the training functions are created
    self.lstm_lr = 0.01
    self.turing_lr = 0.01
    self.all_lr = 0.01
    self.create_training_function()# author's note: creates 3 functions (lstm, turing, all)
    self.create_predict_function()# author's note: creates 2 predictions (lstm, final)
    # create ppl
    self.lstm_ppl = self.create_lstm_ppl()
    self.final_ppl = self.create_final_ppl()
    self.create_ppl_function()
def make_functions(input_size, output_size, mem_size, mem_width, hidden_sizes=[100]): start_time = time.time() input_seqs = T.btensor3('input_sequences') output_seqs = T.btensor3('output_sequences') P = Parameters() process = model.build(P, input_size, output_size, mem_size, mem_width, hidden_sizes[0]) outputs = process(T.cast(input_seqs, 'float32')) output_length = (input_seqs.shape[1] - 2) // 2 Y = output_seqs[:, -output_length:, :-2] Y_hat = T.nnet.sigmoid(outputs[:, -output_length:, :-2]) cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat, Y)) bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2) params = P.values() cost = cross_entropy # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params) print "Computing gradients", grads = T.grad(cost, wrt=params) grads = updates.clip_deltas(grads, np.float32(clip_length)) print "Done. (%0.3f s)" % (time.time() - start_time) start_time = time.time() print "Compiling function", P_learn = Parameters() update_pairs = updates.rmsprop(params, grads, learning_rate=1e-4, P=P_learn) train = theano.function( inputs=[input_seqs, output_seqs], outputs=cross_entropy, updates=update_pairs, ) test = theano.function(inputs=[input_seqs, output_seqs], outputs=bits_loss) print "Done. (%0.3f s)" % (time.time() - start_time) print P.parameter_count() return P, P_learn, train, test
def make_model(input_size=8, output_size=8, mem_size=128, mem_width=20,
               hidden_size=100):
    """Compile a forward-only Theano function for the NTM's model.

    Returns:
        ``(P, test_fun)`` where ``P`` holds the model parameters and
        ``test_fun`` maps an ``input_sequence`` matrix to
        ``[weights, output]``.
    """
    P = Parameters()

    # The controller is built first and then plugged into the model.
    predict = model.build(
        P, mem_size, mem_width, hidden_size,
        controller.build(P, input_size, output_size,
                         mem_size, mem_width, hidden_size))

    input_seq = T.matrix('input_sequence')
    # The first of the three results is not exposed by the compiled function.
    _, weights, output = predict(input_seq)

    test_fun = theano.function(
        inputs=[input_seq],
        outputs=[weights, output])
    return P, test_fun
def build_network(input_size, hidden_size, constraint_adj=False):
    """Create the symbolic network, its cost and its parameters.

    A sigmoid hidden layer is followed by a softmax output layer that
    multiplies by the transpose of the input-to-hidden weight matrix.
    ``constraint_adj`` currently changes nothing because the adjacency
    constraint is disabled.

    Returns:
        ``(X, output, cost, P)``.
    """
    P = Parameters()
    X = T.bmatrix('X')

    # Weight and bias creation order is kept fixed.
    P.W_input_hidden = U.initial_weights(input_size, hidden_size)
    P.b_hidden = U.initial_weights(hidden_size)
    P.b_output = U.initial_weights(input_size)

    W = P.W_input_hidden
    hidden_lin = T.dot(X, W) + P.b_hidden
    hidden = T.nnet.sigmoid(hidden_lin)
    output = T.nnet.softmax(T.dot(hidden, W.T) + P.b_output)

    # Gathered here but not referenced again in this function.
    parameters = P.values()
    cost = build_error(X, output, P)

    if constraint_adj:
        # Disabled: cost = cost + adjacency_constraint(hidden_lin)
        pass
    return X, output, cost, P
acc_size = 0 for i, size in enumerate(input_sizes): P["W_%s_%d" % (name, i)] = weights[acc_size:acc_size + size] Ws.append(P["W_%s_%d" % (name, i)]) acc_size += size P["b_%s" % name] = np.zeros((output_size, ), dtype=np.float32) b = P["b_%s" % name] def transform(Xs): acc = 0. for X, W in zip(Xs, Ws): if X.dtype.startswith('int'): acc += W[X] else: acc += T.dot(X, W) output = activation(acc + b) output.name = name return output return transform if __name__ == "__main__": import vae P = Parameters() inferer = build_classifier(P, "z1_latent", [10, 5], [5, 5, 5, 5], 5) print inferer( [T.constant(np.arange(5)), T.constant(np.eye(5, dtype=np.float32))]).eval()
P.b_output = np.zeros((output_size, )) def model(X): hidden = lstm_layer(X)[1] return T.nnet.softmax(T.dot(hidden, P.W_output) + P.b_output) return model def label_seq(string): idxs = font.indexify(string) return idxs if __name__ == "__main__": P = Parameters() X = T.matrix('X') Y = T.ivector('Y') predict = build_model(P, 8, 512, len(font.chars) + 1) probs = predict(X) alpha = 0.5 params = P.values() cost = ctc.cost(probs, Y) #+ 1e-8 * sum(T.sum(T.sqr(w)) for w in params) gradients = T.grad(cost, wrt=params) gradient_acc = [theano.shared(0 * p.get_value()) for p in params] counter = theano.shared(np.float32(0.)) acc = theano.function(inputs=[X, Y], outputs=cost,
) return acc,update if __name__ == "__main__": training_file = sys.argv[1] compute_tree_exists = False vocab_in = vocab.load("qa2.pkl") vocab_size = len(vocab_in) print "Vocab size is:", vocab_size evidence_count = 2 if compute_tree_exists: inputs,outputs,params,grads = pickle.load(open("compute_tree.pkl")) else: print "Creating compute tree...", P = Parameters() story = T.ivector('story') idxs = T.ivector('idxs') qstn = T.ivector('qstn') ans_evds = T.ivector('ans_evds') ans_lbl = T.iscalar('ans_lbl') attention = model.build(P, word_rep_size = 128, stmt_hidden_size = 128, diag_hidden_size = 128, vocab_size = vocab_size, output_size = vocab_size, map_fun_size = 128, evidence_count = evidence_count )
return acc, update if __name__ == "__main__": training_file = sys.argv[1] compute_tree_exists = False vocab_in = vocab.load("qa2.pkl") vocab_size = len(vocab_in) print "Vocab size is:", vocab_size evidence_count = 2 if compute_tree_exists: inputs, outputs, params, grads = pickle.load(open("compute_tree.pkl")) else: print "Creating compute tree...", P = Parameters() story = T.ivector('story') idxs = T.ivector('idxs') qstn = T.ivector('qstn') ans_evds = T.ivector('ans_evds') ans_lbl = T.iscalar('ans_lbl') attention = model.build(P, word_rep_size=128, stmt_hidden_size=128, diag_hidden_size=128, vocab_size=vocab_size, output_size=vocab_size, map_fun_size=128, evidence_count=evidence_count)
predict = T.nnet.softmax(T.dot(hidden, W_hidden_output) + b_output) return X, predict def label_seq(string): idxs = font.indexify(string) result = np.ones((len(idxs) * 2 + 1,), dtype=np.int32) * -1 result[np.arange(len(idxs)) * 2 + 1] = idxs print result return result if __name__ == "__main__": P = Parameters() X = T.matrix('X') Y = T.ivector('Y') X, predict = build_model(P, X, 10, 10, 10) cost = ctc.cost(predict, Y) params = P.values() grad = T.grad(cost, wrt=params) train = theano.function( inputs=[X, Y], outputs=cost, updates=updates.adadelta(params, grad) ) for _ in xrange(10): print train(np.eye(10, dtype=np.float32)[::-1], np.arange(10, dtype=np.int32))
P.W_output = np.zeros((hidden_size,output_size)) P.b_output = np.zeros((output_size,)) def model(X): hidden = lstm_layer(X)[1] return T.nnet.softmax(T.dot(hidden,P.W_output) + P.b_output) return model def label_seq(string): idxs = font.indexify(string) return idxs if __name__ == "__main__": P = Parameters() X = T.matrix('X') Y = T.ivector('Y') predict = build_model(P,8,512,len(font.chars)+1) probs = predict(X) alpha = 0.5 params = P.values() cost = ctc.cost(probs, Y) #+ 1e-8 * sum(T.sum(T.sqr(w)) for w in params) gradients = T.grad(cost, wrt=params) gradient_acc = [ theano.shared(0 * p.get_value()) for p in params ] counter = theano.shared(np.float32(0.)) acc = theano.function(
B1 = -0.5*(((z2-w1)/0.4)**2) - 0.1 * w4 B2 = -0.5*(((z2-w1+w3)/0.35)**2) - 0.1 * w4 B3 = -0.5*(z1**2 + z2**2/5.) return lse(lse(B1,B2),B3) from theano_toolkit.parameters import Parameters from theano_toolkit import updates from pprint import pprint floatX = theano.config.floatX print 'building model' z0 = T.matrix('z0') P = Parameters() iaf, masks = iaf_made_wn(P,L=8,num_units=64, num_hids=1,nonl=T.nnet.elu, cond_bias=False) zT, ss = iaf(z0,cond_bias=None) parameters = P.values() pprint(parameters) logp = U(zT) logq = - ss losses = logq - logp loss = losses.mean() gradients = updates.clip_deltas(T.grad(loss, wrt=parameters), 5) P_train = Parameters() fupdates = updates.adam(parameters, gradients,
if __name__ == "__main__": batch_size = 256 validation = 0.1 all_X, all_W, all_Y = data.load('data/training.csv') validation_count = int(math.ceil(all_X.shape[0] * validation)) train_X, train_W, train_Y = (all_X[:-validation_count], all_W[:-validation_count], all_Y[:-validation_count]) valid_X, valid_W, valid_Y = (all_X[-validation_count:], all_W[-validation_count:], all_Y[-validation_count:]) P = Parameters() data_X = theano.shared(train_X) data_W = theano.shared(train_W) data_Y = theano.shared(train_Y) train, test = get_train_test_fn(P, data_X, data_W, data_Y) batches = int(math.ceil(train_X.shape[0] / float(batch_size))) best_score = -np.inf for epoch in xrange(20): for i in xrange(batches): train(i, batch_size) scores = test(valid_X, valid_W, valid_Y) print scores, if scores[0] > best_score : P.save('model.pkl') best_score = scores[0]
import theano.tensor as T import numpy as np from theano_toolkit import utils as U from theano_toolkit import hinton from theano_toolkit import updates from theano_toolkit.parameters import Parameters import ctc import font import lstm from ocr import * if __name__ == "__main__": import sys test_word = sys.argv[1] P = Parameters() X = T.matrix('X') predict = build_model(P,8,512,len(font.chars)+1) probs = predict(X) test = theano.function(inputs=[X],outputs=probs) P.load('model.pkl') image = font.imagify(test_word) hinton.plot(image.astype(np.float32).T[::-1]) y_seq = label_seq(test_word) probs = test(image) print " ", ' '.join(font.chars[i] if i < len(font.chars) else "_" for i in np.argmax(probs,axis=1)) hinton.plot(probs[:,y_seq].T,max_arr=1.)
forget_gate = T.nnet.sigmoid(forget_lin) cell_updates = T.tanh(cell_lin) cell = forget_gate * prev_cell + in_gate * cell_updates out_lin = x_o + h_o + b_o + T.dot(cell, V_o) out_gate = T.nnet.sigmoid(out_lin) hid = out_gate * T.tanh(cell) return cell, hid return step if __name__ == "__main__": P = Parameters() X = T.ivector('X') P.V = np.zeros((8, 8), dtype=np.int32) X_rep = P.V[X] P.W_output = np.zeros((15, 8), dtype=np.int32) lstm_layer = build(P, name="test", input_size=8, hidden_size=15) _, hidden = lstm_layer(X_rep) output = T.nnet.softmax(T.dot(hidden, P.W_output)) delay = 5 label = X[:-delay] predicted = output[delay:] cost = -T.sum(T.log(predicted[T.arange(predicted.shape[0]), label])) params = P.values()
in_gate = T.nnet.sigmoid(in_lin) forget_gate = T.nnet.sigmoid(forget_lin) cell_updates = T.tanh(cell_lin) cell = forget_gate * prev_cell + in_gate * cell_updates out_lin = x_o + h_o + b_o + T.dot(cell,V_o) out_gate = T.nnet.sigmoid(out_lin) hid = out_gate * T.tanh(cell) return cell,hid return step if __name__ == "__main__": P = Parameters() X = T.ivector('X') P.V = np.zeros((8,8),dtype=np.int32) X_rep = P.V[X] P.W_output = np.zeros((15,8),dtype=np.int32) lstm_layer = build(P, name = "test", input_size = 8, hidden_size =15 ) _,hidden = lstm_layer(X_rep) output = T.nnet.softmax(T.dot(hidden,P.W_output)) delay = 5 label = X[:-delay]
def __init__(self, input_size, output_size, mem_size, mem_width, hidden_sizes, num_heads, max_epochs, momentum, learning_rate ,grad_clip, l2_norm):
    """Store the hyper-parameters, build the model and compile its functions.

    After construction the instance exposes:

    * ``self.train(input_seq, output_seq)`` -- one rmsprop update, returns
      the regularised cost;
    * ``self.predict_cost(input_seq, output_seq)`` -- the same cost without
      any update;
    * ``self.predict(input_seq)`` -- ``[weights, output]``.

    ``grad_clip`` is indexed as ``grad_clip[0]`` (lower bound) and
    ``grad_clip[1]`` (upper bound) for element-wise gradient clipping;
    ``l2_norm`` is the coefficient of the squared-parameter penalty.
    """
    # Plain hyper-parameter bookkeeping.
    self.input_size = input_size
    self.output_size = output_size
    self.mem_size = mem_size
    self.mem_width = mem_width
    self.hidden_sizes = hidden_sizes
    self.num_heads = num_heads
    self.max_epochs = max_epochs
    self.momentum = momentum
    self.learning_rate = learning_rate
    self.grad_clip = grad_clip
    self.l2_norm = l2_norm

    # Training-progress bookkeeping.
    self.best_train_cost = np.inf
    self.best_valid_cost = np.inf
    self.train_his = []

    P = Parameters()
    ctrl = controller.build(
        P, self.input_size, self.output_size,
        self.mem_size, self.mem_width, self.hidden_sizes)
    predict = model.build(
        P, self.mem_size, self.mem_width,
        self.hidden_sizes[-1], ctrl, self.num_heads)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    # The first of the three results is not used below.
    _, weights, output = predict(input_seq)

    # Keep predictions strictly inside (0, 1) before the cross-entropy.
    bounded_output = 5e-6 + (1 - 2*5e-6)*output
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(bounded_output, output_seq),
        axis=1)

    self.params = P.values()
    # Sum of squared parameter values, accumulated starting from T.sum(0).
    l2 = sum(((p ** 2).sum() for p in self.params), T.sum(0))
    cost = T.sum(cross_entropy) + self.l2_norm * l2

    # Element-wise clipping with the caller-supplied bounds.
    raw_grads = T.grad(cost, wrt=self.params)
    grads = [T.clip(g, grad_clip[0], grad_clip[1]) for g in raw_grads]

    rmsprop_updates = updates.rmsprop(
        self.params, grads,
        momentum=self.momentum,
        learning_rate=self.learning_rate)
    self.train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        updates=rmsprop_updates)
    self.predict_cost = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost)
    self.predict = theano.function(
        inputs=[input_seq],
        outputs=[weights, output])
def __init__(self, input_size, output_size, mem_size, mem_width,
             hidden_sizes, num_heads, max_epochs, momentum, learning_rate,
             grad_clip, l2_norm):
    """Record the configuration, build the graph and compile three functions.

    Compiled attributes:

    * ``self.train`` -- takes the input and output sequence matrices,
      applies the rmsprop updates and returns the cost;
    * ``self.predict_cost`` -- same inputs, returns the cost, no updates;
    * ``self.predict`` -- takes the input sequence matrix and returns
      ``[weights, output]``.

    The cost is the summed binary cross-entropy plus ``l2_norm`` times the
    sum of squared parameters.  Gradients are clipped element-wise to
    ``[grad_clip[0], grad_clip[1]]``.
    """
    # Configuration.
    self.input_size = input_size
    self.output_size = output_size
    self.mem_size = mem_size
    self.mem_width = mem_width
    self.hidden_sizes = hidden_sizes
    self.num_heads = num_heads
    self.max_epochs = max_epochs
    self.momentum = momentum
    self.learning_rate = learning_rate
    self.grad_clip = grad_clip
    self.l2_norm = l2_norm

    # Best costs seen so far and the training history.
    self.best_train_cost = np.inf
    self.best_valid_cost = np.inf
    self.train_his = []

    P = Parameters()
    ctrl = controller.build(P, self.input_size, self.output_size,
                            self.mem_size, self.mem_width,
                            self.hidden_sizes)
    predict = model.build(P, self.mem_size, self.mem_width,
                          self.hidden_sizes[-1], ctrl, self.num_heads)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    # Three results are returned; the first one is not needed here.
    _, weights, output = predict(input_seq)

    # Squash the output away from exactly 0 and 1, then sum over axis 1.
    elementwise_xent = T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output, output_seq)
    cross_entropy = T.sum(elementwise_xent, axis=1)

    self.params = P.values()
    # Squared-parameter penalty, accumulated starting from T.sum(0).
    l2 = sum(((p**2).sum() for p in self.params), T.sum(0))
    cost = T.sum(cross_entropy) + self.l2_norm * l2

    lower, upper = grad_clip[0], grad_clip[1]
    grads = [T.clip(g, lower, upper)
             for g in T.grad(cost, wrt=self.params)]

    seq_pair = [input_seq, output_seq]
    self.train = theano.function(
        inputs=seq_pair,
        outputs=cost,
        updates=updates.rmsprop(self.params, grads,
                                momentum=self.momentum,
                                learning_rate=self.learning_rate))
    self.predict_cost = theano.function(inputs=seq_pair, outputs=cost)
    self.predict = theano.function(inputs=[input_seq],
                                   outputs=[weights, output])
def crossentropy(output,Y): if output.owner.op == T.nnet.softmax_op: x = output.owner.inputs[0] k = T.max(x,axis=1,keepdims=True) sum_x = T.log(T.sum(T.exp(x - k),axis=1)) + k return - x[T.arange(x.shape[0]),Y] + sum_x else: return T.nnet.categorical_crossentropy(outputs,Y) if __name__ == "__main__": config.parse_args() total_frames = sum(x.shape[0] for x,_ in frame_label_data.training_stream()) logging.info("Total frames: %d"%total_frames) P = Parameters() predict = model.build(P) X = T.matrix('X') Y = T.ivector('Y') _,outputs = predict(X) cross_entropy = T.mean(crossentropy(outputs,Y)) parameters = P.values() loss = cross_entropy + \ (0.5/total_frames) * sum(T.sum(T.sqr(w)) for w in parameters) gradients = T.grad(loss,wrt=parameters) logging.info("Parameters to tune:" + ', '.join(sorted(w.name for w in parameters))) update_vars = Parameters() logging.debug("Compiling functions...")
def _tile_rows(vector, n_rows):
    """Stack `vector` `n_rows` times along a new leading axis."""
    return repeat(vector.dimshuffle(('x', 0)), n_rows, axis=0)


def _margin_cost_vector(image_projection, correct_encoding,
                        negative_encodings, n_negatives):
    """Return max(0, 1 - correct_score + negative_score), one entry per negative."""
    correct_score = T.dot(image_projection, correct_encoding)
    negative_scores = T.batched_dot(_tile_rows(image_projection, n_negatives),
                                    negative_encodings)
    zero_cost = T.zeros_like(negative_scores)
    margin_cost = 1 - correct_score + negative_scores
    return T.switch(T.gt(zero_cost, margin_cost), zero_cost, margin_cost)


def _l2_penalty(params):
    """Return the sum of squared entries over every parameter in `params`."""
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    return l2


def make_train(image_size, word_size, first_hidden_size, proj_size, reg_lambda):
    """Build the image/triplet ranking model and compile its Theano functions.

    Two parameter sets are created:

    * ``P`` holds the image projection and triplet encoder parameters. It is
      optimised by ``train`` with a margin ranking cost plus
      ``reg_lambda`` times an L2 penalty.
    * ``P_default`` holds one "default" word vector per triplet slot
      ('left', 'relation', 'right'). ``train_default`` optimises only these
      vectors: for each slot it scores triplets in which that slot comes from
      the input and the other two slots are filled with the defaults.

    Returns:
        ``(P, P_default, train, valid, test, train_default, test_default)``.
        ``train``/``valid`` take the image vector, the correct triplet (three
        vectors) and the negative triplets (three matrices); ``train`` and
        ``train_default`` additionally take the learning rate. ``test`` and
        ``test_default`` take the image vector and three candidate matrices.
    """
    # ---- model ---------------------------------------------------------
    P = Parameters()
    image_projecting = image_project.build(P, image_size, proj_size)
    batched_triplet_encoding, vector_triplet_encoding = triplet_encoding.build(
        P, word_size, first_hidden_size, proj_size)

    image_vector = T.vector()
    image_projection_vector = image_projecting(image_vector)

    # ---- training / validation ----------------------------------------
    # Slot order is [entity, relation, entity].
    correct_triplet = [T.vector(dtype='float32') for _ in range(3)]
    negative_triplet = [T.matrix(dtype='float32') for _ in range(3)]
    n_negatives = negative_triplet[0].shape[0]

    cost_vector = _margin_cost_vector(
        image_projection_vector,
        vector_triplet_encoding(*correct_triplet),
        batched_triplet_encoding(*negative_triplet),
        n_negatives)

    params = P.values()
    cost = (T.sum(cost_vector) / T.shape(negative_triplet[0])[0]
            + reg_lambda * _l2_penalty(params))
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    lr = T.scalar(name='learning rate', dtype='float32')
    train = theano.function(
        inputs=[image_vector] + correct_triplet + negative_triplet + [lr],
        outputs=cost,
        updates=updates.rmsprop(params, grads, learning_rate=lr),
        allow_input_downcast=True
    )
    valid = theano.function(
        inputs=[image_vector] + correct_triplet + negative_triplet,
        outputs=cost,
        allow_input_downcast=True
    )

    # ---- testing -------------------------------------------------------
    all_triplet = [T.matrix(dtype='float32') for _ in range(3)]
    all_cross_dot_vector = T.batched_dot(
        _tile_rows(image_projection_vector, all_triplet[0].shape[0]),
        batched_triplet_encoding(*all_triplet))
    test = theano.function(
        inputs=[image_vector] + all_triplet,
        outputs=all_cross_dot_vector,
        allow_input_downcast=True
    )

    # ---- default slot vectors -----------------------------------------
    P_default = Parameters()
    # Uniform in [-1, 1); creation order is kept so the RNG draws are unchanged.
    P_default['left'] = 2 * (np.random.rand(word_size) - 0.5)
    P_default['right'] = 2 * (np.random.rand(word_size) - 0.5)
    P_default['relation'] = 2 * (np.random.rand(word_size) - 0.5)
    slot_names = ('left', 'relation', 'right')

    correct_triplet_d = [T.vector(dtype='float32') for _ in range(3)]
    negative_triplet_d = [T.matrix(dtype='float32') for _ in range(3)]

    cost_d = 0
    for i in range(3):
        n_negatives_d = negative_triplet_d[i].shape[0]
        # Slot i comes from the inputs, the other two from the defaults.
        correct_parts = [P_default[name] for name in slot_names]
        negative_parts = [_tile_rows(P_default[name], n_negatives_d)
                          for name in slot_names]
        correct_parts[i] = correct_triplet_d[i]
        negative_parts[i] = negative_triplet_d[i]

        cost_vector_d = _margin_cost_vector(
            image_projection_vector,
            vector_triplet_encoding(*correct_parts),
            batched_triplet_encoding(*negative_parts),
            n_negatives_d)
        # Fix: normalise by this function's own negatives. The original used
        # `negative_triplet[i]`, which is not an input of `train_default`.
        cost_d = cost_d + T.sum(cost_vector_d) / T.shape(negative_triplet_d[i])[0]

    params_d = P_default.values()
    cost_d = cost_d + 0.01 * _l2_penalty(params_d)
    grads_d = [T.clip(g, -100, 100) for g in T.grad(cost_d, wrt=params_d)]
    train_default = theano.function(
        inputs=[image_vector] + correct_triplet_d + negative_triplet_d + [lr],
        outputs=cost_d,
        updates=updates.rmsprop(params_d, grads_d, learning_rate=lr),
        allow_input_downcast=True
    )

    all_triplet_d = [T.matrix(dtype='float32') for _ in range(3)]
    result = [[], [], []]
    for i in range(3):
        # Fix: size the tiling from `all_triplet_d`; the original used
        # `all_triplet[i]`, which is not an input of `test_default`.
        n_candidates = all_triplet_d[i].shape[0]
        candidate_parts = [_tile_rows(P_default[name], n_candidates)
                           for name in slot_names]
        candidate_parts[i] = all_triplet_d[i]
        result[i] = T.batched_dot(
            _tile_rows(image_projection_vector, n_candidates),
            batched_triplet_encoding(*candidate_parts))
    test_default = theano.function(
        inputs=[image_vector] + all_triplet_d,
        outputs=result,
        allow_input_downcast=True
    )

    return P, P_default, train, valid, test, train_default, test_default
# Fragment: tail of an LSTM step closure and of the builder that returns it.
# Both headers lie above this chunk, so only the visible statements are
# annotated.
        # Forget gate activation.
        forget_gate = T.nnet.sigmoid(forget_lin)
        # Candidate cell values.
        cell_updates = T.tanh(cell_lin)
        # New cell state: gated previous cell plus gated candidate values.
        cell = forget_gate * prev_cell + in_gate * cell_updates

        # The output gate also receives the freshly computed cell via V_o.
        out_lin = x_o + h_o + b_o + T.dot(cell, V_o)
        out_gate = T.nnet.sigmoid(out_lin)

        hid = out_gate * T.tanh(cell)
        return cell, hid
    return step


if __name__ == "__main__":
    # Builds a small graph around the layer: integer inputs are looked up in
    # P.V, run through the LSTM and projected to a softmax.
    P = Parameters()
    X = T.ivector("X")
    P.V = np.zeros((8, 8), dtype=np.int32)
    X_rep = P.V[X]
    P.W_output = np.zeros((15, 8), dtype=np.int32)
    lstm_layer = build(P, name="test", input_size=8, hidden_size=15)
    _, hidden = lstm_layer(X_rep)
    output = T.nnet.softmax(T.dot(hidden, P.W_output))
    # The prediction at step t (t >= delay) is scored against the input seen
    # `delay` steps earlier.
    delay = 5
    label = X[:-delay]
    predicted = output[delay:]
    # Summed negative log-likelihood of the delayed labels.
    cost = -T.sum(T.log(predicted[T.arange(predicted.shape[0]), label]))
    params = P.values()
import theano.tensor as T
import numpy as np
from theano_toolkit.parameters import Parameters
import data_io
import model
import vae
import matplotlib
import sys
# The backend must be selected before pyplot is imported.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

if __name__ == "__main__":
    # Every command-line argument is a layer index to sample from.
    model.SAMPLED_LAYERS = [int(s) for s in sys.argv[1:]]
    print(model.SAMPLED_LAYERS)

    P = Parameters()
    autoencoder, inpaint = model.build(P)
    parameters = P.values()

    X = T.itensor4('X')
    # Integer pixel values are rescaled by 1/255 before entering the model.
    X_hat, posteriors, priors = autoencoder(
        T.cast(X, 'float32') / np.float32(255.))

    # One KL term per (posterior, prior) pair, averaged over axis 0.
    latent_kls = []
    for (po_m, po_s), (pr_m, pr_s) in zip(posteriors, priors):
        latent_kls.append(
            T.mean(vae.kl_divergence(po_m, po_s, pr_m, pr_s), axis=0))

    # The reconstruction is compared with X cropped by 16 on each side of
    # the last two axes.
    recon_loss = model.cost(X_hat, X[:, :, 16:-16, 16:-16])
    val_loss = (recon_loss + sum(latent_kls)) / (32**2)

    X_recon = inpaint(T.cast(X, 'float32') / np.float32(255.))
    Y = model.predict(X_recon)
import theano
import theano.tensor as T
import numpy as np
import sys

import data
import model
from theano_toolkit.parameters import Parameters
from theano_toolkit import updates

if __name__ == '__main__':
    # Usage: <script> MODEL_FILE TEST_FILE TRAIN_FILE
    model_filename, test_filename, train_filename = (
        sys.argv[1], sys.argv[2], sys.argv[3])

    P = Parameters()
    data_X, df = data.load_test(test_filename, train_filename)

    # The network's input width is taken from the loaded feature matrix.
    f = model.build(
        P,
        input_size=data_X.shape[1],
        hidden_sizes=[256, 128, 64, 32]
    )

    X = T.matrix('X')
    # Threshold the test-mode network output at 0.5.
    is_positive = f(X, test=True) > 0.5
    predict = theano.function(inputs=[X], outputs=is_positive)

    # Parameters are loaded after compilation, before the function is called.
    P.load(model_filename)
    output = predict(data_X)

    print(data_X.shape)
    print(output.shape)
    print(df.values.shape)
# `theano.function` is called below. `import theano.tensor as T` alone does
# not bind the name `theano`, so it is imported explicitly rather than
# relying on `from ocr import *` to provide it.
import theano
import theano.tensor as T
import numpy as np
from theano_toolkit import utils as U
from theano_toolkit import hinton
from theano_toolkit import updates
from theano_toolkit.parameters import Parameters
import ctc
import font
import lstm
from ocr import *

if __name__ == "__main__":
    import sys
    # Usage: <script> WORD
    test_word = sys.argv[1]

    P = Parameters()
    X = T.matrix('X')
    # One output per font character plus one extra class, which is printed
    # as "_" below.
    predict = build_model(P, 8, 512, len(font.chars) + 1)

    probs = predict(X)
    test = theano.function(inputs=[X], outputs=probs)
    P.load('model.pkl')

    # Render the word and show the image that is fed to the network.
    image = font.imagify(test_word)
    hinton.plot(image.astype(np.float32).T[::-1])

    y_seq = label_seq(test_word)
    probs = test(image)

    # Most likely class per row of `probs`; indices beyond the character
    # table are shown as "_". The output text is the same as the original
    # two-argument print statement.
    decoded = ' '.join(
        font.chars[i] if i < len(font.chars) else "_"
        for i in np.argmax(probs, axis=1))
    print("  " + decoded)

    # Probability of each expected label, row by row.
    hinton.plot(probs[:, y_seq].T, max_arr=1.)
import theano
import theano.tensor as T
import numpy as np
import vocab
import model
from theano_toolkit.parameters import Parameters

if __name__ == "__main__":
    # NOTE(review): `args` and `pickle` are used below but not defined or
    # imported in the visible code -- confirm they are set up elsewhere.
    model_file = args.model_file
    temp_input = args.temperature
    id2char = pickle.load(args.vocab_file)
    char2id = vocab.load(args.vocab_file.name)
    prime_str = args.prime

    P = Parameters()
    # One extra class on top of the vocabulary size.
    sampler = model.build_sampler(P,
                                  character_count=len(char2id) + 1,
                                  embedding_size=20,
                                  hidden_size=100)
    P.load(model_file)

    # Symbolic inputs for one sampling step: temperature, current character
    # and the previous cell/hidden vectors of two layers.
    temp = T.scalar('temp')
    char = T.iscalar('char')
    # NOTE(review): p_hidden_1 is created with the label "p_hidden_2"; this
    # looks like a copy/paste slip (it only affects the debug name).
    p_cell_1, p_hidden_1, p_cell_2, p_hidden_2 = T.vector(
        "p_cell_1"), T.vector("p_hidden_2"), T.vector("p_cell_2"), T.vector(
        "p_hidden_2")

    output, cell_1, hidden_1, cell_2, hidden_2 = sampler(
        temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2)
    # The call below continues past the end of this chunk.
    sample = theano.function(
        inputs=[temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2],
def crossentropy(output, Y):
    """Return the per-row cross entropy of `output` against integer labels `Y`.

    When `output` is the direct result of `T.nnet.softmax_op`, the loss is
    computed from the pre-softmax input with a max-shifted log-sum-exp.
    Otherwise it falls back to `T.nnet.categorical_crossentropy`.

    Both branches return one value per row, so reductions such as `T.mean`
    give the same result as before.
    """
    owner = output.owner
    # `owner` is None for variables that no op produced (e.g. graph inputs).
    if owner is not None and owner.op == T.nnet.softmax_op:
        x = owner.inputs[0]
        # Row-wise max without keepdims. Adding a keepdims column to the
        # rank-1 log-sum would broadcast the result to an (n, n) matrix.
        k = T.max(x, axis=1)
        log_norm = T.log(T.sum(T.exp(x - k.dimshuffle(0, 'x')), axis=1)) + k
        return log_norm - x[T.arange(x.shape[0]), Y]
    # Fix: use the `output` parameter; the original read the global `outputs`.
    return T.nnet.categorical_crossentropy(output, Y)


if __name__ == "__main__":
    config.parse_args()
    # Count the training frames; the total scales the L2 term below.
    total_frames = sum(x.shape[0] for x, _ in frame_label_data.training_stream())
    logging.info("Total frames: %d" % total_frames)

    P = Parameters()
    predict = model.build(P)

    X = T.matrix('X')
    Y = T.ivector('Y')
    _, outputs = predict(X)
    cross_entropy = T.mean(crossentropy(outputs, Y))

    parameters = P.values()
    # Mean cross entropy plus an L2 penalty weighted by 0.5 / total_frames.
    loss = (cross_entropy
            + (0.5 / total_frames) * sum(T.sum(T.sqr(w)) for w in parameters))

    gradients = T.grad(loss, wrt=parameters)
    logging.info("Parameters to tune:" +
                 ', '.join(sorted(w.name for w in parameters)))

    update_vars = Parameters()
# Fragment: closing lines of a comprehension over (parameter, gradient)
# pairs; the expression it belongs to starts above this chunk.
        ) for p,g in zip(parameters,gradients)
    ]

def weight_norm(u,norm=1.9356):
    """Rescale `u` so that its L2 norm taken over axis 0 is at most `norm`."""
    in_norm = T.sqrt(T.sum(T.sqr(u),axis=0))
    # ratio is ~1 where in_norm <= norm and norm / in_norm otherwise; the
    # 1e-8 keeps the division finite when the norm is zero.
    ratio = T.minimum(norm,in_norm) / (in_norm + 1e-8)
    return ratio * u

def normalise_weights(updates):
    """Apply `weight_norm` to the new value of every parameter named 'W...'.

    `updates` is an iterable of (parameter, new_value) pairs; pairs whose
    parameter name does not start with 'W' pass through unchanged.
    """
    return [
        ( p, weight_norm(u) if p.name.startswith('W') else u )
        for p,u in updates
    ]

if __name__ == "__main__":
    P = Parameters()
    extract,_ = model.build(P, "vrnn")

    X = T.tensor3('X')
    l = T.ivector('l')
    [Z_prior_mean, Z_prior_std,
     Z_mean, Z_std, X_mean, X_std] = extract(X,l)

    parameters = P.values()
    batch_cost = model.cost(X,
                            Z_prior_mean, Z_prior_std,
                            Z_mean, Z_std, X_mean, X_std,l)
    print "Calculating gradient..."
    print parameters
    # Axis 1 of X is used as the batch size; gradients are divided by it.
    batch_size = T.cast(X.shape[1],'float32')

    gradients = T.grad(batch_cost,wrt=parameters)
    gradients = [ g / batch_size for g in gradients ]
# Fragment: end of a parameter list; the `def` line it belongs to starts
# above this chunk.
                        pool_factor, activation=activation):
    conv = build_conv_layer(P, name, input_size, output_size, rfield_size,
                            activation)

    def upsample(X):
        # Bilinear upsampling by `pool_factor`, then the convolution layer.
        upsamp_X = T.nnet.abstract_conv.bilinear_upsampling(X, pool_factor)
        Y = conv(upsamp_X)
        return Y
    return upsample


if __name__ == "__main__":
    from theano_toolkit.parameters import Parameters
    P = Parameters()
    image_size = 10
    rfield_size = 5
    input_size = 1
    # Random 4x4 test input.
    X = T.as_tensor_variable(
        np.random.randn(1, input_size, 4, 4).astype(np.float32))

    P.W = conv_weight_init(2, 1, 5)
    P.b = np.zeros(2)
    W = P.W
    b = P.b.dimshuffle('x', 0, 'x', 'x')

    # Hand-built 2x upsampling: start from zeros of twice the spatial size...
    upsamp_X_1 = T.zeros(
        (X.shape[0], X.shape[1], 2 * X.shape[2], 2 * X.shape[3]))
    # ...copy the original samples to the even positions...
    upsamp_X_1 = T.set_subtensor(upsamp_X_1[:, :, ::2, ::2], X)
    # ...and fill the odd rows in between with the mean of their two
    # vertical neighbours.
    upsamp_X_1 = T.inc_subtensor(upsamp_X_1[:, :, 1:-1:2, ::2],
                                 0.5 * (X[:, :, :-1] + X[:, :, 1:]))
# Fragment: last two statements of a model builder whose header lies above
# this chunk.
    predict = T.nnet.softmax(T.dot(hidden, W_hidden_output) + b_output)
    return X, predict


def label_seq(string):
    """Return the label indices of `string` interleaved with -1 entries.

    The result has length 2 * n + 1 for n labels: the odd positions hold the
    indices from `font.indexify`, every other position holds -1.
    """
    idxs = font.indexify(string)
    # Start from all -1, then drop the labels into the odd slots.
    result = np.ones((len(idxs) * 2 + 1, ), dtype=np.int32) * -1
    result[np.arange(len(idxs)) * 2 + 1] = idxs
    print result
    return result


if __name__ == "__main__":
    P = Parameters()
    X = T.matrix('X')
    Y = T.ivector('Y')
    X, predict = build_model(P, X, 10, 10, 10)

    cost = ctc.cost(predict, Y)
    params = P.values()
    grad = T.grad(cost, wrt=params)
    train = theano.function(inputs=[X, Y],
                            outputs=cost,
                            updates=updates.adadelta(params, grad))

    # Ten training steps on a reversed identity matrix with labels 0..9,
    # printing the cost of each step.
    for _ in xrange(10):
        print train(
            np.eye(10, dtype=np.float32)[::-1],
            np.arange(10, dtype=np.int32))
import pickle

import theano
import theano.tensor as T
import numpy as np
import vocab
import model
from theano_toolkit.parameters import Parameters

if __name__ == "__main__":
    # NOTE(review): `args` is not defined in the visible code -- confirm the
    # argument parsing that provides it exists elsewhere in the file.
    model_file = args.model_file
    temp_input = args.temperature
    id2char = pickle.load(args.vocab_file)
    char2id = vocab.load(args.vocab_file.name)
    prime_str = args.prime

    P = Parameters()
    # One extra class on top of the vocabulary size.
    sampler = model.build_sampler(
        P,
        character_count=len(char2id) + 1,
        embedding_size=20,
        hidden_size=100
    )
    P.load(model_file)

    # Symbolic inputs for one sampling step: temperature, current character
    # and the previous cell/hidden vectors of two layers.
    temp = T.scalar('temp')
    char = T.iscalar('char')
    p_cell_1 = T.vector("p_cell_1")
    # Fix: this variable was labelled "p_hidden_2" in the original.
    p_hidden_1 = T.vector("p_hidden_1")
    p_cell_2 = T.vector("p_cell_2")
    p_hidden_2 = T.vector("p_hidden_2")

    output, cell_1, hidden_1, cell_2, hidden_2 = sampler(
        temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2)

    # One compiled step: returns the sampler output and the new states.
    sample = theano.function(
        inputs=[temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2],
        outputs=[output, cell_1, hidden_1, cell_2, hidden_2]
    )
class Model:
    """
    Predictive model for forecasting words from a sequence.

    A stack of recurrent cells (embedding -> `stack_size` cells -> softmax
    classifier) gives the "lstm" predictions. A second predictor built by
    `turing_model.build` reads part of the last recurrent layer's state and
    gives the "final" predictions. Cost, training, prediction and perplexity
    functions are compiled for both.
    """

    def __init__(self, hidden_size, input_size, vocab_size, stack_size=1, celltype=LSTM):
        # Kept because `load` slices the initial hidden state with it.
        self.hidden_size = hidden_size

        # core layers of the RNN/LSTM
        self.model = StackedCells(input_size, celltype=celltype, layers=[hidden_size] * stack_size)
        # add an embedding in front
        self.model.layers.insert(0, Embedding(vocab_size, input_size))
        # add a classifier at the end
        self.model.layers.append(Layer(hidden_size, vocab_size, activation=softmax))

        # init turing machine model
        self.turing_params = Parameters()
        self.turing_updates, self.turing_predict = turing_model.build(
            self.turing_params, hidden_size, vocab_size)

        # inputs are matrices of indices,
        # each row is a sentence, each column a timestep
        self._stop_word = theano.shared(np.int32(999999999), name="stop word")
        self.for_how_long = T.ivector()
        self.input_mat = T.imatrix()
        self.priming_word = T.iscalar()

        self._compile(lstm_lr=0.01, turing_lr=0.01, all_lr=0.01)

    def _compile(self, lstm_lr, turing_lr, all_lr):
        """(Re)build every symbolic graph and compiled function from self.model."""
        self.srng = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))
        # symbolic predictions
        self.lstm_predictions = self.create_lstm_prediction()
        self.final_predictions = self.create_final_prediction()
        # symbolic greedy search
        self.greedy_predictions = self.create_lstm_prediction(greedy=True)
        # two costs (lstm, final)
        self.create_cost_fun()
        self.lstm_lr = lstm_lr
        self.turing_lr = turing_lr
        self.all_lr = all_lr
        # three training functions (lstm, turing, all)
        self.create_training_function()
        # prediction functions (lstm, final, greedy)
        self.create_predict_function()
        # perplexities
        self.lstm_ppl = self.create_lstm_ppl()
        self.final_ppl = self.create_final_ppl()
        self.create_ppl_function()

    def save(self, save_file, vocab):
        """Pickle the stacked model to `save_file` and `vocab` to `save_file + '.vocab'`."""
        # pickle (not cPickle) is used because the objects contain lambdas
        with open(save_file, "wb") as model_out:
            pickle.dump(self.model, model_out)
        with open(save_file + '.vocab', "wb") as vocab_out:
            pickle.dump(vocab, vocab_out)

    def save_turing(self, save_file):
        """Save the turing parameters to `save_file + '.turing'`."""
        self.turing_params.save(save_file + '.turing')

    def load(self, load_file, lr):
        """Load the stacked model (and turing parameters, if present) and recompile.

        When `load_file + '.turing'` does not exist, the turing parameters
        are initialised from the loaded classifier layer and the first
        recurrent layer's initial hidden state. `lr` becomes the learning
        rate of all three training functions.
        """
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(load_file, "rb") as model_in:
            self.model = pickle.load(model_in)

        if os.path.isfile(load_file + '.turing'):
            self.turing_params.load(load_file + '.turing')
        else:
            print("no turing model!!!! pretrain with lstm param")
            classifier = self.model.layers[-1]
            self.turing_params['W_input_hidden'] = classifier.params[0].get_value().T  # not sure
            self.turing_params['W_read_hidden'] = classifier.params[0].get_value().T
            self.turing_params['b_hidden_0'] = classifier.params[1].get_value()
            temp = self.model.layers[1].initial_hidden_state.get_value()[self.hidden_size:]
            self.turing_params['memory_init'] = temp.reshape((1,) + temp.shape)

        # the graphs reference the old model's variables, so compile again
        self._compile(lstm_lr=lr, turing_lr=lr, all_lr=lr)
        print("done loading model")

    def stop_on(self, idx):
        """Set the word index at which greedy search stops."""
        self._stop_word.set_value(idx)

    @property
    def params(self):
        return self.model.params

    def _scan_layers(self, greedy):
        """Run the stacked model through `theano.scan` and return all outputs.

        With `greedy=False` the scan consumes `input_mat[:, :-1]` one
        timestep at a time. With `greedy=True` it starts from
        `priming_word`, feeds the argmax back in, and stops after 200 steps
        or at the stop word.
        """
        def step(idx, *states):
            # new hiddens are the states we need to pass to the cells from
            # the past. The stack also includes the embedding, which has no
            # state, so a None is passed for it.
            new_hiddens = [None] + list(states)
            new_states = self.model.forward(idx, prev_hiddens=new_hiddens)
            if greedy:
                new_idx = new_states[-1].argmax()
                # provide a stopping condition for greedy search:
                return ([new_idx.astype(self.priming_word.dtype)] + new_states[1:-1]), \
                    theano.scan_module.until(T.eq(new_idx, self._stop_word))
            return new_states[1:]

        if greedy:
            outputs_info = [dict(initial=self.priming_word, taps=[-1])] + \
                [initial_state_with_taps(layer) for layer in self.model.layers[1:-1]]
            result, _ = theano.scan(fn=step, n_steps=200, outputs_info=outputs_info)
        else:
            # in the forecasting scenario we feed everything up to the
            # before-last step and predict the subsequent steps
            inputs = self.input_mat[:, 0:-1]
            num_examples = inputs.shape[0]
            outputs_info = [initial_state_with_taps(layer, num_examples)
                            for layer in self.model.layers[1:]]
            result, _ = theano.scan(fn=step, sequences=[inputs.T], outputs_info=outputs_info)
        return result

    def create_lstm_prediction(self, greedy=False):
        """Return the softmax predictions (or the greedy index sequence)."""
        result = self._scan_layers(greedy)
        if greedy:
            return result[0]
        # softmaxes are the last entry of the scan results; reorder to
        # 1. row / example, 2. timestep, 3. softmax dimension
        return result[-1].transpose((2, 0, 1))

    def create_final_prediction(self, greedy=False):
        """Return the turing predictor's output (or the greedy index sequence)."""
        result = self._scan_layers(greedy)
        if greedy:
            return result[0]
        # The turing predictor reads the second half of the last axis of the
        # before-last scan output. `//` keeps the split point an integer.
        hidden_size = result[-2].shape[2] // 2
        turing_result = self.turing_predict(result[-2][:, :, hidden_size:])
        # reorder to 1. row / example, 2. timestep, 3. softmax dimension
        return turing_result.transpose((1, 0, 2))

    def create_cost_fun(self):
        """Build `lstm_cost` and `final_cost` as summed masked losses."""
        # each prediction at every timestep guesses the next timestep's value
        what_to_predict = self.input_mat[:, 1:]
        # because some sentences are shorter, masks are placed where the
        # sentences end (for_how_long is shifted down by one here)
        for_how_long = self.for_how_long - 1
        # all sentences start at T=0
        starting_when = T.zeros_like(self.for_how_long)

        self.lstm_cost = masked_loss(self.lstm_predictions, what_to_predict,
                                     for_how_long, starting_when).sum()
        self.final_cost = masked_loss(self.final_predictions, what_to_predict,
                                      for_how_long, starting_when).sum()

    def create_predict_function(self):
        """Compile `lstm_pred_fun`, `final_pred_fun` and `greedy_fun`."""
        self.lstm_pred_fun = theano.function(
            inputs=[self.input_mat],
            outputs=self.lstm_predictions,
            allow_input_downcast=True
        )
        self.final_pred_fun = theano.function(
            inputs=[self.input_mat],
            outputs=self.final_predictions,
            allow_input_downcast=True
        )
        # the priming word is prepended to the greedy output
        self.greedy_fun = theano.function(
            inputs=[self.priming_word],
            outputs=T.concatenate([T.shape_padleft(self.priming_word), self.greedy_predictions]),
            allow_input_downcast=True
        )

    def create_training_function(self):
        """Compile the three update functions (lstm only, turing only, both)."""
        train_inputs = [self.input_mat, self.for_how_long]

        updates, _, _, _, _ = create_optimization_updates(
            self.lstm_cost, self.params, method="SGD", lr=self.lstm_lr)
        self.lstm_update_fun = theano.function(
            inputs=train_inputs,
            outputs=self.lstm_cost,
            updates=updates,
            allow_input_downcast=True)

        updates_turing = self.turing_updates(self.final_cost, lr=self.turing_lr)
        # NanGuardMode makes this function raise on NaN, inf or very large values
        self.turing_update_fun = theano.function(
            inputs=train_inputs,
            outputs=self.final_cost,
            updates=updates_turing,
            mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
            allow_input_downcast=True)

        # joint training: stacked-model updates plus turing updates
        updates_all, _, _, _, _ = create_optimization_updates(
            self.final_cost, self.params, method="SGD", lr=self.all_lr, part=True)
        for pair in self.turing_updates(self.final_cost, lr=self.all_lr):
            updates_all[pair[0]] = pair[1]
        self.all_update_fun = theano.function(
            inputs=train_inputs,
            outputs=self.final_cost,
            updates=updates_all,
            allow_input_downcast=True)

    def _create_ppl(self, predictions):
        """Return the symbolic perplexity of `predictions` on `input_mat`.

        Labels equal to 0 are excluded from both the log-loss sum and the
        token count.
        """
        def timestep(predictions, label, len_example, total_len_example):
            # total_len_example is supplied by scan as a non-sequence; unused.
            label_binary = T.gt(label[0:len_example - 1], 0)
            oov_count = T.shape(label_binary)[0] - T.sum(label_binary)
            log_loss = T.log(
                1. / predictions[T.arange(len_example - 1), label[0:len_example - 1]])
            return T.sum(log_loss * label_binary), oov_count

        result, _ = theano.scan(
            fn=timestep,
            sequences=[predictions, self.input_mat[:, 1:], self.for_how_long],
            non_sequences=T.sum(self.for_how_long))

        oov_count_total = T.sum(result[1])
        floatX = theano.config.floatX
        # NOTE(review): each example contributes len_example - 1 terms above
        # but len_example to the denominator -- confirm this is intended.
        total_log_loss = T.sum(result[0]).astype(floatX)
        token_count = (T.sum(self.for_how_long) - oov_count_total).astype(floatX)
        return T.exp(total_log_loss / token_count).astype(floatX)

    def create_lstm_ppl(self):
        """Symbolic perplexity of the lstm predictions."""
        return self._create_ppl(self.lstm_predictions)

    def create_final_ppl(self):
        """Symbolic perplexity of the final predictions."""
        return self._create_ppl(self.final_predictions)

    def create_ppl_function(self):
        """Compile `lstm_ppl_fun` and `final_ppl_fun`."""
        self.lstm_ppl_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.lstm_ppl,
            allow_input_downcast=True)
        self.final_ppl_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.final_ppl,
            allow_input_downcast=True)

    def __call__(self, x):
        # Fix: `self.pred_fun` is never created; use the final predictor.
        return self.final_pred_fun(x)
max_epochs = args.max_epochs batch_size = args.batch_size improvement_threshold = args.improvement_threshold validation_percent = args.validation_percent patience = args.patience checkpoint = args.checkpoint embedding_size = args.embedding_size hidden_size = args.hidden_size l2_coefficient = args.l2 id2char = pickle.load(open(vocab_file,'r')) char2id = vocab.load(vocab_file) P = Parameters() lang_model = model.build(P, character_count=len(char2id) + 1, embedding_size=embedding_size, hidden_size=hidden_size ) def cost(X, P): # batch_size x time eps = 1e-3 X = X.T # time x batch_size char_prob_dist = lang_model(X[:-1]) # time x batch_size x output_size char_prob_dist = (1 - 2 * eps) * char_prob_dist + eps label_prob = char_prob_dist[ T.arange(X.shape[0] - 1).dimshuffle(0, 'x'), T.arange(X.shape[1]).dimshuffle('x', 0), X[1:]
def make_train(image_size, word_size, first_hidden_size, proj_size, reg_lambda):
    """Build the image/triplet ranking model and compile its Theano functions.

    The cost is a margin ranking loss between the projected image and the
    encoded triplets, averaged over the negative triplets, plus `reg_lambda`
    times the sum of squares of every parameter in `P`.

    Returns:
        ``(P, train, valid, image_project_fun, test)`` where ``train`` and
        ``valid`` take the image vector, the correct triplet (three vectors)
        and the negative triplets (three matrices); ``train`` also takes the
        learning rate and applies RMSProp updates. ``image_project_fun`` maps
        an image vector to its projection and ``test`` scores three candidate
        matrices against an image.
    """
    # -- model -----------------------------------------------------------
    P = Parameters()
    project_image = image_project.build(P, image_size, proj_size)
    encode_batch, encode_single = triplet_encoding.build(
        P, word_size, first_hidden_size, proj_size)

    image_vector = T.vector()
    projected = project_image(image_vector)
    projected_row = projected.dimshuffle(('x', 0))

    # -- training inputs: [entity, relation, entity] ----------------------
    pos_left, pos_rel, pos_right = [T.vector(dtype='float32') for _ in range(3)]
    neg_left, neg_rel, neg_right = [T.matrix(dtype='float32') for _ in range(3)]

    # -- scores ------------------------------------------------------------
    pos_score = T.dot(projected, encode_single(pos_left, pos_rel, pos_right))
    neg_scores = T.batched_dot(
        repeat(projected_row, neg_left.shape[0], axis=0),
        encode_batch(neg_left, neg_rel, neg_right))

    # -- margin cost: negatives that violate the margin of 1 are penalised --
    zeros = T.zeros_like(neg_scores)
    margins = 1 - pos_score + neg_scores
    hinge = T.switch(T.gt(zeros, margins), zeros, margins)

    # -- regulariser -------------------------------------------------------
    params = P.values()
    l2 = sum(((p ** 2).sum() for p in params), T.sum(0))
    cost = T.sum(hinge) / T.shape(neg_left)[0] + reg_lambda * l2

    clipped_grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    lr = T.scalar(name='learning rate', dtype='float32')
    triplet_inputs = [image_vector,
                      pos_left, pos_rel, pos_right,
                      neg_left, neg_rel, neg_right]

    train = theano.function(
        inputs=triplet_inputs + [lr],
        outputs=cost,
        updates=updates.rmsprop(params, clipped_grads, learning_rate=lr),
        allow_input_downcast=True
    )

    # validation: same cost, no updates
    valid = theano.function(
        inputs=triplet_inputs,
        outputs=cost,
        allow_input_downcast=True
    )

    # visualisation: expose the projected image
    image_project_fun = theano.function(
        inputs=[image_vector],
        outputs=projected,
        allow_input_downcast=True
    )

    # testing: score every candidate triplet against the image
    cand_left, cand_rel, cand_right = [T.matrix(dtype='float32') for _ in range(3)]
    cand_scores = T.batched_dot(
        repeat(projected_row, cand_left.shape[0], axis=0),
        encode_batch(cand_left, cand_rel, cand_right))
    test = theano.function(
        inputs=[image_vector, cand_left, cand_rel, cand_right],
        outputs=cand_scores,
        allow_input_downcast=True
    )

    return P, train, valid, image_project_fun, test
# TODO: fix these magic numbers (especially the 800) def f(X): layer0 = X.reshape((X.shape[0], 1, 28, 28)) layer1 = _build_conv_pool(P, 1, layer0, 20, 1, 5, 2) layer2_= _build_conv_pool(P, 2, layer1, 50, 20, 5, 2) layer2 = layer2_.flatten(2) output = T.nnet.softmax(T.dot(layer2, P.W_hidden_output) + P.b_output) return output return f def cost(P, Y_hat, Y, l2 = 0): return (T.mean(T.nnet.categorical_crossentropy(Y_hat, Y)) + l2 * sum(T.mean(p**2) for p in P.values())) if __name__ == "__main__": import datasets x,y = datasets.mnist() x,y = x[0:1000],y[0:1000] P = Parameters() X = T.matrix('X') Y = T.ivector('Y') net = build(P, 784, 800, 10) Y_hat = net(X) f = theano.function(inputs = [X], outputs = Y_hat) J = cost(P, Y_hat, Y) grad = T.grad(J, wrt=P.values())
# Fragment: closing bracket of a list expression that starts above this chunk.
    ]


def weight_norm(u, norm=1.9356):
    """Rescale `u` so that its L2 norm taken over axis 0 is at most `norm`."""
    in_norm = T.sqrt(T.sum(T.sqr(u), axis=0))
    # ratio is ~1 where in_norm <= norm and norm / in_norm otherwise; the
    # 1e-8 keeps the division finite when the norm is zero.
    ratio = T.minimum(norm, in_norm) / (in_norm + 1e-8)
    return ratio * u


def normalise_weights(updates):
    """Apply `weight_norm` to the new value of every parameter named 'W...'.

    `updates` is an iterable of (parameter, new_value) pairs; pairs whose
    parameter name does not start with 'W' pass through unchanged.
    """
    return [(p, weight_norm(u) if p.name.startswith('W') else u)
            for p, u in updates]


if __name__ == "__main__":
    P = Parameters()
    extract, _ = model.build(P, "vrnn")

    X = T.tensor3('X')
    l = T.ivector('l')
    [Z_prior_mean, Z_prior_std,
     Z_mean, Z_std, X_mean, X_std] = extract(X, l)

    parameters = P.values()
    batch_cost = model.cost(X,
                            Z_prior_mean, Z_prior_std,
                            Z_mean, Z_std, X_mean, X_std, l)
    print "Calculating gradient..."
    print parameters
    # Axis 1 of X is used as the batch size; gradients are divided by it
    # and then passed through `clip`.
    batch_size = T.cast(X.shape[1], 'float32')

    gradients = T.grad(batch_cost, wrt=parameters)
    gradients = [g / batch_size for g in gradients]
    gradients = clip(5, parameters, gradients)