def run_experiment(constraint_adj=False):
    """Train an 8-3-8 one-hot encoder network and visualise its hidden units.

    :param constraint_adj: forwarded to build_network; presumably toggles an
        adjacency constraint on the hidden layer -- TODO confirm with
        build_network's definition.
    :return: the result of hinton(...) rendering the sigmoid hidden-layer
        activations on the 8 identity inputs.
    """
    X, output, cost, P = build_network(8, 3, constraint_adj)
    parameters = P.values()
    grads = T.grad(cost, wrt=parameters)
    # One adadelta step per call; returns the current cost.
    train = theano.function(
        inputs=[X],
        outputs=cost,
        updates=updates.adadelta(parameters, grads)
    )
    test = theano.function(
        inputs=[X],
        outputs=output,
    )
    # Training data: the 8 one-hot rows of an 8x8 identity matrix.
    data = np.eye(8, dtype=np.int8)
    # data = np.vstack((data,))
    print "Training..."
    for _ in xrange(100000):
        # Shuffle row order each epoch; contents are unchanged.
        np.random.shuffle(data)
        train(data)
    # Compiled probe of the hidden layer: sigmoid(X . W_input_hidden + b_hidden).
    hidden_activations = theano.function(
        inputs=[X],
        outputs=T.nnet.sigmoid(T.dot(X, P.W_input_hidden) + P.b_hidden)
    )
    #print_arr(test(np.eye(8,dtype=np.int32)))
    #print_arr(1/(1 + np.exp(-parameters[0].get_value())),1)
    return hinton(hidden_activations(np.eye(8, dtype=np.int8)))
def make_train(input_size,output_size,mem_size,mem_width,hidden_size=100): P = Parameters() # Build controller. ctrl is a network that takes an external and read input # and returns the output of the network and its hidden layer ctrl = controller.build(P,input_size,output_size,mem_size,mem_width,hidden_size) # Build model that predicts output sequence given input sequence predict = model.build(P,mem_size,mem_width,hidden_size,ctrl) input_seq = T.matrix('input_sequence') output_seq = T.matrix('output_sequence') [M,weights,output_seq_pred] = predict(input_seq) # Setup for adadelta updates cross_entropy = T.sum(T.nnet.binary_crossentropy(5e-6 + (1 - 2*5e-6)*output_seq_pred,output_seq),axis=1) params = P.values() l2 = T.sum(0) for p in params: l2 = l2 + (p ** 2).sum() cost = T.sum(cross_entropy) + 1e-3*l2 # clip gradients grads = [ T.clip(g,-100,100) for g in T.grad(cost,wrt=params) ] train = theano.function( inputs=[input_seq,output_seq], outputs=cost, updates=updates.adadelta(params,grads) ) return P,train
def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Compile the NTM training function (multi-hidden-layer controller).

    :param input_size: width of external input vectors.
    :param output_size: width of predicted output vectors.
    :param mem_size: number of memory slots.
    :param mem_width: width of each memory slot.
    :param hidden_sizes: list of controller hidden-layer sizes;
        defaults to [100].
    :return: (P, train) -- the Parameters store and a compiled
        train(input_seq, output_seq) performing one adadelta step and
        returning the scalar cost.
    """
    # Fix: the default was a mutable list literal ([100]); use the
    # None-sentinel idiom instead. Behaviour for all callers is unchanged.
    if hidden_sizes is None:
        hidden_sizes = [100]
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_sizes)
    # The model is sized by the last hidden layer of the controller.
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)
    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]
    # Squash predictions into (5e-6, 1 - 5e-6) so the log stays finite.
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq), axis=1)
    params = P.values()
    # L2 penalty over all parameters, weighted 1e-3 in the cost.
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cross_entropy) + 1e-3 * l2
    # Clip gradient elements into [-100, 100] for stability.
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]
    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        updates=updates.adadelta(params, grads)
    )
    return P, train
def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Compile the training function (variant without L2 regularisation).

    :param input_size: width of external input vectors.
    :param output_size: width of predicted output vectors.
    :param mem_size: number of memory slots.
    :param mem_width: width of each memory slot.
    :param hidden_sizes: list of controller hidden-layer sizes;
        defaults to [100].
    :return: (P, train) -- the Parameters store and a compiled
        train(input_seq, output_seq) performing one adadelta step and
        returning the scalar cost.
    """
    # Fix: mutable default argument ([100]) replaced with the None-sentinel
    # idiom; the effective default is unchanged.
    if hidden_sizes is None:
        hidden_sizes = [100]
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)
    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]
    # Squash predictions into (5e-6, 1 - 5e-6) so the log stays finite.
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq), axis=1)
    cost = T.sum(cross_entropy)  # + 1e-3 * l2
    params = P.values()
    # Clip gradient elements into [-100, 100] for stability.
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]
    # Reuse `cost` as the output rather than rebuilding T.sum(cross_entropy);
    # the two expressions are identical.
    train = theano.function(inputs=[input_seq, output_seq],
                            outputs=cost,
                            updates=updates.adadelta(params, grads))
    return P, train
def make_functions(inputs, outputs, params, grads, lr):
    """Compile (acc, update) for gradient accumulation then one update step.

    acc(*inputs)  accumulates gradients and increments a counter;
    update(lr)    applies an adadelta step on the averaged, norm-clipped
                  gradients and clears the accumulators.

    NOTE(review): `output_ans` and `ans_lbl` are not defined in this
    function -- they must be module-level symbols outside this view; verify.
    """
    shapes = [ p.get_value().shape for p in params ]
    # One float32 accumulator per parameter, plus a batch counter.
    acc_grads = [ theano.shared(np.zeros(s,dtype=np.float32)) for s in shapes ]
    count = theano.shared(np.float32(0))
    acc_update = [ (a,a+g) for a,g in zip(acc_grads,grads) ] + [ (count,count + 1.) ]
    # deltas = acc_grads
    # Average the accumulated gradients over the number of accumulations.
    deltas = [ ag / count for ag in acc_grads ]
    # Rescale any averaged gradient whose L2 norm exceeds 1 down to norm 1.
    grads_norms = [ T.sqrt(T.sum(g**2)) for g in deltas ]
    deltas = [ T.switch(T.gt(n,1.),1.*g/n,g) for n,g in zip(grads_norms,deltas) ]
    # param_update = [ (p, p - lr * g) for p,g in zip(params,deltas) ]
    param_update = updates.adadelta(params,deltas,learning_rate=lr) # ,learning_rate=lr,rho=np.float32(0.95)
    # Zero the accumulators; NOTE(review): count is reset with int 0 though
    # the shared is float32 -- relies on theano casting; confirm.
    clear_update = [ (a,np.zeros(s,dtype=np.float32)) for a,s in zip(acc_grads,shapes) ] + [ (count,0) ]
    acc = theano.function(
        inputs = inputs,
        outputs = [outputs,output_ans[ans_lbl]],
        updates = acc_update,
        on_unused_input='warn',
        # mode=theano.compile.MonitorMode(post_func=detect_nan)
    )
    update = theano.function(
        inputs=[lr],
        updates = param_update + clear_update,
        # Also report the L2 norm of each applied delta for monitoring.
        outputs = [ T.sqrt(T.sum(T.sqr(w))) for w in deltas ],
        on_unused_input='warn',
        # mode=theano.compile.MonitorMode(post_func=detect_nan)
    )
    return acc,update
def make_train(input_size, output_size, mem_size, mem_width, hidden_size=100):
    """Construct the adadelta-trained sequence model.

    Returns the parameter store P together with a compiled function
    train(input_sequence, output_sequence) that performs one adadelta
    step and returns the scalar cost.
    """
    P = Parameters()
    # The controller maps (external input, read vector) to (output, hidden).
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_size)
    # The full model runs the controller over a whole input sequence.
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    M, weights, output_seq_pred = predict(input_seq)

    # Keep predictions strictly inside (eps, 1 - eps) so log() is finite.
    eps = 5e-6
    squashed_pred = eps + (1 - 2 * eps) * output_seq_pred
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(squashed_pred, output_seq), axis=1)

    params = P.values()
    # L2 penalty accumulated over every parameter tensor.
    l2 = T.sum(0)
    for p in params:
        l2 += (p ** 2).sum()
    cost = T.sum(cross_entropy) + 1e-3 * l2

    # Clip every gradient element into [-100, 100] before the update.
    clipped_grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]
    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        updates=updates.adadelta(params, clipped_grads),
    )
    return P, train
def build_model(hidden_size, predict_only=False):
    """Build a 2-input / 2-class single-hidden-layer classifier.

    :param hidden_size: number of hidden sigmoid units.
    :param predict_only: currently unused except in commented-out update
        selection -- NOTE(review): confirm it is intentionally ignored.
    :return: (train, predict, hidden_p, params) compiled functions plus the
        list of shared parameter variables.
    """
    X = T.matrix('X')
    Y = T.ivector('Y')
    #* (0.001 * U.initial_weights(2,hidden_size) + np.array([[0,0,1,1],[1,1,0,0]])))
    W_input_hidden = U.create_shared(U.initial_weights(2, hidden_size))
    b_hidden = U.create_shared(U.initial_weights(hidden_size))
    W_hidden_predict = U.create_shared(U.initial_weights(hidden_size, 2))
    b_predict = U.create_shared(U.initial_weights(2))
    params = [W_input_hidden, b_hidden, W_hidden_predict, b_predict]
    hidden_lin = T.dot(X, W_input_hidden) + b_hidden
    hidden = T.nnet.sigmoid(hidden_lin)
    predict = T.nnet.softmax(T.dot(hidden, W_hidden_predict) + b_predict)
    # Mean NLL of the true class plus an adjacency penalty (weight 1e-3) on
    # the pre-sigmoid hidden activations.
    cost = -T.mean(T.log(
        predict[T.arange(Y.shape[0]), Y])) + 1e-3 * adjacency_constraint(
            hidden_lin)  # + 1e-4 * sum(T.sum(p**2) for p in params)
    accuracy = T.mean(T.eq(T.argmax(predict, axis=1), Y))
    grad = T.grad(cost, params)
    train = theano.function(
        inputs=[X, Y],
        #updates = updates.momentum(params,grad,0.9999,0.1) if not predict_only else None,
        #updates = updates.momentum(params,grad,0.999,0.0005),
        updates=updates.adadelta(params, grad),
        # Outputs include current weights and a boolean hidden-unit bitmap.
        outputs=[accuracy, W_input_hidden, b_hidden, (hidden > 0.5)])
    # NOTE: rebinds `predict` from the symbolic expression to a compiled
    # function returning the probability of class 0.
    predict = theano.function(inputs=[X], outputs=predict[:, 0])
    i = T.iscalar('i')
    # Probe a single hidden unit i over a batch.
    hidden_p = theano.function(inputs=[X, i], outputs=hidden[:, i])
    return train, predict, hidden_p, params
def turing_updates(cost, lr):
    """Build adadelta update rules for the module-level parameter store P.

    Adds an L2 penalty (weight 1e-3) over every parameter to `cost`, passes
    the gradients through updates.clip(5.), and returns adadelta updates
    with learning rate `lr`.
    """
    params = P.values()
    # L2 weight decay over all of P's parameters.
    weight_penalty = T.sum(0)
    for w in params:
        weight_penalty += (w ** 2).sum()
    regularised_cost = cost + 1e-3 * weight_penalty
    # Clip the gradient list via the updates module's clipper (threshold 5).
    raw_grads = T.grad(regularised_cost, wrt=params)
    grads = updates.clip(5.)(raw_grads)
    # Alternatives kept from the original for reference:
    # grads = [T.clip(g, -5, 5) for g in T.grad(all_cost, wrt=params)]
    # return updates.rmsprop(params, grads, learning_rate=lr)
    return updates.adadelta(params, grads, learning_rate=lr)
def make_train_functions():
    """Build (acc, update, test) for the stack-RNN sequence model.

    acc(X, Y) -- accumulate gradients for one sequence, return mean error.
    update()  -- one adadelta step on averaged, clipped gradients, then
                 clear the accumulators.
    test(X)   -- argmax predictions over the second half of the sequence.
    """
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}
    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )
    output = predict(X, aux=aux)
    # Negative log-likelihood of the targets; (128+1+Y) % (128+1) maps the
    # -1 marker onto index 128 while leaving 0..127 unchanged.
    error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))])
    # Score only the second half of the sequence (Python 2 int division).
    error = error[-(Y.shape[0] / 2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error), wrt=parameters)

    shapes = [p.get_value().shape for p in parameters]
    count = theano.shared(np.float32(0))
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]
    acc_update = [(a, a + g) for a, g in zip(acc_grads, gradients)] + \
                 [(count, count + np.float32(1))]
    # NOTE(review): count (float32) is reset with np.int32(0) -- relies on
    # theano's cast; confirm.
    acc_clear = [(a, np.float32(0) * a) for a in acc_grads] + \
                [(count, np.int32(0))]
    # BUG FIX: the original computed [g / count ...] and then immediately
    # overwrote it with [clip(g, 1) for g in acc_grads], discarding the
    # averaging entirely. Average first, then clip.
    avg_grads = [clip(g / count, 1) for g in acc_grads]

    acc = theano.function(
        inputs=[X, Y],
        outputs=T.mean(error),
        updates=acc_update,
    )
    update = theano.function(
        inputs=[],
        updates=updates.adadelta(parameters, avg_grads, learning_rate=1e-8) + acc_clear
    )
    test = theano.function(
        inputs=[X],
        outputs=T.argmax(output, axis=1)[-(X.shape[0] / 2):],
    )
    return acc, update, test
def make_train_functions():
    """Compile the accumulate / apply / test functions for the stack model.

    Returns (acc, update, test):
      * acc(X, Y) adds this sequence's gradients into shared accumulators
        and returns the mean per-symbol error;
      * update() performs one adadelta step using the accumulated gradients
        (averaged over the accumulation count and clipped) and resets them;
      * test(X) returns argmax predictions for the latter half of X.
    """
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}
    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )
    output = predict(X, aux=aux)
    # NLL of targets; the modulo maps the -1 marker onto class index 128.
    error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))])
    # Only the second half of the sequence contributes to the loss.
    error = error[-(Y.shape[0] / 2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error), wrt=parameters)

    shapes = [p.get_value().shape for p in parameters]
    count = theano.shared(np.float32(0))
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]
    acc_update = [ (a,a+g) for a,g in zip(acc_grads,gradients) ] +\
                 [ (count,count + np.float32(1)) ]
    # NOTE(review): float32 `count` reset with np.int32(0); relies on
    # theano's implicit cast -- confirm.
    acc_clear = [ (a,np.float32(0) * a) for a in acc_grads ] +\
                [ (count,np.int32(0)) ]
    # Fix: previously the averaged list was clobbered by a clip over the raw
    # accumulators, so the division by `count` never took effect. Clip the
    # averaged gradients instead.
    avg_grads = [clip(g / count, 1) for g in acc_grads]

    acc = theano.function(
        inputs=[X, Y],
        outputs=T.mean(error),
        updates=acc_update,
    )
    update = theano.function(
        inputs=[],
        updates=updates.adadelta(parameters, avg_grads, learning_rate=1e-8) + acc_clear)
    test = theano.function(
        inputs=[X],
        outputs=T.argmax(output, axis=1)[-(X.shape[0] / 2):],
    )
    return acc, update, test
def train_model(docs, wordvec_size, hidden_size, error_threshold, update_mu=1e-3, update_eps=0.95): X, parameters, hidden, hidden1_reproduction, input_reproduction, unrolled = build_network( wordvec_size, hidden_size) #hidden, hidden_rep, input_rep, unrlld = f(docs) error = build_error(X, hidden, hidden1_reproduction, input_reproduction) cost = error # + 1e-6*sum( T.sum(abs(p)) for p in parameters ) gradients = T.grad(cost, wrt=parameters) eps = T.dscalar('eps') mu = T.dscalar('mu') train = theano.function(inputs=[X, eps, mu], updates=updates.adadelta(parameters, gradients, mu, eps), outputs=error) error = 10 count = 0 for i in range(10): start_time = time.time() error = 0 for doc in docs: error += train(doc, update_mu, update_eps) if count % 1 == 0: print "iter=%d" % count, time.time() - start_time, error / len( docs) count += 1 f = theano.function( inputs=[X], outputs=[hidden, hidden1_reproduction, input_reproduction, unrolled]) print "Finish count=%d error=%f" % (count, error) return f
def make_functions(inputs, outputs, params, grads, lr):
    """Compile (acc, update): gradient accumulation and deferred update.

    acc(*inputs)  accumulates gradients into shared buffers and bumps a
                  counter; update(lr) applies one adadelta step on the
                  averaged, norm-clipped gradients and clears the buffers.

    NOTE(review): `output_ans` and `ans_lbl` are not defined here -- they
    must come from module scope outside this view; verify.
    """
    shapes = [p.get_value().shape for p in params]
    # One float32 accumulator per parameter plus a step counter.
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]
    count = theano.shared(np.float32(0))
    acc_update = [(a, a + g)
                  for a, g in zip(acc_grads, grads)] + [(count, count + 1.)]
    # deltas = acc_grads
    # Mean gradient over the accumulated steps.
    deltas = [ag / count for ag in acc_grads]
    # Rescale any delta whose L2 norm exceeds 1 down to unit norm.
    grads_norms = [T.sqrt(T.sum(g**2)) for g in deltas]
    deltas = [
        T.switch(T.gt(n, 1.), 1. * g / n, g)
        for n, g in zip(grads_norms, deltas)
    ]
    # param_update = [ (p, p - lr * g) for p,g in zip(params,deltas) ]
    param_update = updates.adadelta(
        params, deltas,
        learning_rate=lr)  # ,learning_rate=lr,rho=np.float32(0.95)
    # Reset accumulators; NOTE(review): float32 `count` reset with int 0 --
    # relies on theano casting; confirm.
    clear_update = [(a, np.zeros(s, dtype=np.float32))
                    for a, s in zip(acc_grads, shapes)] + [(count, 0)]
    acc = theano.function(
        inputs=inputs,
        outputs=[outputs, output_ans[ans_lbl]],
        updates=acc_update,
        on_unused_input='warn',
        # mode=theano.compile.MonitorMode(post_func=detect_nan)
    )
    update = theano.function(
        inputs=[lr],
        updates=param_update + clear_update,
        # Also report each applied delta's L2 norm for monitoring.
        outputs=[T.sqrt(T.sum(T.sqr(w))) for w in deltas],
        on_unused_input='warn',
        # mode=theano.compile.MonitorMode(post_func=detect_nan)
    )
    return acc, update
def build_model(hidden_size, predict_only=False):
    """Build the 2-in / 2-class sigmoid-hidden classifier and probes.

    :param hidden_size: number of hidden sigmoid units.
    :param predict_only: only referenced in commented-out update choices --
        NOTE(review): effectively unused; confirm.
    :return: (train, predict, hidden_p, params).
    """
    X = T.matrix('X')
    Y = T.ivector('Y')
    #* (0.001 * U.initial_weights(2,hidden_size) + np.array([[0,0,1,1],[1,1,0,0]])))
    W_input_hidden = U.create_shared(U.initial_weights(2, hidden_size))
    b_hidden = U.create_shared(U.initial_weights(hidden_size))
    W_hidden_predict = U.create_shared(U.initial_weights(hidden_size, 2))
    b_predict = U.create_shared(U.initial_weights(2))
    params = [W_input_hidden, b_hidden, W_hidden_predict, b_predict]
    hidden_lin = T.dot(X, W_input_hidden) + b_hidden
    hidden = T.nnet.sigmoid(hidden_lin)
    predict = T.nnet.softmax(T.dot(hidden, W_hidden_predict) + b_predict)
    # Mean NLL of the true class + adjacency penalty on pre-sigmoid units.
    cost = -T.mean(T.log(predict[T.arange(Y.shape[0]), Y])) + 1e-3*adjacency_constraint(hidden_lin)  # + 1e-4 * sum(T.sum(p**2) for p in params)
    accuracy = T.mean(T.eq(T.argmax(predict, axis=1), Y))
    grad = T.grad(cost, params)
    train = theano.function(
        inputs=[X, Y],
        #updates = updates.momentum(params,grad,0.9999,0.1) if not predict_only else None,
        #updates = updates.momentum(params,grad,0.999,0.0005),
        updates=updates.adadelta(params, grad),
        # Returns accuracy, current weights, and a boolean hidden bitmap.
        outputs=[accuracy, W_input_hidden, b_hidden, (hidden > 0.5)]
    )
    # NOTE: rebinds `predict` from symbolic expression to compiled function
    # returning P(class 0).
    predict = theano.function(
        inputs=[X],
        outputs=predict[:, 0]
    )
    i = T.iscalar('i')
    # Probe hidden unit i across a batch.
    hidden_p = theano.function(
        inputs=[X, i],
        outputs=hidden[:, i]
    )
    return train, predict, hidden_p, params
def run_experiment(constraint_adj=False): X, output, cost, P = build_network(8, 3, constraint_adj) parameters = P.values() grads = T.grad(cost, wrt=parameters) train = theano.function(inputs=[X], outputs=cost, updates=updates.adadelta(parameters, grads)) test = theano.function( inputs=[X], outputs=output, ) data = np.eye(8, dtype=np.int8) # data = np.vstack((data,)) print "Training..." for _ in xrange(100000): np.random.shuffle(data) train(data) hidden_activations = theano.function( inputs=[X], outputs=T.nnet.sigmoid(T.dot(X, P.W_input_hidden) + P.b_hidden)) #print_arr(test(np.eye(8,dtype=np.int32))) #print_arr(1/(1 + np.exp(-parameters[0].get_value())),1) return hinton(hidden_activations(np.eye(8, dtype=np.int8)))
# Build an 8-wide autoencoder with 64 hidden units and train it on shuffled
# identity rows until the reconstruction error falls below 1e-4.
X, parameters, hidden, hidden1_reproduction, input_reproduction, unrolled = build_network(8, 64)
# Probe function exposing all intermediate reconstructions.
f = theano.function(
    inputs=[X],
    outputs=[hidden, hidden1_reproduction, input_reproduction, unrolled]
)
error = build_error(X, hidden, hidden1_reproduction, input_reproduction)
cost = error  # + 1e-6*sum( T.sum(abs(p)) for p in parameters )
gradients = T.grad(cost, wrt=parameters)
# Adadelta hyper-parameters supplied at call time.
eps = T.dscalar('eps')
mu = T.dscalar('mu')
train = theano.function(
    inputs=[X, eps, mu],
    updates=updates.adadelta(parameters, gradients, mu, eps),
    outputs=error
)
#example = np.vstack((np.eye(8),np.eye(8)))
example = np.eye(8)
# NOTE: rebinds `error` from the symbolic expression to a number.
error = 10
lr = 0.0001  # NOTE(review): unused except in commented-out variants.
t = 0
while error > 0.0001:
    np.random.shuffle(example)
    #error = train(example,lr,min(1 - 3.0/(t+5),0.999))
    error = train(example, 1e-6, 0.95)
    #error = train(example,lr,0)
    print error
    t += 1
    # NOTE(review): these two statements are the tail of a function whose
    # `def` line lies outside this chunk; indentation assumed -- confirm.
    predict = T.nnet.softmax(T.dot(hidden, W_hidden_output) + b_output)
    return X, predict


def label_seq(string):
    """Convert a string into a CTC label sequence.

    Blanks (-1) are interleaved between character indices, with blanks at
    both ends: [-1, c0, -1, c1, -1, ...].
    """
    idxs = font.indexify(string)
    result = np.ones((len(idxs) * 2 + 1, ), dtype=np.int32) * -1
    result[np.arange(len(idxs)) * 2 + 1] = idxs
    print result
    return result


if __name__ == "__main__":
    P = Parameters()
    X = T.matrix('X')
    Y = T.ivector('Y')
    # NOTE: rebinds X to the input variable returned by build_model.
    X, predict = build_model(P, X, 10, 10, 10)
    cost = ctc.cost(predict, Y)
    params = P.values()
    grad = T.grad(cost, wrt=params)
    train = theano.function(inputs=[X, Y],
                            outputs=cost,
                            updates=updates.adadelta(params, grad))
    # Smoke test: ten adadelta steps on a reversed identity input.
    for _ in xrange(10):
        print train(
            np.eye(10, dtype=np.float32)[::-1], np.arange(10, dtype=np.int32))
    # NOTE(review): tail of a function defined before this chunk begins;
    # the one-level indent is assumed -- confirm against the full file.
    predict = T.nnet.softmax(T.dot(hidden, W_hidden_output) + b_output)
    return X, predict


def label_seq(string):
    """Turn a string into a CTC target: blanks (-1) between char indices."""
    idxs = font.indexify(string)
    result = np.ones((len(idxs) * 2 + 1,), dtype=np.int32) * -1
    # Characters occupy the odd positions; even positions stay blank.
    result[np.arange(len(idxs)) * 2 + 1] = idxs
    print result
    return result


if __name__ == "__main__":
    P = Parameters()
    X = T.matrix('X')
    Y = T.ivector('Y')
    # NOTE: X is rebound to the model's own input variable.
    X, predict = build_model(P, X, 10, 10, 10)
    cost = ctc.cost(predict, Y)
    params = P.values()
    grad = T.grad(cost, wrt=params)
    train = theano.function(
        inputs=[X, Y],
        outputs=cost,
        updates=updates.adadelta(params, grad)
    )
    # Smoke test: ten training steps on a reversed identity "image".
    for _ in xrange(10):
        print train(np.eye(10, dtype=np.float32)[::-1],
                    np.arange(10, dtype=np.int32))