def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Compile an RMSProp training step for the NTM.

    Builds the controller and the full NTM graph, defines a total
    binary-cross-entropy cost with L2 weight decay, clips gradients, and
    compiles the update function.

    Parameters
    ----------
    input_size : width of each input vector.
    output_size : width of each target vector.
    mem_size : number of memory slots.
    mem_width : width of each memory slot.
    hidden_sizes : list of controller hidden-layer sizes, default [100].
        A ``None`` default avoids the shared-mutable-default pitfall.

    Returns
    -------
    (P, train) : the Parameters container and a compiled Theano function
        ``train(input_seq, output_seq) -> cost``.
    """
    if hidden_sizes is None:
        hidden_sizes = [100]
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size,
                            mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    # Squash predictions into (eps, 1-eps) so the log() in the
    # cross-entropy stays finite.
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(
            5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq),
        axis=1)

    params = P.values()
    # L2 weight-decay term over every parameter tensor.
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cross_entropy) + 1e-4 * l2

    # Element-wise gradient clipping to [-10, 10] guards against
    # exploding gradients.
    grads = [T.clip(g, -10, 10) for g in T.grad(cost, wrt=params)]

    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        # updates=updates.adadelta(params, grads)
        updates=updates.rmsprop(params, grads, learning_rate=1e-5)
    )
    return P, train
def make_train(input_size, output_size, mem_size, mem_width, hidden_size=100):
    """Construct the NTM training graph and compile an adadelta step.

    Returns (P, train); train(input_seq, output_seq) applies one update
    and returns the summed cross-entropy plus the L2 penalty.
    """
    P = Parameters()
    # Controller: takes the external input and the memory read vector,
    # returns the network output along with its hidden layer.
    ctrl = controller.build(P, input_size, output_size, mem_size,
                            mem_width, hidden_size)
    # Sequence model built on top of the controller: predicts the output
    # sequence for a given input sequence.
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    M, weights, output_seq_pred = predict(input_seq)

    # Clamp predictions strictly inside (0, 1) to keep the log-loss finite.
    eps = 5e-6
    per_step_loss = T.sum(
        T.nnet.binary_crossentropy(eps + (1 - 2 * eps) * output_seq_pred,
                                   output_seq),
        axis=1)

    params = P.values()
    penalty = T.sum(0)
    for p in params:
        penalty = penalty + (p ** 2).sum()
    cost = T.sum(per_step_loss) + 1e-3 * penalty

    # Clip each gradient element into [-100, 100].
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(inputs=[input_seq, output_seq],
                            outputs=cost,
                            updates=updates.adadelta(params, grads))
    return P, train
def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Compile an adadelta training step for the NTM (no L2 penalty).

    Parameters
    ----------
    input_size, output_size : widths of input and target vectors.
    mem_size, mem_width : memory geometry.
    hidden_sizes : list of controller hidden-layer sizes, default [100].
        A ``None`` default avoids the shared-mutable-default pitfall.

    Returns
    -------
    (P, train) : parameter container and the compiled training function.
    """
    if hidden_sizes is None:
        hidden_sizes = [100]
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size,
                            mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    # Squash predictions into (eps, 1-eps) so the log() stays finite.
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq), axis=1)

    # NOTE: the L2 penalty is intentionally disabled in this variant.
    cost = T.sum(cross_entropy)
    params = P.values()
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    # Reuse the `cost` node directly rather than rebuilding the same
    # T.sum(cross_entropy) expression for the output.
    train = theano.function(inputs=[input_seq, output_seq],
                            outputs=cost,
                            updates=updates.adadelta(params, grads))
    return P, train
def make_train(input_size, output_size, mem_size, mem_width, hidden_size=100):
    """Build the NTM graph and compile an adadelta training function.

    Returns (P, train) where train(input_seq, output_seq) performs one
    update step and returns the summed cross-entropy plus L2 cost.
    """
    P = Parameters()
    # Controller: consumes the external input plus the memory read vector,
    # produces the network output and its hidden activations.
    ctrl = controller.build(P, input_size, output_size, mem_size,
                            mem_width, hidden_size)
    # End-to-end model mapping an input sequence to memory states, head
    # weightings and the predicted output sequence.
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    M, weights, output_seq_pred = predict(input_seq)

    # Keep predictions strictly inside (0, 1) so the log terms stay finite.
    squashed = 5e-6 + (1 - 2 * 5e-6) * output_seq_pred
    cross_entropy = T.sum(T.nnet.binary_crossentropy(squashed, output_seq),
                          axis=1)

    params = P.values()
    # L2 penalty accumulated over every parameter tensor.
    l2 = T.sum(0)
    for weight in params:
        l2 += (weight ** 2).sum()
    cost = T.sum(cross_entropy) + 1e-3 * l2

    # Element-wise gradient clipping guards against exploding gradients.
    clipped_grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    return P, theano.function(inputs=[input_seq, output_seq],
                              outputs=cost,
                              updates=updates.adadelta(params, clipped_grads))
def make_model(input_size=8, output_size=8, mem_size=128, mem_width=20,
               hidden_sizes=None):
    """Compile a Theano inference function for the NTM.

    Returns (P, test_fun) where test_fun(input_seq) -> [weights, output].
    `hidden_sizes` defaults to [100]; the ``None`` default avoids sharing
    one mutable list across calls.
    """
    if hidden_sizes is None:
        hidden_sizes = [100]
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size,
                            mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)
    input_seq = T.matrix('input_sequence')
    [M_curr, weights, output] = predict(input_seq)
    test_fun = theano.function(inputs=[input_seq], outputs=[weights, output])
    return P, test_fun
def make_model(input_size=8, output_size=8, mem_size=128, mem_width=20,
               hidden_sizes=None):
    """Compile the NTM forward pass.

    Returns (P, test_fun): the parameter container and a Theano function
    mapping an input sequence to [head weightings, predicted output].
    `hidden_sizes` defaults to [100]; the ``None`` default avoids the
    shared-mutable-default pitfall.
    """
    if hidden_sizes is None:
        hidden_sizes = [100]
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size,
                            mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)
    input_seq = T.matrix('input_sequence')
    [M_curr, weights, output] = predict(input_seq)
    test_fun = theano.function(
        inputs=[input_seq],
        outputs=[weights, output]
    )
    return P, test_fun
def build(P, input_size, mem_width, weighted_mem_width, output_size):
    """Wire up the NTM controller and model; return (turing_updates, predict).

    Parameters
    ----------
    P : Parameters container that collects all trainable weights.
    input_size : width of each external input vector.
    mem_width : width of each memory slot.
    weighted_mem_width : width of the (weighted) read vector fed to the
        controller.
    output_size : width of each output vector.
    """
    ctrl = controller.build(P, input_size, output_size, weighted_mem_width)
    predict = model.build(P, input_size, mem_width, weighted_mem_width, ctrl)

    def turing_updates(cost, lr):
        """RMSProp updates for `cost` with L2 decay and gradient clipping."""
        params = P.values()
        # L2 weight decay over all parameters.
        l2 = T.sum(0)
        for p in params:
            l2 = l2 + (p ** 2).sum()
        all_cost = cost + 1e-3 * l2
        # Element-wise clipping into [-100, 100].
        grads = [T.clip(g, -100, 100) for g in T.grad(all_cost, wrt=params)]
        return updates.rmsprop(params, grads, learning_rate=lr)

    # Removed a dead inner helper (`init_parameter`) that was never
    # called nor returned from this function.
    return turing_updates, predict
def build(P, mem_width, output_size):
    """Wire up the NTM controller and model; return (turing_updates, predict).

    Parameters
    ----------
    P : Parameters container that collects all trainable weights.
    mem_width : width of each memory slot (also used as controller input
        and read-vector width here).
    output_size : width of each output vector.
    """
    ctrl = controller.build(P, mem_width, output_size, mem_width)
    predict = model.build(P, mem_width, ctrl)

    def turing_updates(cost, lr):
        """SGD (momentum with mu=0) updates with L2 decay and clipping."""
        params = P.values()
        # L2 weight decay over all parameters.
        l2 = T.sum(0)
        for p in params:
            l2 = l2 + (p ** 2).sum()
        all_cost = cost + 1e-3 * l2
        # Clip gradients via updates.clip(5.) — presumably norm-based
        # clipping at 5.0; confirm against the updates module.
        clipper = updates.clip(5.)
        g = T.grad(all_cost, wrt=params)
        grads = clipper(g)
        return updates.momentum(params, grads, mu=0, learning_rate=lr)

    # Removed a dead inner helper (`init_parameter`) that was never
    # called nor returned from this function.
    return turing_updates, predict
def make_model(input_size=8, output_size=8, mem_size=128, mem_width=20,
               hidden_size=100):
    """Compile and return the NTM's forward-pass function.

    Returns (P, test_fun): the parameter container and a Theano function
    mapping an input sequence to [head weightings, predicted outputs].
    """
    P = Parameters()
    # Assemble the controller, then the full NTM on top of it.
    controller_net = controller.build(P, input_size, output_size,
                                      mem_size, mem_width, hidden_size)
    predict = model.build(P, mem_size, mem_width, hidden_size, controller_net)

    input_seq = T.matrix('input_sequence')
    M_curr, weights, output = predict(input_seq)

    return P, theano.function(inputs=[input_seq], outputs=[weights, output])
def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=None):
    """Compile an adadelta training step for the NTM.

    The optimized cost is the total cross-entropy over the whole
    sequence; the reported metric is the mean per-step cross-entropy
    over the second half of the sequence (presumably the "response"
    phase of a copy-style task — confirm against the caller).

    Parameters
    ----------
    hidden_sizes : list of controller hidden-layer sizes, default [100].
        A ``None`` default avoids the shared-mutable-default pitfall.

    Returns
    -------
    (P, train) : parameter container and the compiled training function.
    """
    if hidden_sizes is None:
        hidden_sizes = [100]
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size,
                            mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    # Squash predictions into (eps, 1-eps) so log() never sees 0 or 1.
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq), axis=1)

    cost = T.sum(cross_entropy)  # L2 penalty intentionally disabled here
    params = P.values()
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    # Use floor division: plain `/` is true division under Python 3 and
    # would yield a float symbolic scalar, which is invalid as a slice
    # index. `//` gives the same result for ints under Python 2.
    response_length = input_seq.shape[0] // 2
    train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=T.mean(cross_entropy[-response_length:]),
        updates=updates.adadelta(params, grads)
    )
    return P, train
def make_model(input_size=8, output_size=8, mem_size=128, mem_width=20,
               hidden_size=100):
    """Given the model hyper-parameters, compile the NTM's forward pass.

    Returns (P, test_fun): the parameter container and a Theano function
    that maps an input sequence to [head weightings, predicted output].
    """
    P = Parameters()
    # Build the controller, then the full model around it.
    ctrl = controller.build(P, input_size, output_size, mem_size,
                            mem_width, hidden_size)
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)

    seq = T.matrix('input_sequence')
    memory, head_weights, prediction = predict(seq)

    # Compile the forward pass into a callable Theano function.
    run = theano.function(inputs=[seq], outputs=[head_weights, prediction])
    return P, run
def __init__(self, input_size, output_size, mem_size, mem_width,
             hidden_sizes, num_heads, max_epochs, momentum,
             learning_rate, grad_clip, l2_norm):
    """Build the NTM graph and compile train/predict Theano functions.

    Parameters
    ----------
    input_size : width of each input vector.
    output_size : width of each target vector.
    mem_size : number of memory slots.
    mem_width : width of each memory slot.
    hidden_sizes : controller hidden-layer sizes; the last entry is the
        size fed into the model.
    num_heads : number of heads passed to model.build.
    max_epochs : stored only; not used inside this constructor.
    momentum : momentum coefficient passed to updates.rmsprop.
    learning_rate : learning rate passed to updates.rmsprop.
    grad_clip : (low, high) bounds for element-wise gradient clipping.
    l2_norm : coefficient of the L2 weight-decay term.
    """
    # Record hyper-parameters on the instance.
    self.input_size = input_size
    self.output_size = output_size
    self.mem_size = mem_size
    self.mem_width = mem_width
    self.hidden_sizes = hidden_sizes
    self.num_heads = num_heads
    self.max_epochs = max_epochs
    self.momentum = momentum
    self.learning_rate = learning_rate
    self.grad_clip = grad_clip
    self.l2_norm = l2_norm
    # Best costs seen so far (presumably updated by an external training
    # loop — not modified here).
    self.best_train_cost = np.inf
    self.best_valid_cost = np.inf
    #self.train = None
    #self.cost = None
    self.train_his = []
    P = Parameters()
    # Controller network for the NTM.
    ctrl = controller.build(
        P,
        self.input_size,
        self.output_size,
        self.mem_size,
        self.mem_width,
        self.hidden_sizes)
    # Full model: predicts memory state, head weightings and outputs for
    # an input sequence.
    predict = model.build(
        P,
        self.mem_size,
        self.mem_width,
        self.hidden_sizes[-1],
        ctrl,
        self.num_heads)
    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    [M_curr, weights, output] = predict(input_seq)
    # output_seq_pred = seqs[-1]
    # Squash predictions into (eps, 1-eps) so the log() in the
    # cross-entropy stays finite.
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(5e-6 + (1 - 2*5e-6)*output, output_seq),
        axis=1)
    self.params = P.values()
    # L2 penalty accumulated over all parameters.
    l2 = T.sum(0)
    for p in self.params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cross_entropy) + self.l2_norm * l2
    # cost = T.sum(cross_entropy) + 1e-3*l2
    # Element-wise gradient clipping into [grad_clip[0], grad_clip[1]].
    grads = [
        T.clip(g, grad_clip[0], grad_clip[1])
        for g in T.grad(cost, wrt=self.params)
    ]
    # grads = [ T.clip(g,-100,100) for g in T.grad(cost,wrt=params) ]
    # grads = [ T.clip(g,1e-9, 0.2) for g in T.grad(cost,wrt=params) ]
    # One rmsprop update step; returns the (pre-update) cost.
    self.train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        # updates=updates.adadelta(params,grads)
        updates=updates.rmsprop(self.params, grads,
                                momentum=self.momentum,
                                learning_rate=self.learning_rate)
    )
    # Cost evaluation without any parameter updates.
    self.predict_cost = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost
    )
    # Forward pass: head weightings and predicted outputs.
    self.predict = theano.function(
        inputs=[input_seq],
        outputs=[weights, output]
    )
def __init__(self, input_size, output_size, mem_size, mem_width,
             hidden_sizes, num_heads, max_epochs, momentum, learning_rate,
             grad_clip, l2_norm):
    """Set up the NTM symbolic graph and compile train/predict functions."""
    # Stash the hyper-parameters on the instance.
    self.input_size = input_size
    self.output_size = output_size
    self.mem_size = mem_size
    self.mem_width = mem_width
    self.hidden_sizes = hidden_sizes
    self.num_heads = num_heads
    self.max_epochs = max_epochs
    self.momentum = momentum
    self.learning_rate = learning_rate
    self.grad_clip = grad_clip
    self.l2_norm = l2_norm

    # Best costs observed so far.
    self.best_train_cost = np.inf
    self.best_valid_cost = np.inf
    self.train_his = []

    P = Parameters()
    ctrl = controller.build(P, self.input_size, self.output_size,
                            self.mem_size, self.mem_width,
                            self.hidden_sizes)
    predict = model.build(P, self.mem_size, self.mem_width,
                          self.hidden_sizes[-1], ctrl, self.num_heads)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    M_curr, weights, output = predict(input_seq)

    # Keep predictions strictly inside (0, 1) for a finite log-loss.
    eps = 5e-6
    squashed = eps + (1 - 2 * eps) * output
    cross_entropy = T.sum(T.nnet.binary_crossentropy(squashed, output_seq),
                          axis=1)

    self.params = P.values()
    # Accumulate the L2 penalty over every parameter tensor.
    weight_norm = T.sum(0)
    for p in self.params:
        weight_norm = weight_norm + (p ** 2).sum()
    cost = T.sum(cross_entropy) + self.l2_norm * weight_norm

    # Clip each gradient element into the configured interval.
    lo, hi = grad_clip[0], grad_clip[1]
    grads = [T.clip(g, lo, hi) for g in T.grad(cost, wrt=self.params)]

    # One rmsprop step per call; returns the (pre-update) cost.
    self.train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        updates=updates.rmsprop(self.params, grads,
                                momentum=self.momentum,
                                learning_rate=self.learning_rate))
    # Cost evaluation with no parameter updates.
    self.predict_cost = theano.function(inputs=[input_seq, output_seq],
                                        outputs=cost)
    # Forward pass: head weightings and predicted outputs.
    self.predict = theano.function(inputs=[input_seq],
                                   outputs=[weights, output])