def create_training_function(self):
    """Compile the three Theano training functions for this model.

    Every compiled function takes ``(input_mat, for_how_long)`` and returns
    the cost it is driven by:

    * ``lstm_update_fun``   -- SGD on ``lstm_cost`` using ``lstm_lr``.
    * ``turing_update_fun`` -- the updates produced by ``self.turing_updates``
      for ``final_cost`` using ``turing_lr``; compiled under ``NanGuardMode``
      with NaN, Inf and big-value detection all set to raise.
    * ``all_update_fun``    -- SGD updates on ``final_cost`` (``all_lr``,
      ``part=True``) with the ``turing_updates`` pairs written on top, so a
      turing entry wins when both target the same variable.
    """
    # --- LSTM-only training -------------------------------------------------
    lstm_step, _, _, _, _ = create_optimization_updates(
        self.lstm_cost, self.params, method="SGD", lr=self.lstm_lr)
    self.lstm_update_fun = theano.function(
        inputs=[self.input_mat, self.for_how_long],
        outputs=self.lstm_cost,
        updates=lstm_step,
        allow_input_downcast=True)

    # --- turing-only training, with numerical guards ------------------------
    turing_step = self.turing_updates(self.final_cost, lr=self.turing_lr)
    self.turing_update_fun = theano.function(
        inputs=[self.input_mat, self.for_how_long],
        outputs=self.final_cost,
        updates=turing_step,
        mode=NanGuardMode(nan_is_error=True,
                          inf_is_error=True,
                          big_is_error=True),
        allow_input_downcast=True)

    # --- joint training -----------------------------------------------------
    joint_step, _, _, _, _ = create_optimization_updates(
        self.final_cost, self.params, method="SGD", lr=self.all_lr, part=True)
    turing_pairs = self.turing_updates(self.final_cost, lr=self.all_lr)
    # Write each (variable, new_value) pair into the SGD update mapping.
    for entry in turing_pairs:
        joint_step[entry[0]] = entry[1]
    self.all_update_fun = theano.function(
        inputs=[self.input_mat, self.for_how_long],
        outputs=self.final_cost,
        updates=joint_step,
        allow_input_downcast=True)
def _initialize_update_function(self):
    """Build the symbolic training graph and compile ``self.update``.

    The graph runs the time model over the inputs, feeds its output (together
    with the inputs and targets) to the note model, turns the note model's
    output into a prediction, and computes the loss against the targets.
    ``self.update(input, output)`` returns that loss and applies the
    optimizer updates for ``self.params``.
    """
    def time_step(step_in, *previous_hidden_state):
        return self.time_model.forward(step_in, prev_hiddens=previous_hidden_state)

    def note_step(step_in, *previous_hidden_state):
        return self.note_model.forward(step_in, prev_hiddens=previous_hidden_state)

    # Inputs drop the final step along axis 1; targets drop the first one,
    # so each input step is paired with the following output step.
    net_in = T.btensor4()
    net_in_trimmed = net_in[:, :-1]
    net_out = T.btensor4()
    net_out_trimmed = net_out[:, 1:]

    # Time model pass.
    time_in = self.get_time_model_input(net_in_trimmed)
    time_info = self.get_outputs_info(time_in, self.time_model.layers)
    time_out = self.get_output(time_step, time_in, time_info)

    # Note model pass, conditioned on the time model output.
    note_in = self.get_note_model_input(net_in_trimmed, net_out_trimmed, time_out)
    note_info = self.get_outputs_info(note_in, self.note_model.layers)
    note_out = self.get_output(note_step, note_in, note_info)

    predicted = self.get_prediction(net_in_trimmed, note_out)
    loss = self.get_loss(net_out_trimmed, predicted)

    step_updates, _, _, _, _ = create_optimization_updates(loss, self.params)
    self.update = theano.function(
        inputs=[net_in, net_out],
        outputs=loss,
        updates=step_updates,
        allow_input_downcast=True)
def create_training_function(self):
    """Compile ``self.update_fun``, an adadelta training step on ``self.cost``.

    The compiled function takes ``(input_mat, for_how_long)``, returns the
    cost, and applies the optimizer updates to ``self.params``.
    """
    optimizer_out = create_optimization_updates(
        self.cost, self.params, method="adadelta")
    # Only the update mapping is needed; the other four results are unused.
    step_updates, _, _, _, _ = optimizer_out
    self.update_fun = theano.function(
        inputs=[self.input_mat, self.for_how_long],
        outputs=self.cost,
        updates=step_updates,
        allow_input_downcast=True)
def create_training_function(self):
    """Compile ``self.update_fun``, an adadelta training step on ``self.cost``.

    The compiled function takes ``(gfs, pm25in, pm25target, steps)``,
    returns the cost, and applies the optimizer updates to ``self.params``.
    """
    # This is the gradient-descent step: build the adadelta update rules.
    step_updates, _, _, _, _ = create_optimization_updates(
        self.cost, self.params, method="adadelta")
    fn_inputs = [self.gfs, self.pm25in, self.pm25target, self.steps]
    self.update_fun = theano.function(
        inputs=fn_inputs,
        outputs=self.cost,
        updates=step_updates,
        allow_input_downcast=True)
def create_training_function(self):
    """Compile ``self.update_fun``, an adagrad training step on ``self.cost``.

    The optimizer is built with a learning rate of 0.01. The compiled
    function is named ``'update_fun'``, has profiling disabled, takes
    ``(x, target0, target1, steps)`` and returns the cost.
    """
    # This is the gradient-descent step: build the adagrad update rules.
    step_updates, _, _, _, _ = create_optimization_updates(
        self.cost, self.params, lr=0.01, method="adagrad")
    fn_inputs = [self.x, self.target0, self.target1, self.steps]
    self.update_fun = theano.function(
        inputs=fn_inputs,
        outputs=self.cost,
        updates=step_updates,
        name='update_fun',
        profile=False,
        allow_input_downcast=True)
def _generate_train_model_function(self):
    """Create the factor matrices and compile ``train_sgd`` / ``train_ada``.

    Symbolic inputs are five long-integer index vectors: ``u`` (users) and
    the item vectors ``i``, ``ni``, ``j``, ``nj``. For each item vector the
    score is ``W[u] . H[item] + B[item]``. The objective sums
    ``log(sigmoid(x_ui - x_uni)) + log(sigmoid(x_uj - x_unj))`` minus an L2
    penalty (weight ``self._lambda``) on every row and bias that was touched.

    Side effects: sets ``self.W``, ``self.H``, ``self.B`` (float32 shared
    variables), ``self.train_sgd`` and ``self.train_ada``.

    Returns:
        True, always.
    """
    u = T.lvector('u')
    i = T.lvector('i')
    ni = T.lvector('ni')
    j = T.lvector('j')
    nj = T.lvector('nj')

    # W and H are drawn in this order so the global numpy RNG is consumed
    # exactly as before.
    w_init = numpy.random.random((self._n_users, self._rank)).astype('float32')
    self.W = theano.shared(w_init, name='W')
    h_init = numpy.random.random((self._n_items, self._rank)).astype('float32')
    self.H = theano.shared(h_init, name='H')
    b_init = numpy.zeros(self._n_items).astype('float32')
    self.B = theano.shared(b_init, name='B')

    def score(items):
        # Row-wise dot product of W[u] and H[items], plus the item bias.
        return T.dot(self.W[u], self.H[items].T).diagonal() + self.B[items]

    def row_sq_norm(rows):
        return (rows ** 2).sum(axis=1)

    x_ui = score(i)
    x_uni = score(ni)
    x_uj = score(j)
    x_unj = score(nj)

    obj = (T.log(T.nnet.sigmoid(x_ui - x_uni))
           + T.log(T.nnet.sigmoid(x_uj - x_unj)))

    bias_penalty = (self.B[i] ** 2 + self.B[j] ** 2
                    + self.B[ni] ** 2 + self.B[nj] ** 2)
    l2 = (row_sq_norm(self.W[u])
          + row_sq_norm(self.H[i])
          + row_sq_norm(self.H[j])
          + row_sq_norm(self.H[ni])
          + row_sq_norm(self.H[nj])
          + bias_penalty)

    cost = - T.sum(obj - self._lambda * l2)

    index_inputs = [u, i, ni, j, nj]
    parameters = [self.W, self.H, self.B]

    # Plain SGD: one gradient per parameter, each scaled by the learning rate.
    sgd_updates = [
        (param, param - self._learning_rate * T.grad(cost=cost, wrt=param))
        for param in parameters
    ]
    self.train_sgd = theano.function(
        inputs=index_inputs, outputs=cost, updates=sgd_updates)

    # Adadelta alternative over the same cost and parameters.
    ada_updates, _, _, _, _ = theano_lstm.create_optimization_updates(
        cost, [self.W, self.H, self.B], method="adadelta")
    self.train_ada = theano.function(
        inputs=index_inputs,
        outputs=cost,
        updates=ada_updates,
        on_unused_input='warn')

    return True
def lstmTrain(examples,labels,input_size,num_iterations,steps,saveto=""):
    """Build a stacked-LSTM model with a softmax output layer and train it.

    The network is ``StackedCells(input_size, layers=[20, nodes])`` followed
    by a ``Layer(nodes, nodes)`` whose activation is ``softmax(x)[0]``, where
    ``nodes = len(labels)``. Each (example, label) pair is fed to a compiled
    update function once per iteration and the returned cost is printed.

    Args:
        examples: sequence of training inputs; each item is passed as the
            ``input_vec`` matrix. Must have the same length as ``labels``.
        labels: sequence of targets, one per example; each item is passed as
            the int64 ``target`` vector. Its length also sets ``nodes``.
        input_size: input width given to ``StackedCells``.
        num_iterations: number of passes over ``zip(examples, labels)``.
        steps: ``n_steps`` handed to ``theano.scan``.
        saveto: if non-empty, ``np.savez(saveto, model.params)`` is called
            after training.

    NOTE(review): the source this was recovered from had lost its line
    structure; the nesting of the statements after the inner training loop
    is a best-effort reconstruction -- confirm against the original file.
    """
    print examples,labels
    # NOTE(review): the comment below describes the disabled sample dataset
    # inside the string that follows, not the data actually passed in.
    # Make a dataset where the network should learn whether the number 1 has been seen yet in the first column of
    # the input sequence. This probably isn't really a good example use case for an LSTM, but it's simple.
    '''rng = np.random.RandomState(123456789) input_size = 2 input_length = 3 sample_size = 500 num_iterations = 1 examples = rng.choice([0,1], (1, input_length,2)).astype(theano.config.floatX) #labels = np.array([[1 if np.sum(np.abs(x[:y + 1])) > 5 else 0 for y in range(len(x))] # for x in examples], # dtype=theano.config.floatX) labels = np.array([[[1,0,1]]], dtype=theano.config.floatX)'''
    # These two values are assigned but not read anywhere in the visible code.
    hidden_layer_size = 10
    num_hidden_layers = 2
    # The number of output units is tied to the number of label entries.
    nodes=len(labels)
    assert len(labels)==len(examples)
    model = StackedCells(input_size, layers=[20,nodes], activation=T.tanh, celltype=LSTM)
    # Make the connections from the input to the first layer have linear activations.
    model.layers[0].in_gate2.activation = lambda x: x
    # Add an output layer to predict the labels for each time step.
    output_layer = Layer(nodes, nodes,lambda x: T.nnet.softmax(x)[0])
    model.layers.append(output_layer)
    # Earlier output-layer experiments, left disabled:
    #model.layers.append(Layer(3, 3, lambda x: T.nnet.softmax(x)[0]))
    #tensor.nnet.softmax(x)
    #pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])
    #softmax_layer = Layer(3, 3, T.nnet.sigmoid)
    #softmax_layer.activation = lambda x: T.nnet.softmax(x)
    #model.layers.append(softmax_layer)
    #pred = T.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])

    def step(x, *prev_hiddens):
        # One scan step: advance every layer given the previous outputs.
        activations = model.forward(x, prev_hiddens=prev_hiddens)
        return activations

    input_vec = T.matrix('input_vec')
    #input_mat=np.zeros((3,2))
    #input_mat=input_vec.dimshuffle((0,'x',1))
    #input_mat = input_vec.dimshuffle((0,'x')).eval({input_vec:examples[0]})
    #print input_mat
    # Scan over the rows of input_vec. The recurrent layers start from their
    # initial hidden state; the output layer's slot is seeded with its
    # bias_matrix.
    result, _ = theano.scan(fn=step,
                            sequences=[input_vec],
                            #outputs_info=([dict(initial=input_vec, taps=[-1])] + [dict(initial=layer.initial_hidden_state, taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]),
                            outputs_info=([dict(initial=hidden_layer.initial_hidden_state) for hidden_layer in model.layers[:-1]]
                                          +[dict(initial=model.layers[-1].bias_matrix)]),
                                          #[dict(initial=T.zeros_like(model.layers[-1].bias_matrix), taps=[-1])]),
                            n_steps=steps)
    #print result[0].eval({input_vec:examples[0]})
    #print model.layers[-1].eval({input_vec:examples[0]})
    #print result[-1].eval({input_vec:examples[0]})
    #print result[-1].T[0].eval({input_vec:examples[0]})
    #target = T.vector('target')
    target=T.vector('target',dtype='int64')
    # The prediction is the scan output of the last (softmax) layer.
    prediction = result[-1]#.T[1]#.eval({examples:rng.choice([0,1], (1, input_length,2)).astype(theano.config.floatX),input_mat:np.zeros((3,2))})
    #cost = T.nnet.binary_crossentropy(prediction, target).mean()
    #pred = T.nnet.softmax(prediction)
    #print 'predict'
    #print pred.eval({input_vec:examples[0]})
    # Mean negative log of the selected predictions; 1e-8 avoids log(0).
    # NOTE(review): prediction[target] indexes the leading axis of the scan
    # output with the label values -- presumably the class axis was intended;
    # verify.
    cost=-T.log(prediction[target] + 1e-8).mean()
    updates, _, _, _, _ = create_optimization_updates(cost, model.params)
    update_func = theano.function([input_vec, target], cost,
                                  updates=updates, allow_input_downcast=True,on_unused_input='warn')
    # Compiled but not called in the visible part of this function.
    predict_func = theano.function([input_vec], prediction, allow_input_downcast=True,on_unused_input='warn')
    for cur_iter in range(num_iterations):
        for i, (example, label) in enumerate(zip(examples, labels)):
            #print i,example,label
            c = update_func(example, label)
            print "cost",c
            #create_optimization_updates(cost, model.params)
            #if i % 100 == 0:
            #    print "."#, end
        #print()
    if saveto:
        # NOTE(review): this stores the parameter objects themselves, not
        # their numeric values -- confirm that is what the loader expects.
        np.savez(saveto, model.params)
    # NOTE(review): the triple-quoted block below is cut off in this view; its
    # closing quotes are not visible, so it is reproduced exactly as found.
    '''test_cases = [np.array([[-1,1], [1,2],[0,0], [1,3], [2,-2]], dtype=theano.config.floatX)]
def main():
    """Train a small stacked LSTM on a toy labelling task and print predictions.

    Each example is a sequence drawn from ``{-2, -1, 0, 1, 2}``. The label at
    position ``y`` is 1 when the sum of absolute values of the sequence up to
    and including ``y`` exceeds 5, and 0 otherwise. The model is two LSTM
    layers of 10 units (the first input gate made linear) followed by a
    single sigmoid unit, trained with binary cross-entropy.

    After training, the per-step predictions for a fixed set of test
    sequences are printed next to their inputs.
    """
    # This probably isn't really a good example use case for an LSTM, but
    # it's simple.
    rng = np.random.RandomState(123456789)
    input_size = 1
    input_length = 2
    sample_size = 1
    num_iterations = 3
    examples = rng.choice(
        [-2, -1, 0, 1, 2], (sample_size, input_length)
    ).astype(theano.config.floatX)
    labels = np.array(
        [[1 if np.sum(np.abs(x[:y + 1])) > 5 else 0 for y in range(len(x))]
         for x in examples],
        dtype=theano.config.floatX)

    hidden_layer_size = 10
    num_hidden_layers = 2
    model = StackedCells(input_size,
                         layers=[hidden_layer_size] * num_hidden_layers,
                         activation=T.tanh,
                         celltype=LSTM)
    # Make the connections from the input to the first layer have linear
    # activations.
    model.layers[0].in_gate2.activation = lambda x: x
    # Add an output layer to predict the labels for each time step.
    output_layer = Layer(hidden_layer_size, 1, T.nnet.sigmoid)
    model.layers.append(output_layer)

    def step(x, *prev_hiddens):
        # One scan step: advance every layer given the previous outputs.
        activations = model.forward(x, prev_hiddens=prev_hiddens)
        return activations

    input_vec = T.vector('input_vec')
    # Turn the (time,) vector into a (time, 1) matrix: one feature per step.
    input_mat = input_vec.dimshuffle((0, 'x'))
    # The recurrent layers start from their initial hidden state; the output
    # layer's slot starts from zeros shaped like its bias_matrix.
    result, _ = theano.scan(
        fn=step,
        sequences=[input_mat],
        outputs_info=(
            [dict(initial=hidden_layer.initial_hidden_state, taps=[-1])
             for hidden_layer in model.layers[:-1]]
            + [dict(initial=T.zeros_like(model.layers[-1].bias_matrix),
                    taps=[-1])]))

    # Debug output: the second-to-last scan output, evaluated on the first
    # example. Written as a call with one argument so it parses on Python 3
    # and prints the same text on Python 2.
    print(result[-2].eval({input_vec: examples[0]}))

    target = T.vector('target')
    prediction = result[-1].T[0]
    cost = T.nnet.binary_crossentropy(prediction, target).mean()
    updates, _, _, _, _ = create_optimization_updates(cost, model.params)
    update_func = theano.function([input_vec, target], cost,
                                  updates=updates,
                                  allow_input_downcast=True)
    predict_func = theano.function([input_vec], prediction,
                                   allow_input_downcast=True)

    for _ in range(num_iterations):
        for example, label in zip(examples, labels):
            update_func(example, label)
        print()

    test_cases = [
        np.array([-1, 1, 0, 1, -2, 0, 1, 0, 2, 0], dtype=theano.config.floatX),
        np.array([2, 2, 2, 0, 0, 0], dtype=theano.config.floatX),
        np.array([-2, -2, -2, 0, 0, 0], dtype=theano.config.floatX),
        np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0],
                 dtype=theano.config.floatX),
        np.array([2, 0, 0, 0, 2, 0, 0, 0, 0, -2, 0, 0, 0, 0, 0],
                 dtype=theano.config.floatX),
        np.array([2, 2, 2, 0, 0, 0, 2, 2, 2, 0], dtype=theano.config.floatX),
    ]
    for example in test_cases:
        print("input", "output")
        for x, pred in zip(example, predict_func(example)):
            print(x, "{:.3f}".format(pred))
        print()
def _generate_train_model_function(self):
    """Build the BPR training functions in Theano.

    Direct port of the objective from the BPR paper. The user-item matrix U
    is factorised as U = W.H^T (W: user factors, H: item factors) so that,
    for every user ``u``, the score W[u,:].H[i,:]^T of a positive item ``i``
    (one the user has watched) exceeds the score W[u,:].H[j,:]^T of a
    negative item ``j`` (one the user has not watched) by as much as
    possible. Item biases ``B`` are added to both scores.

    Side effects: sets ``self.W``, ``self.H``, ``self.B`` (float32 shared
    variables) and compiles ``self.train_sgd`` and ``self.train_ada``, both
    taking ``(u, i, j)`` and returning the cost.
    """
    u = T.lvector('u')
    i = T.lvector('i')
    j = T.lvector('j')

    # W is drawn before H so the global numpy RNG is consumed as before.
    w_init = numpy.random.random((self._n_users, self._rank)).astype('float32')
    self.W = theano.shared(w_init, name='W')
    h_init = numpy.random.random((self._n_items, self._rank)).astype('float32')
    self.H = theano.shared(h_init, name='H')
    self.B = theano.shared(
        numpy.zeros(self._n_items).astype('float32'), name='B')

    def score(items):
        # Row-wise dot product of W[u] and H[items], plus the item bias.
        return T.dot(self.W[u], self.H[items].T).diagonal() + self.B[items]

    x_ui = score(i)
    x_uj = score(j)
    x_uij = x_ui - x_uj

    # Log-likelihood of the pairwise ranking minus the L2 penalties, each
    # with its own regularisation weight.
    ranking_term = T.log(T.nnet.sigmoid(x_uij))
    user_penalty = self._lambda_u * (self.W[u] ** 2).sum(axis=1)
    pos_penalty = self._lambda_i * (self.H[i] ** 2).sum(axis=1)
    neg_penalty = self._lambda_j * (self.H[j] ** 2).sum(axis=1)
    bias_penalty = self._lambda_bias * (self.B[i] ** 2 + self.B[j] ** 2)
    obj_uij = T.sum(
        ranking_term - user_penalty - pos_penalty - neg_penalty - bias_penalty)
    cost = -obj_uij

    index_inputs = [u, i, j]
    parameters = [self.W, self.H, self.B]

    # Plain SGD: one gradient per parameter, each scaled by the learning rate.
    sgd_updates = [
        (param, param - self._learning_rate * T.grad(cost=cost, wrt=param))
        for param in parameters
    ]
    self.train_sgd = theano.function(
        inputs=index_inputs, outputs=cost, updates=sgd_updates)

    # Adadelta alternative over the same cost and parameters.
    ada_updates, _, _, _, _ = theano_lstm.create_optimization_updates(
        cost, [self.W, self.H, self.B], method="adadelta")
    self.train_ada = theano.function(
        inputs=index_inputs, outputs=cost, updates=ada_updates)
def setup_train(self):
    """Build the training graph and compile the two training-time functions.

    The graph makes two recurrent passes: the time model scans along the time
    axis for every (batch, note) pair, then the pitch model scans along the
    note axis for every (batch, time) pair, consuming the time model's last
    layer together with the previously chosen note.

    Sets ``input_mat``, ``output_mat``, ``epsilon``, ``time_thoughts``,
    ``note_thoughts``, ``cost``, ``update_fun`` (adadelta step returning the
    cost) and ``update_thought_fun`` (no updates; returns all scan outputs of
    both passes followed by the cost).
    """
    # (batch, time, notes, input_data), input_data as in the architecture.
    self.input_mat = T.btensor4()
    # (batch, time, notes, onOrArtic), where index 0 is "on" and 1 is "artic".
    self.output_mat = T.btensor4()

    self.epsilon = np.spacing(np.float32(1.0))

    def make_step(model, layer_sizes):
        """Return a scan step function that advances `model` once."""
        def step(in_data, *other):
            other = list(other)
            if self.dropout:
                # With dropout the trailing arguments are the masks, one
                # per entry of layer_sizes; everything before is state.
                boundary = -len(layer_sizes)
                masks = [None] + other[boundary:]
            else:
                boundary = len(other)
                masks = []
            return model.forward(in_data,
                                 prev_hiddens=other[:boundary],
                                 dropout=masks)
        return step

    def make_masks(n_parallel, layer_sizes):
        """Dropout masks for every layer, or no masks when dropout is off."""
        if self.dropout > 0:
            mask_shapes = [(n_parallel, size) for size in layer_sizes]
            return theano_lstm.MultiDropout(mask_shapes, self.dropout)
        return []

    step_time = make_step(self.time_model, self.t_layer_sizes)
    step_note = make_step(self.pitch_model, self.p_layer_sizes)

    # ---- time pass --------------------------------------------------------
    # An output is generated for each input, so the last input step is never
    # needed. The sentinel start value is assumed to be present already.
    # TEMP CHANGE: NO SENTINEL
    input_slice = self.input_mat[:, 0:-1]
    n_batch, n_time, n_note, n_ipn = input_slice.shape

    # Rearranged to (time, batch/note, input_per_note).
    time_major = input_slice.transpose((1, 0, 2, 3))
    time_inputs = time_major.reshape((n_time, n_batch * n_note, n_ipn))
    num_time_parallel = time_inputs.shape[1]

    time_masks = make_masks(num_time_parallel, self.t_layer_sizes)
    time_outputs_info = [
        initial_state_with_taps(layer, num_time_parallel)
        for layer in self.time_model.layers
    ]
    time_result, _ = theano.scan(fn=step_time,
                                 sequences=[time_inputs],
                                 non_sequences=time_masks,
                                 outputs_info=time_outputs_info)
    self.time_thoughts = time_result

    # time_result holds one (time, batch/note, hidden_states) tensor per
    # layer; only the last layer is used. It is rearranged to
    # (note, batch/time, hidden_states) for the pitch pass.
    last_layer = get_last_layer(time_result)
    n_hidden = last_layer.shape[2]
    unpacked = last_layer.reshape((n_time, n_batch, n_note, n_hidden))
    note_major = unpacked.transpose((2, 1, 0, 3))
    time_final = note_major.reshape((n_note, n_batch * n_time, n_hidden))

    # ---- pitch pass -------------------------------------------------------
    # The "previously chosen note" input, in (note, batch/time, 2) format:
    # a [0, 0] start row of shape (1, N, 2) followed by every target note
    # except the last one.
    start_note_values = T.alloc(np.array(0, dtype=np.int8),
                                1, time_final.shape[1], 2)
    shifted_targets = self.output_mat[:, 1:, 0:-1, :].transpose((2, 0, 1, 3))
    correct_choices = shifted_targets.reshape(
        (n_note - 1, n_batch * n_time, 2))
    note_choices_inputs = T.concatenate(
        [start_note_values, correct_choices], axis=0)

    # The pitch model sees the time model's output joined with the previous
    # choice, rotated so that what was a batch axis is now the scan axis.
    note_inputs = T.concatenate([time_final, note_choices_inputs], axis=2)
    num_timebatch = note_inputs.shape[1]

    pitch_masks = make_masks(num_timebatch, self.p_layer_sizes)
    note_outputs_info = [
        initial_state_with_taps(layer, num_timebatch)
        for layer in self.pitch_model.layers
    ]
    note_result, _ = theano.scan(fn=step_note,
                                 sequences=[note_inputs],
                                 non_sequences=pitch_masks,
                                 outputs_info=note_outputs_info)
    self.note_thoughts = note_result

    # note_result holds one (note, batch/time, onOrArticProb) tensor per
    # layer; the last layer is rearranged to (batch, time, note,
    # onOrArticProb).
    pitch_last = get_last_layer(note_result)
    note_final = pitch_last.reshape(
        (n_note, n_batch, n_time, 2)).transpose(1, 2, 0, 3)

    # ---- cost -------------------------------------------------------------
    # Negative log likelihood of all events. For an output probability P and
    # a binary target x the likelihood is (1-P)(1-x) + Px, which expands to
    # 2Px - P - x + 1. Every decision is conditioned on all previous ones, so
    # the likelihoods multiply, i.e. their logs add.
    targets = self.output_mat[:, 1:]
    # Articulation only matters for notes that are played, so the second
    # channel is masked by the "on" channel. shape_padright turns the
    # (b, x, y) slice into (b, x, y, 1) so it can be concatenated.
    active_notes = T.shape_padright(self.output_mat[:, 1:, :, 0])
    mask = T.concatenate([T.ones_like(active_notes), active_notes], axis=3)
    likelihood = (2 * note_final * targets - note_final - targets
                  + 1 + self.epsilon)
    loglikelihoods = mask * T.log(likelihood)
    self.cost = T.neg(T.sum(loglikelihoods))

    # ---- compiled functions -----------------------------------------------
    step_updates, _, _, _, _ = create_optimization_updates(
        self.cost, self.params, method="adadelta")
    self.update_fun = theano.function(
        inputs=[self.input_mat, self.output_mat],
        outputs=self.cost,
        updates=step_updates,
        allow_input_downcast=True)

    thought_outputs = (ensure_list(self.time_thoughts)
                       + ensure_list(self.note_thoughts)
                       + [self.cost])
    self.update_thought_fun = theano.function(
        inputs=[self.input_mat, self.output_mat],
        outputs=thought_outputs,
        allow_input_downcast=True)
def setup_train(self):
    """Build the training graph and compile the two training-time functions.

    The time model scans along the time axis for every (batch, note) pair;
    the pitch model then scans along the note axis for every (batch, time)
    pair, consuming the time model's last layer joined with the previously
    chosen note. Progress markers ("model-setup-train::Trace-N") are printed
    between stages.

    Sets ``input_mat``, ``output_mat``, ``epsilon``, ``time_thoughts``,
    ``note_thoughts``, ``cost``, ``update_fun`` (adadelta step returning the
    cost) and ``update_thought_fun`` (no updates; returns all scan outputs of
    both passes followed by the cost).
    """
    # dimensions: (batch, time, notes, input_data) with input_data as in architecture
    self.input_mat = T.btensor4()
    # dimensions: (batch, time, notes, onOrArtic) with 0:on, 1:artic
    self.output_mat = T.btensor4()

    self.epsilon = np.spacing(np.float32(1.0))

    # The trace messages use the call form with a single string argument so
    # they parse on Python 3 and print the same text on Python 2.
    print("model-setup-train::Trace-1")

    def step_time(in_data, *other):
        # With dropout, the trailing arguments are the masks (one per time
        # layer size); everything before them is the previous hidden state.
        other = list(other)
        split = -len(self.t_layer_sizes) if self.dropout else len(other)
        hiddens = other[:split]
        masks = [None] + other[split:] if self.dropout else []
        new_states = self.time_model.forward(in_data, prev_hiddens=hiddens, dropout=masks)
        return new_states

    def step_note(in_data, *other):
        # Same argument layout as step_time, for the pitch model.
        other = list(other)
        split = -len(self.p_layer_sizes) if self.dropout else len(other)
        hiddens = other[:split]
        masks = [None] + other[split:] if self.dropout else []
        new_states = self.pitch_model.forward(in_data, prev_hiddens=hiddens, dropout=masks)
        return new_states

    # We generate an output for each input, so it doesn't make sense to use the last output as an input.
    # Note that we assume the sentinel start value is already present
    # TEMP CHANGE: NO SENTINEL
    print("model-setup-train::Trace-2")
    input_slice = self.input_mat[:, 0:-1]
    n_batch, n_time, n_note, n_ipn = input_slice.shape

    # time_inputs is a matrix (time, batch/note, input_per_note)
    time_inputs = input_slice.transpose((1, 0, 2, 3)).reshape((n_time, n_batch * n_note, n_ipn))
    num_time_parallel = time_inputs.shape[1]

    # apply dropout
    if self.dropout > 0:
        time_masks = MultiDropout(
            [(num_time_parallel, shape) for shape in self.t_layer_sizes],
            self.dropout)
    else:
        time_masks = []

    print("model-setup-train::Trace-3")
    time_outputs_info = [initial_state_with_taps(layer, num_time_parallel)
                         for layer in self.time_model.layers]
    time_result, _ = theano.scan(fn=step_time,
                                 sequences=[time_inputs],
                                 non_sequences=time_masks,
                                 outputs_info=time_outputs_info)

    print("model-setup-train::Trace-4")
    self.time_thoughts = time_result

    # Now time_result is a list of matrix [layer](time, batch/note, hidden_states) for each layer but we only care about
    # the hidden state of the last layer.
    # Transpose to be (note, batch/time, hidden_states)
    last_layer = get_last_layer(time_result)
    n_hidden = last_layer.shape[2]
    time_final = (last_layer
                  .reshape((n_time, n_batch, n_note, n_hidden))
                  .transpose((2, 1, 0, 3))
                  .reshape((n_note, n_batch * n_time, n_hidden)))

    # note_choices_inputs represents the last chosen note. Starts with [0,0], doesn't include last note.
    # In (note, batch/time, 2) format
    # Shape of start is thus (1, N, 2), concatenated with all but last element of output_mat transformed to (x, N, 2)
    start_note_values = T.alloc(0, 1, time_final.shape[1], 2)
    correct_choices = (self.output_mat[:, 1:, 0:-1, :]
                       .transpose((2, 0, 1, 3))
                       .reshape((n_note - 1, n_batch * n_time, 2)))
    note_choices_inputs = T.concatenate([start_note_values, correct_choices], axis=0)

    print("model-setup-train::Trace-5")
    # Together, this and the output from the last LSTM goes to the new LSTM, but rotated, so that the batches in
    # one direction are the steps in the other, and vice versa.
    note_inputs = T.concatenate([time_final, note_choices_inputs], axis=2)
    num_timebatch = note_inputs.shape[1]

    # apply dropout
    if self.dropout > 0:
        pitch_masks = MultiDropout(
            [(num_timebatch, shape) for shape in self.p_layer_sizes],
            self.dropout)
    else:
        pitch_masks = []

    print("model-setup-train::Trace-6")
    note_outputs_info = [initial_state_with_taps(layer, num_timebatch)
                         for layer in self.pitch_model.layers]
    note_result, _ = theano.scan(fn=step_note,
                                 sequences=[note_inputs],
                                 non_sequences=pitch_masks,
                                 outputs_info=note_outputs_info)

    self.note_thoughts = note_result

    # Now note_result is a list of matrix [layer](note, batch/time, onOrArticProb) for each layer but we only care about
    # the hidden state of the last layer.
    # Transpose to be (batch, time, note, onOrArticProb)
    note_final = get_last_layer(note_result).reshape((n_note, n_batch, n_time, 2)).transpose(1, 2, 0, 3)

    print("model-setup-train::Trace-7")
    # The cost of the entire procedure is the negative log likelihood of the events all happening.
    # For the purposes of training, if the outputted probability is P, then the likelihood of seeing a 1 is P, and
    # the likelihood of seeing 0 is (1-P). So the likelihood is (1-P)(1-x) + Px = 2Px - P - x + 1
    # Since they are all binary decisions, and are all probabilities given all previous decisions, we can just
    # multiply the likelihoods, or, since we are logging them, add the logs.

    # Note that we mask out the articulations for those notes that aren't played, because it doesn't matter
    # whether or not those are articulated.
    # The padright is there because self.output_mat[:,:,:,0] -> 3D matrix with (b,x,y), but we need a 4D tensor with
    # (b,x,y,1) instead
    active_notes = T.shape_padright(self.output_mat[:, 1:, :, 0])
    mask = T.concatenate([T.ones_like(active_notes), active_notes], axis=3)

    loglikelihoods = mask * T.log(
        2 * note_final * self.output_mat[:, 1:]
        - note_final - self.output_mat[:, 1:] + 1 + self.epsilon)

    print("model-setup-train::Trace-8")
    self.cost = T.neg(T.sum(loglikelihoods))

    print("model-setup-train::Trace-9")
    updates, _, _, _, _ = create_optimization_updates(self.cost, self.params, method="adadelta")

    print("model-setup-train::Trace-10")
    self.update_fun = theano.function(
        inputs=[self.input_mat, self.output_mat],
        outputs=self.cost,
        updates=updates,
        allow_input_downcast=True)

    self.update_thought_fun = theano.function(
        inputs=[self.input_mat, self.output_mat],
        outputs=ensure_list(self.time_thoughts) + ensure_list(self.note_thoughts) + [self.cost],
        allow_input_downcast=True)
def __init__(self, word_size, vocabulary_size, stack_size, hidden_size, hidden_price_size, price_stack_size, output_vocabulary, index2word, word2index, index2category, category2index, memory_sparsity = 0.0001, rho = 0.95, verbose=False, theano_mode = "FAST_RUN"):
    """Build the sequence model, the price model, and all compiled functions.

    Two networks are created:

    * ``self.model``: an embedding layer, a sigmoid ``GatedInput``, a stack
      of ``stack_size`` LSTM layers of ``hidden_size`` units, and a final
      ``to_softmax`` layer with ``output_vocabulary + 1`` outputs.
    * ``self.price_model``: ``price_stack_size - 1`` plain layers of
      ``hidden_price_size`` units followed by a single unit whose
      activation is ``T.exp``.

    Compiled functions stored on the instance: ``memory_fun``,
    ``price_predict_fun``, ``predict_fun``, ``error_fun``,
    ``reconstruct_fun``, ``input_labels`` and ``update_fun``.

    Args:
        word_size: embedding width and input width of the LSTM stack.
        vocabulary_size: stored as an int32 shared variable; it is added to
            the predicted index in ``pred_step`` and subtracted from the
            input sentences to form the labels.
        stack_size: number of LSTM layers.
        hidden_size: units per LSTM layer.
        hidden_price_size: units per hidden layer of the price model.
        price_stack_size: total number of layers in the price model.
        output_vocabulary: number of output symbols; one extra output is
            added for the special end token.
        index2word, word2index, index2category, category2index: stored on
            the instance unchanged; not used further in this method.
        memory_sparsity: weight of the gate-usage term in the error; stored
            as a float64 shared variable.
        rho: passed to the adadelta optimizer.
        verbose: when true, progress messages are printed.
        theano_mode: ``mode`` argument for every compiled function.
    """
    self.index2word = index2word
    self.word2index = word2index
    self.index2category = index2category
    self.category2index = category2index
    self.memory_sparsity= theano.shared(np.float64(memory_sparsity), name="memory_sparsity")
    self.theano_mode = theano_mode
    self.word_size = word_size
    self.vocabulary_size = theano.shared(np.int32(vocabulary_size), name="vocabulary_size")
    self.stack_size = stack_size
    self.hidden_size = hidden_size
    self.output_vocabulary = output_vocabulary

    ### CREATE THE CELLS:

    model = theano_lstm.StackedCells(word_size, layers=[hidden_size] * stack_size, celltype=theano_lstm.LSTM, activation=T.tanh)
    # add a softmax layer at the end (non-recurrent)
    # special end token:
    model.layers.append(theano_lstm.Layer(hidden_size, output_vocabulary + 1, to_softmax))
    # add an embedding layer at the beginning (non-recurrent):
    model.layers = [theano_lstm.Embedding(vocabulary_size + output_vocabulary + 1, word_size),
                    theano_lstm.GatedInput(word_size, hidden_size, T.nnet.sigmoid)] + model.layers
    self.model = model

    model2 = theano_lstm.StackedCells(hidden_size, layers=[hidden_price_size] * (price_stack_size - 1) + [1], celltype=theano_lstm.Layer, activation=T.tanh)
    # NOTE(review): this comment previously described the price as "a linear
    # function of its inputs", but the final layer's activation is set to
    # T.exp here -- confirm which is intended.
    model2.layers[-1].activation = T.exp
    self.price_model = model2

    ### CONSTRUCT THE PREDICTION / WIRING:

    def step(word_id, *prev_hiddens):
        # The slice starting at hidden_size of the top recurrent state is fed
        # to the gate layer. Presumably the state is laid out as
        # [memory, activation] -- verify against theano_lstm.LSTM.
        if prev_hiddens[-1].ndim > 1:
            top_level_activ = prev_hiddens[-1][:, self.hidden_size:]
        else:
            top_level_activ = prev_hiddens[-1][self.hidden_size:]
        new_state = model.forward(word_id, [None, top_level_activ] + list(prev_hiddens), [])
        # all outputs should be returned, except embeddings, and the first gates
        # NOTE(review): only the first entry is dropped here; the next entry
        # is kept and later read as result[0] ("memory_usage").
        return new_state[1:]

    def pred_step(word_id, *prev_hiddens):
        # Generation step: same forward pass as `step`, but emits the argmax
        # of the final layer shifted by vocabulary_size as the next word id,
        # followed by new_state[2:-1].
        if prev_hiddens[-1].ndim > 1:
            top_level_activ = prev_hiddens[-1][:, self.hidden_size:]
        else:
            top_level_activ = prev_hiddens[-1][self.hidden_size:]
        new_state = model.forward(word_id, [None, top_level_activ] + list(prev_hiddens), [])
        # all outputs should be returned, except embeddings, and the first gates
        return [T.cast(new_state[-1].argmax() + self.vocabulary_size, dtype='int32')] + new_state[2:-1]

    def predict_sequence(x, lengths, return_all=False, return_memory=False):
        # Scan `step` over a single sequence (x.ndim == 1) or a batch of
        # sequences (x.ndim > 1, scanned over x.T). `lengths` is only read
        # in the batched, non-return_all path.
        if x.ndim > 1:
            # Batched: repeat each initial hidden state once per row of x.
            outputs_info = [None] + [dict(initial=T.repeat(T.shape_padleft(layer.initial_hidden_state), x.shape[0], axis=0), taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]
        else:
            outputs_info = [None] + [dict(initial=layer.initial_hidden_state, taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]
        # The leading and trailing None mark the first and last outputs of
        # `step` as non-recurrent.
        outputs_info = outputs_info + [None]
        result, updates = theano.scan(step,
                                      sequences = [x.T if x.ndim > 1 else x],
                                      outputs_info = outputs_info)
        if return_all:
            return result
        else:
            res = result[-1].dimshuffle(1, 0, 2) if x.ndim > 1 else result[-1]
            # The price is predicted from result[-2], taken at index
            # `lengths` per batch column (batched) or at the last step
            # (single sequence).
            price_preds = self.price_model.forward(
                self.model.layers[-2].postprocess_activation(
                    result[-2][lengths, T.arange(0, lengths.shape[0])]
                ), None, []
            )[-1][:,0] if x.ndim > 1 else \
                self.price_model.forward(
                    self.model.layers[-2].postprocess_activation(
                        result[-2][-1]
                    ), None, [])[-1][0]
            # gate values can be obtained by asking for them from the stacked cells
            if return_memory:
                return result[0], res, price_preds
            else:
                return res, price_preds

    # every sequence is a series of indices
    # for words:
    input_sentences = T.imatrix()

    # some sequences are shorter than others, so we'll note where they
    # end in a zero-indexed fashion
    sequence_lengths = T.ivector()
    sequence_starts = T.ivector()
    # the labels are integers in the range of dictionary

    self.input_sentences = input_sentences
    self.sequence_lengths = sequence_lengths
    self.sequence_starts = sequence_starts

    self.prices = T.vector()

    # NOTE(review): sequence_starts is passed as the `lengths` argument here
    # -- confirm this is the intended index for the price read-out.
    memory_usage, self.predictions, self.price_predictions = predict_sequence(input_sentences, self.sequence_starts, return_memory=True)

    # Total error: mean masked loss on the predictions, plus the
    # sparsity-weighted sum of the gate output divided by the number of
    # sentences, plus the mean squared price error.
    self.error = (
        theano_lstm.masked_loss(
            self.predictions,
            input_sentences[:,1:] - self.vocabulary_size,
            sequence_lengths,
            sequence_starts).mean() +
        (memory_usage.sum() * self.memory_sparsity) / input_sentences.shape[0] +
        ((self.price_predictions - self.prices)**2).mean()
    )

    self.memory_fun = theano.function([input_sentences], memory_usage,
                                      allow_input_downcast=True,
                                      mode=self.theano_mode)

    self.price_predict_fun = theano.function([input_sentences, sequence_starts],
                                             self.price_predictions,
                                             allow_input_downcast=True,
                                             mode=self.theano_mode)

    self.predict_fun = theano.function([input_sentences],
                                       self.predictions,
                                       allow_input_downcast=True,
                                       mode=self.theano_mode)
    self.error_fun = theano.function([input_sentences, sequence_lengths, sequence_starts, self.prices],
                                     self.error,
                                     allow_input_downcast=True,
                                     mode=self.theano_mode)

    # Generation: run the model over a single priming sentence, then keep
    # stepping with pred_step, seeding every recurrent output with its value
    # at the last priming step.
    self.input_sentence = T.ivector()

    prep_result = predict_sequence(self.input_sentence, None, return_all=True)

    pred_outputs_info = [dict(initial=self.input_sentence[-1], taps=[-1])] + [dict(initial=prep_hidden[-1], taps=[-1]) for prep_hidden in prep_result[1:-1]]

    prediction_steps = T.iscalar()
    pred_result, _ = theano.scan(pred_step,
                                 n_steps = prediction_steps,
                                 outputs_info = pred_outputs_info)

    self.reconstruct_fun = theano.function([self.input_sentence, prediction_steps],
                                           pred_result[0],
                                           allow_input_downcast=True,
                                           mode=self.theano_mode)
    # Maps input sentences to the label indices used by the error.
    self.input_labels = theano.function([input_sentences],
                                        input_sentences[:,1:] - self.vocabulary_size,
                                        mode=self.theano_mode)

    if verbose:
        print("created prediction & error functions")
    # Adadelta over the parameters of both networks; max_norm=None is passed
    # explicitly to the optimizer.
    updates, gsums, xsums, lr, max_norm = theano_lstm.create_optimization_updates(self.error, model.params + model2.params, max_norm=None, rho=rho, method="adadelta")
    self.lr = lr
    if verbose:
        print("took the gradient")

    self.gsums = gsums
    self.xsums = xsums

    # Training step: applies the updates and returns nothing.
    self.update_fun = theano.function([input_sentences, sequence_lengths, sequence_starts, self.prices],
                                      outputs=None,
                                      updates=updates,
                                      mode=self.theano_mode)
    if verbose:
        print("created the gradient descent function")
def setup_train(self):
    """Build the symbolic training graph and compile ``self.update_fun``.

    Two ``btensor4`` symbolic inputs are created (``self.input_mat`` and
    ``self.output_mat``). The input, minus its last entry along axis 1, is
    scanned through ``self.time_model``; the last layer of that result is
    re-arranged, concatenated with the shifted target choices and scanned
    through ``self.pitch_model``. The cost is ``self.loss_func`` applied to
    ``self.output_mat[:, 1:]`` and the re-arranged pitch output, and the
    compiled function applies the "adadelta" updates for ``self.params``.

    Side effects: sets ``input_mat``, ``output_mat``, ``time_thoughts``,
    ``note_thoughts``, ``cost`` and ``update_fun`` on ``self`` and prints
    progress to stdout.
    """
    print('{:25}'.format("Setup Train"), end='', flush=True)

    self.input_mat = T.btensor4()
    self.output_mat = T.btensor4()

    def make_step(model, layer_sizes):
        """Return a scan step function that advances ``model`` once."""

        def step(in_data, *other):
            carried = list(other)
            if self.dropout:
                # The trailing len(layer_sizes) entries are the dropout
                # masks handed to scan as non_sequences. A leading None is
                # prepended -- presumably "no mask" for the first entry;
                # confirm against the model's forward().
                split = -len(layer_sizes)
                masks = [None] + carried[split:]
            else:
                split = len(carried)
                masks = []
            return model.forward(in_data,
                                 prev_hiddens=carried[:split],
                                 dropout=masks)

        return step

    def dropout_masks(layers, num_time_parallel=1):
        """Return MultiDropout masks for ``layers``, or [] without dropout."""
        if self.dropout > 0:
            mask_shapes = [(num_time_parallel, shape) for shape in layers]
            return theano_lstm.MultiDropout(mask_shapes, self.dropout)
        return []

    # TIME PASS
    # Drop the last entry along axis 1, then fold axes 0 and 2 together so
    # the scan iterates over what was axis 1.
    input_slice = self.input_mat[:, 0:-1]
    n_batch, n_time, n_note, n_ipn = input_slice.shape
    time_inputs = input_slice.transpose((1, 0, 2, 3)).reshape(
        (n_time, n_batch * n_note, n_ipn))
    time_parallel = time_inputs.shape[1]

    time_result, _ = theano.scan(
        fn=make_step(self.time_model, self.t_layer_sizes),
        sequences=[time_inputs],
        non_sequences=dropout_masks(self.t_layer_sizes, time_parallel),
        outputs_info=[
            initial_state_with_taps(layer, time_parallel)
            for layer in self.time_model.layers
        ],
    )
    self.time_thoughts = time_result

    # Re-arrange the last layer so the next scan iterates over n_note.
    last_layer = get_last_layer(time_result)
    n_hidden = last_layer.shape[2]
    time_final = last_layer.reshape((n_time, n_batch, n_note, n_hidden))
    time_final = time_final.transpose((2, 1, 0, 3))
    time_final = time_final.reshape((n_note, n_batch * n_time, n_hidden))

    # PITCH PASS
    # Each step also receives the target of the previous step along the
    # scanned axis; the first step receives zeros instead.
    start_note_values = T.alloc(np.array(0, dtype=np.int8), 1,
                                time_final.shape[1], self.output_size)
    correct_choices = self.output_mat[:, 1:, 0:-1, :]
    correct_choices = correct_choices.transpose((2, 0, 1, 3)).reshape(
        (n_note - 1, n_batch * n_time, self.output_size))
    note_choices_inputs = T.concatenate(
        [start_note_values, correct_choices], axis=0)
    note_inputs = T.concatenate([time_final, note_choices_inputs], axis=2)
    note_parallel = note_inputs.shape[1]

    note_result, _ = theano.scan(
        fn=make_step(self.pitch_model, self.p_layer_sizes),
        sequences=[note_inputs],
        non_sequences=dropout_masks(self.p_layer_sizes, note_parallel),
        outputs_info=[
            initial_state_with_taps(layer, note_parallel)
            for layer in self.pitch_model.layers
        ],
    )
    self.note_thoughts = note_result

    # Back to (n_batch, n_time, n_note, output_size) to line up with
    # self.output_mat[:, 1:].
    note_final = get_last_layer(note_result).reshape(
        (n_note, n_batch, n_time, self.output_size)).transpose(1, 2, 0, 3)

    self.cost = self.loss_func(self.output_mat[:, 1:], note_final)

    updates, _, _, _, _ = create_optimization_updates(
        self.cost, self.params, method="adadelta")
    self.update_fun = theano.function(
        inputs=[self.input_mat, self.output_mat],
        outputs=self.cost,
        updates=updates,
        allow_input_downcast=True,
    )

    print("Done")
def __init__(self, hidden_size, internal_features, intermediate_size,
             vocab_size, num_answers, tensor=True, method="sgd"):
    """Create the model parameters and compile its Theano functions.

    Args:
        hidden_size: Second argument to each ``Embedding``; also used as
            ``3 * hidden_size`` for the last dimension of the shared
            tensors and of ``projection_mat``.
        internal_features: Middle dimension of ``q_form_U`` / ``q_form_V``.
        intermediate_size: Leading dimension of ``q_form_U`` / ``q_form_V``,
            ``bias`` and ``projection_mat``; trailing dimension of
            ``scoring_mat``.
        vocab_size: First argument to each ``Embedding``.
        num_answers: Number of (answer, target) input pairs accepted by
            ``error_fun`` and ``update_gradient``.
        tensor: When true, ``q_form_U`` and ``q_form_V`` are created and
            appended to ``self.params``.
        method: Forwarded to ``create_optimization_updates``.

    Compiled functions stored on ``self``: ``score_triplet``,
    ``error_fun``, ``update_gradient`` (adds the current gradients into
    ``self.gradient_caches``) and ``apply_gradient`` (applies the optimizer
    updates computed from the caches, then zeroes the caches).
    """
    self.text_embedding = Embedding(vocab_size, hidden_size)
    self.question_embedding = Embedding(vocab_size, hidden_size)
    self.answer_embedding = Embedding(vocab_size, hidden_size)
    self.params = (
        self.text_embedding.params
        + self.question_embedding.params
        + self.answer_embedding.params
    )

    self.tensor = tensor
    if tensor:
        tensor_dims = (intermediate_size, internal_features, 3 * hidden_size)
        self.q_form_U = create_shared("question_answer_tensor", *tensor_dims)
        self.q_form_V = create_shared("question_answer_tensor", *tensor_dims)
        self.params.extend([self.q_form_U, self.q_form_V])

    # here are the affine parameters
    self.bias = create_shared("bias", intermediate_size)
    self.projection_mat = create_shared(
        "projection_mat", intermediate_size, 3 * hidden_size)
    # NOTE(review): scoring_mat is created but not added to self.params, so
    # no update is ever computed for it here -- confirm this is intended.
    self.scoring_mat = create_shared("scoring_mat", 1, intermediate_size)
    self.params.extend([self.bias, self.projection_mat])

    # create a triplet scoring function:
    sentence = T.ivector()
    question = T.ivector()
    answer = T.ivector()
    triplet = [sentence, question, answer]
    self.score_triplet = theano.function(
        triplet,
        self.get_score(*triplet),
        allow_input_downcast=True,
    )

    # create an error function
    answers = [T.ivector() for _ in range(num_answers)]
    targets = [T.fscalar() for _ in range(num_answers)]
    # Interleave as [answer_0, target_0, answer_1, target_1, ...].
    answer_targets = [
        variable
        for pair in zip(answers, targets)
        for variable in pair
    ]
    error_inputs = [sentence, question] + answer_targets

    error = self.get_error(sentence, question, *answer_targets)
    self.error_fun = theano.function(
        error_inputs, error, allow_input_downcast=True)

    gparams = T.grad(error, self.params, disconnected_inputs='ignore')

    # One zero-initialised shared accumulator per parameter; each call to
    # update_gradient adds the freshly computed gradient into it.
    self.gradient_caches = [
        theano.shared(
            param.get_value(True, True) * 0.0,
            borrow=True,
            name=param.name + "_grad",
        )
        for param in self.params
    ]
    accumulate = OrderedDict(
        (cache, cache + gradient)
        for cache, gradient in zip(self.gradient_caches, gparams)
    )
    self.update_gradient = theano.function(
        error_inputs, error,
        updates=accumulate,
        allow_input_downcast=True,
    )

    # create a training function:
    true_updates, self.gsums, self.xsums, self.lr, _ = (
        create_optimization_updates(
            None,
            self.params,
            method=method,
            gradients=self.gradient_caches,
        )
    )
    # Reset every accumulator in the same call that applies the step.
    for cache in self.gradient_caches:
        true_updates[cache] = T.zeros_like(cache)

    self.apply_gradient = theano.function(
        inputs=[], outputs=[], updates=true_updates)
def __init__(self, hidden_size, vocab_size, num_answers):
    """Create the model parameters and compile its Theano functions.

    Args:
        hidden_size: Second argument to ``Embedding``; also both trailing
            dimensions of the shared ``q_form`` tensor.
        vocab_size: First argument to ``Embedding``.
        num_answers: Number of (answer, target) input pairs accepted by
            ``error_fun`` and ``update_gradient``.

    Compiled functions stored on ``self``: ``score_triplet``,
    ``error_fun``, ``update_gradient`` (adds the current gradients into
    ``self.gradient_caches``) and ``apply_gradient`` (applies the "sgd"
    updates computed from the caches, then zeroes the caches).
    """
    self.embedding = Embedding(vocab_size, hidden_size)
    self.q_form = create_shared("tensor", 1, hidden_size, hidden_size)
    self.params = self.embedding.params + [self.q_form]

    # create a triplet scoring function:
    sentence = T.ivector()
    question = T.ivector()
    answer = T.ivector()
    triplet = [sentence, question, answer]
    self.score_triplet = theano.function(
        triplet,
        self.get_score(*triplet),
        allow_input_downcast=True,
    )

    # create an error function
    answers = [T.ivector() for _ in range(num_answers)]
    targets = [T.fscalar() for _ in range(num_answers)]
    # Interleave as [answer_0, target_0, answer_1, target_1, ...].
    answer_targets = [
        variable
        for pair in zip(answers, targets)
        for variable in pair
    ]
    error_inputs = [sentence, question] + answer_targets

    error = self.get_error(sentence, question, *answer_targets)
    self.error_fun = theano.function(
        error_inputs, error, allow_input_downcast=True)

    gparams = T.grad(error, self.params)

    # One zero-initialised shared accumulator per parameter; each call to
    # update_gradient adds the freshly computed gradient into it.
    self.gradient_caches = [
        theano.shared(
            param.get_value(True, True) * 0.0,
            borrow=True,
            name=param.name + "_grad",
        )
        for param in self.params
    ]
    accumulate = OrderedDict(
        (cache, cache + gradient)
        for cache, gradient in zip(self.gradient_caches, gparams)
    )
    self.update_gradient = theano.function(
        error_inputs, error,
        updates=accumulate,
        allow_input_downcast=True,
    )

    # create a training function:
    true_updates, _, _, self.lr, _ = create_optimization_updates(
        None,
        self.params,
        method="sgd",
        gradients=self.gradient_caches,
    )
    # Reset every accumulator in the same call that applies the step.
    for cache in self.gradient_caches:
        true_updates[cache] = T.zeros_like(cache)

    self.apply_gradient = theano.function(
        inputs=[], outputs=[], updates=true_updates)