def test_single_var(self):
    # Test `is_same_graph` with some trivial graphs (one Variable).
    x, y, z = tensor.vectors('x', 'y', 'z')
    self.check([
        (x, x, (({}, True), )),
        (x, y, (({}, False), ({y: x}, True), )),
        (x, tensor.neg(x), (({}, False), )),
        (x, tensor.neg(y), (({}, False), )),
    ])
def test_single_var(self): """ Test `is_same_graph` with some trivial graphs (one Variable). """ x, y, z = tensor.vectors('x', 'y', 'z') self.check([ (x, x, (({}, True), )), (x, y, (({}, False), ({y: x}, True), )), (x, tensor.neg(x), (({}, False), )), (x, tensor.neg(y), (({}, False), )), ])
def test_single_var(self): """ Test `is_same_graph` with some trivial graphs (one Variable). """ x, y, z = tensor.vectors("x", "y", "z") self.check( [ (x, x, (({}, True),)), (x, y, (({}, False), ({y: x}, True))), (x, tensor.neg(x), (({}, False),)), (x, tensor.neg(y), (({}, False),)), ] )
def test_single_var(self):
    # Test `is_same_graph` with some trivial graphs (one Variable).
    x, y, z = tensor.vectors("x", "y", "z")
    self.check([
        (x, x, (({}, True), )),
        (x, y, (
            ({}, False),
            ({y: x}, True),
        )),
        (x, tensor.neg(x), (({}, False), )),
        (x, tensor.neg(y), (({}, False), )),
    ])
def negative_log_likelihood(self, y):
    """Return the mean of the negative log-likelihood of the prediction
    of this model under a given target distribution.

    .. math::

        \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
        \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
            \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
        \ell (\theta=\{W,b\}, \mathcal{D})

    :type y: theano.tensor.TensorType
    :param y: corresponds to a matrix where 1 indicates which class the
              sample belongs to
    """
    return (T.mean(T.neg(y) * T.log(self.p_y_given_x)
                   - (1 + T.neg(y)) * T.log(1 - self.p_y_given_x))
            + self.lambda_reg * T.sum(self.W ** 2))
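For reference (this restatement is not part of the original docstring): reading `T.neg(y)` as elementwise negation, the returned expression is the mean binary cross-entropy plus an L2 penalty on `W`,

    \mathrm{loss} = \frac{1}{N} \sum_{i=1}^{N}
        \Big[ -\,y^{(i)} \log p^{(i)} - \big(1 - y^{(i)}\big) \log\big(1 - p^{(i)}\big) \Big]
        + \lambda \sum_{j} W_j^2,

where p = `self.p_y_given_x`, \lambda = `self.lambda_reg`, and N is the number of elements averaged by `T.mean`.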
def error(self, y):
    if y.ndim != self.y_pred.ndim:
        raise TypeError('y should have the same shape as self.y_pred',
                        ('y', y.type, 'y_pred', self.y_pred.type))
    if y.dtype.startswith("int"):
        # T.neq returns a vector of 0s and 1s, where 1 marks a wrong
        # prediction; T.neg (negation) cannot take two arguments here.
        return T.mean(T.neq(self.y_pred, y))
    else:
        raise NotImplementedError()
def minus_corr(u, v):
    # Negative Pearson correlation between u and v (a loss that rewards correlation).
    um = T.sub(u, T.mean(u))
    vm = T.sub(v, T.mean(v))
    r_num = T.sum(T.mul(um, vm))
    r_den = T.sqrt(T.mul(T.sum(T.sqr(um)), T.sum(T.sqr(vm))))
    r = T.true_div(r_num, r_den)
    r = T.neg(r)
    return r
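A minimal usage sketch, not from the original source; it assumes Theano and NumPy are installed and that `minus_corr` is defined as above. Perfectly correlated inputs have Pearson correlation 1, so the loss is -1:

import numpy as np
import theano
import theano.tensor as T

u, v = T.vectors('u', 'v')
corr_loss = theano.function([u, v], minus_corr(u, v), allow_input_downcast=True)
# u and v are perfectly (positively) correlated, so the negated correlation is -1.
print(corr_loss(np.array([1., 2., 3.]), np.array([2., 4., 6.])))  # ~ -1.0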
def objective(y_true, y_pred):
    active_notes = T.shape_padright(y_true[:, :, :, 0])
    mask = T.concatenate([T.ones_like(active_notes), active_notes], axis=3)
    log_likelihoods = mask * T.log(2 * y_pred * y_true - y_pred - y_true + 1 + EPSILON)
    return T.neg(T.sum(log_likelihoods))
def negative_log_likelihood(self, y):
    """Return the mean of the negative log-likelihood of the prediction
    of this model under a given target distribution.

    .. math::

        \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
        \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
            \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
        \ell (\theta=\{W,b\}, \mathcal{D})

    :type y: theano.tensor.TensorType
    :param y: corresponds to a matrix where 1 indicates which class the
              sample belongs to
    """
    return T.mean(
        T.neg(y) * T.log(self.p_y_given_x)
        - (1 + T.neg(y)) * T.log(1 - self.p_y_given_x))
def dtw(i, q_p, b_p, Q, D, inf):
    # `n0` and `big` are captured from the enclosing scope; the commented
    # line below shows how `inf` was previously derived from `self.n`.
    i0 = T.eq(i, 0)
    # inf = T.cast(1e10,'float32') * T.cast(T.switch(T.eq(self.n,0), T.switch(T.eq(i,0), 0, 1), 1), 'float32')
    penalty = T.switch(T.and_(T.neg(n0), i0), big, T.constant(0.0, 'float32'))
    loop = T.constant(0.0, 'float32') + q_p
    forward = T.constant(0.0, 'float32') + T.switch(T.or_(n0, i0), 0, Q[i - 1])
    opt = T.stack([loop, forward])
    k_out = T.cast(T.argmin(opt, axis=0), 'int32')
    return opt[k_out, T.arange(opt.shape[1])] + D[i] + penalty, k_out
def get_loss(adjusted_output, prediction):
    epsilon = 1e-7
    active_notes = T.shape_padright(adjusted_output[:, :, :, 0])
    masks = T.concatenate([T.ones_like(active_notes), active_notes], axis=3)
    log_likelihoods = T.log(2 * prediction * adjusted_output - prediction
                            - adjusted_output + 1 + epsilon)
    masked_log_likelihoods = masks * log_likelihoods
    return T.neg(T.sum(masked_log_likelihoods))
def loss_func(self, y_true, y_predict):
    active_notes = T.shape_padright(y_true[:, :, :, 0])
    mask = T.concatenate([
        T.ones_like(active_notes),
        active_notes,
        T.repeat(T.ones_like(active_notes), self.output_size - 2, -1)
    ], axis=-1)
    loglikelihoods = mask * T.log(2 * y_predict * y_true - y_predict
                                  - y_true + 1 + self.epsilon)
    return T.neg(T.sum(loglikelihoods))
def predict(self, y):
    # check if y has same dimension of y_pred
    if y.ndim != self.y_pred.ndim:
        raise TypeError('y should have the same shape as self.y_pred',
                        ('y', y.type, 'y_pred', self.y_pred.type))
    # check if y is of the correct datatype
    if y.dtype.startswith('int'):
        # the T.neq operator returns a vector of 0s and 1s, where 1
        # represents a mistake in prediction
        return T.mean(T.neq(self.y_pred, y))
    else:
        raise NotImplementedError()
def loss(self, n_samples, regularization_strength, mix, mu, sigma):
    log_sum_loss = -tensor.sum(tensor.log(
        tensor.sum(mix * tensor.inv(np.sqrt(2 * np.pi) * sigma) *
                   tensor.exp(tensor.neg(tensor.sqr(mu - self.target_vector)) *
                              tensor.inv(2 * tensor.sqr(sigma))),
                   axis=0)
    ))

    # reg_loss = tensor.sum(tensor.sqr(self.layers.values()[0].W))
    # for layer in self.layers.values()[1:]:
    #     reg_loss += tensor.sum(tensor.sqr(layer.W))
    # regularization = 1/n_samples * regularization_strength/2 * reg_loss

    return log_sum_loss  # + regularization
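Spelled out (a restatement of the expression above, not additional source code), `log_sum_loss` is the negative log-likelihood of the targets under a mixture of Gaussians,

    \mathrm{log\_sum\_loss} = -\sum_{n} \log \sum_{k} \mathrm{mix}_k \,
        \frac{1}{\sqrt{2\pi}\,\sigma_k}
        \exp\!\left( -\frac{(\mu_k - t_n)^2}{2\sigma_k^2} \right),

where t_n are the entries of `self.target_vector` and the sum over k runs along axis 0 of `mix`, `mu`, and `sigma`.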
def loss(self, n_samples, regularization_strength, mix, mu, sigma):
    log_sum_loss = -tensor.sum(
        tensor.log(
            tensor.sum(
                mix * tensor.inv(np.sqrt(2 * np.pi) * sigma) * tensor.exp(
                    tensor.neg(tensor.sqr(mu - self.target_vector)) *
                    tensor.inv(2 * tensor.sqr(sigma))),
                axis=0)))

    # reg_loss = tensor.sum(tensor.sqr(self.layers.values()[0].W))
    # for layer in self.layers.values()[1:]:
    #     reg_loss += tensor.sum(tensor.sqr(layer.W))
    # regularization = 1/n_samples * regularization_strength/2 * reg_loss

    return log_sum_loss  # + regularization
def compute_loss(probs, absolute_melody, extra_info=False):
    """
    Compute loss between probs and an absolute melody

    Parameters:
        probs: A theano tensor of shape (batch, time, 2+high_bound-low_bound)
        absolute_melody: A tensor of shape (batch, time) with correct indices
        extra_info: If True, return extra info

    Returns:
        A theano tensor loss value.
        Also, if extra_info is true, an additional info dict.
    """
    n_batch, n_time, prob_width = probs.shape
    correct_encoded_form = T.reshape(
        T.extra_ops.to_one_hot(T.flatten(absolute_melody), prob_width),
        probs.shape)
    loglikelihoods = T.log(probs + constants.EPSILON) * correct_encoded_form
    full_loss = T.neg(T.sum(loglikelihoods))

    if extra_info:
        loss_per_timestep = full_loss / T.cast(n_batch * n_time,
                                               theano.config.floatX)
        accuracy_per_timestep = T.exp(-loss_per_timestep)

        loss_per_batch = full_loss / T.cast(n_batch, theano.config.floatX)
        accuracy_per_batch = T.exp(-loss_per_batch)

        num_jumps = T.sum(correct_encoded_form[:, :, 2:])
        loss_per_jump = full_loss / T.cast(num_jumps, theano.config.floatX)
        accuracy_per_jump = T.exp(-loss_per_jump)

        return full_loss, {
            "loss_per_timestep": loss_per_timestep,
            "accuracy_per_timestep": accuracy_per_timestep,
            "loss_per_batch": loss_per_batch,
            "accuracy_per_batch": accuracy_per_batch,
            "loss_per_jump": loss_per_jump,
            "accuracy_per_jump": accuracy_per_jump
        }
    else:
        return full_loss
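A short interpretive note, not in the original: since `full_loss` sums the negative log-probabilities of the correct indices, the reported accuracy is

    \exp\!\big(-\mathrm{full\_loss}/(B \cdot T)\big) = \Big(\prod_i p_i\Big)^{1/(B \cdot T)},

i.e. the geometric mean of the probabilities assigned to the correct choices per timestep, and analogously for the per-batch and per-jump variants with their respective normalisers.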
def logsum_loss(self, n_samples, l1_regularization_strength,
                l2_regularization_strength):
    log_sum_loss = -tensor.sum(tensor.log(
        tensor.sum(self.mix * tensor.inv(np.sqrt(2 * np.pi) * self.sigma) *
                   tensor.exp(tensor.neg(tensor.sqr(self.mu - self.target_vector)) *
                              tensor.inv(2 * tensor.sqr(self.sigma))),
                   axis=0)
    ))

    l1_reg_loss = tensor.sum(np.abs(self.layers.values()[0].W))
    for layer in self.layers.values()[1:]:
        l1_reg_loss += tensor.sum(np.abs(layer.W))

    l2_reg_loss = tensor.sum(tensor.sqr(self.layers.values()[0].W))
    for layer in self.layers.values()[1:]:
        l2_reg_loss += tensor.sum(tensor.sqr(layer.W))

    l1_regularization = 1/n_samples * l1_regularization_strength/2 * l1_reg_loss
    l2_regularization = 1/n_samples * l2_regularization_strength/2 * l2_reg_loss

    return log_sum_loss + l1_regularization + l2_regularization
def compute_loss(probs, absolute_melody, extra_info=False):
    """
    Compute loss between probs and an absolute melody

    Parameters:
        probs: A theano tensor of shape (batch, time, 2+high_bound-low_bound)
        absolute_melody: A tensor of shape (batch, time) with correct indices
        extra_info: If True, return extra info

    Returns:
        A theano tensor loss value.
        Also, if extra_info is true, an additional info dict.
    """
    n_batch, n_time, prob_width = probs.shape
    correct_encoded_form = T.reshape(T.extra_ops.to_one_hot(T.flatten(absolute_melody), prob_width), probs.shape)
    loglikelihoods = T.log(probs + constants.EPSILON) * correct_encoded_form
    full_loss = T.neg(T.sum(loglikelihoods))

    if extra_info:
        loss_per_timestep = full_loss / T.cast(n_batch * n_time, theano.config.floatX)
        accuracy_per_timestep = T.exp(-loss_per_timestep)

        loss_per_batch = full_loss / T.cast(n_batch, theano.config.floatX)
        accuracy_per_batch = T.exp(-loss_per_batch)

        num_jumps = T.sum(correct_encoded_form[:, :, 2:])
        loss_per_jump = full_loss / T.cast(num_jumps, theano.config.floatX)
        accuracy_per_jump = T.exp(-loss_per_jump)

        return full_loss, {
            "loss_per_timestep": loss_per_timestep,
            "accuracy_per_timestep": accuracy_per_timestep,
            "loss_per_batch": loss_per_batch,
            "accuracy_per_batch": accuracy_per_batch,
            "loss_per_jump": loss_per_jump,
            "accuracy_per_jump": accuracy_per_jump
        }
    else:
        return full_loss
def logsum_loss(self, n_samples, l1_regularization_strength,
                l2_regularization_strength):
    log_sum_loss = -tensor.sum(
        tensor.log(
            tensor.sum(
                self.mix * tensor.inv(np.sqrt(2 * np.pi) * self.sigma) *
                tensor.exp(
                    tensor.neg(tensor.sqr(self.mu - self.target_vector)) *
                    tensor.inv(2 * tensor.sqr(self.sigma))),
                axis=0)))

    l1_reg_loss = tensor.sum(np.abs(self.layers.values()[0].W))
    for layer in self.layers.values()[1:]:
        l1_reg_loss += tensor.sum(np.abs(layer.W))

    l2_reg_loss = tensor.sum(tensor.sqr(self.layers.values()[0].W))
    for layer in self.layers.values()[1:]:
        l2_reg_loss += tensor.sum(tensor.sqr(layer.W))

    l1_regularization = 1 / n_samples * l1_regularization_strength / 2 * l1_reg_loss
    l2_regularization = 1 / n_samples * l2_regularization_strength / 2 * l2_reg_loss

    return log_sum_loss + l1_regularization + l2_regularization
def LMmulcloss(self, kth, x, y, label, nextwords):
    # multiple label loss + language model loss
    # nextwords = START + words + END
    hidden = self.hidden_k(x, self.w, self.dicw, kth)
    print "hidden type : " + str(hidden.type)
    size = y.ndim
    y = T.addbroadcast(y, size - 1)
    embedding = T.sum(hidden*y, 0) / T.addbroadcast(T.cast(T.sum(y, 0), 'int16'), size - 2)
    # embedding = T.sum(hidden*y,0)/ T.addbroadcast(T.sum(y,0), size-2)
    print "embedding type : " + str(embedding.type)
    mulloss = (0. - T.sum(T.log(1. / (1. + T.exp(0. - (T.dot(embedding, self.w["mulw"]) + self.w["mulb"])*label))))) / embedding.shape[0]

    if self.hsoftmax:
        # pos language model
        hshape = self.hshape
        newhidden = hidden[:, :, :hidden.shape[2]/2].reshape((hidden.shape[0]*hidden.shape[1], hidden.shape[2]/2))
        smax_group = T.nnet.h_softmax(newhidden, newhidden.shape[0], self.wordnum, hshape[0], hshape[1],
                                      self.w["posLMw1"], self.b["posLMb1"], self.w["posLMw2"], self.w["posLMb2"],
                                      nextwords[2:].ravel())
        losslist = T.neg(T.log(smax_group.reshape(nextwords[2:].shape)))
        mask = T.cast(T.neq(nextwords[2:], self.padding_id), theano.config.floatX)
        losslist = losslist*mask
        posLMloss = T.cast(T.mean(T.sum(losslist, axis=0)), theano.config.floatX)

        # neg language model
        newhidden = hidden[:, :, hidden.shape[2]/2:].reshape((hidden.shape[0]*hidden.shape[1], hidden.shape[2]/2))
        smax_group = T.nnet.h_softmax(newhidden, newhidden.shape[0], self.wordnum, hshape[0], hshape[1],
                                      self.w["negLMw1"], self.b["negLMb1"], self.w["negLMw2"], self.w["negLMb2"],
                                      nextwords[:-2].ravel())
        losslist = T.neg(T.log(smax_group.reshape(nextwords[:-2].shape)))
        mask = T.cast(T.neq(nextwords[:-2], self.padding_id), theano.config.floatX)
        losslist = losslist*mask
        negLMloss = T.cast(T.mean(T.sum(losslist, axis=0)), theano.config.floatX)
    else:
        def categorical_loss(ihidden, words, w, b):
            scores = T.dot(ihidden, w) + b
            prep = T.exp(scores)/T.sum(T.exp(scores), 1).dimshuffle(0, 'x')
            loss = T.nnet.categorical_crossentropy(prep, words)
            return loss

        # newhidden = hidden.reshape((hidden.shape[0]*hidden.shape[1], hidden.shape[2]))
        # pos language model
        # prep = T.exp(T.dot(newhidden[:,:newhidden.shape[1]/2],self.w["posLMw"])+self.w["posLMb"])/T.sum(T.exp(T.dot(newhidden[:,:newhidden.shape[1]/2],self.w["posLMw"])+self.w["posLMb"]), 1).dimshuffle(0,'x')
        scores = T.dot(hidden[:, :, :hidden.shape[2]/2], self.w["posLMw"]) + self.w["posLMb"]
        scores = scores.reshape((scores.shape[0]*scores.shape[1], scores.shape[2]))
        prep = T.exp(scores)/T.sum(T.exp(scores), scores.ndim - 1).dimshuffle((0, 'x'))  # (len*batch)
        losslist = T.nnet.categorical_crossentropy(prep, nextwords[2:].ravel())
        losslist = losslist.reshape(nextwords[2:].shape)
        # losslist, _ = theano.scan(fn = categorical_loss, sequences = [hidden[:,:,:hidden.shape[2]/2], nextwords[2:]], outputs_info = None,
        #                           non_sequences = [self.w["posLMw"], self.w["posLMb"]])
        mask = T.cast(T.neq(nextwords[2:], self.padding_id), theano.config.floatX)
        losslist = losslist*mask
        posLMloss = T.cast(T.mean(T.sum(losslist, axis=0)), theano.config.floatX)

        # neg language model
        # prep = T.exp(T.dot(newhidden[:,newhidden.shape[1]/2:],self.w["negLMw"])+self.w["negLMb"])/T.sum(T.exp(T.dot(newhidden[:,newhidden.shape[1]/2:],self.w["negLMw"])+self.w["negLMb"]), 1).dimshuffle(0,'x')
        scores = T.dot(hidden[:, :, hidden.shape[2]/2:], self.w["negLMw"]) + self.w["negLMb"]
        scores = scores.reshape((scores.shape[0]*scores.shape[1], scores.shape[2]))
        prep = T.exp(scores)/T.sum(T.exp(scores), scores.ndim - 1).dimshuffle((0, 'x'))  # (len*batch)
        losslist = T.nnet.categorical_crossentropy(prep, nextwords[0:-2].ravel())
        losslist = losslist.reshape(nextwords[0:-2].shape)
        # losslist, _ = theano.scan(fn = categorical_loss, sequences = [hidden[:,:,hidden.shape[2]/2:], nextwords[:-2]], outputs_info = None,
        #                           non_sequences = [self.w["negLMw"], self.w["negLMb"]])
        mask = T.cast(T.neq(nextwords[0:-2], self.padding_id), theano.config.floatX)
        losslist = losslist*mask
        negLMloss = T.cast(T.mean(T.sum(losslist, axis=0)), theano.config.floatX)

    return mulloss, posLMloss, negLMloss
def setup_train(self):
    # dimensions: (batch, time, notes, input_data) with input_data as in architecture
    self.input_mat = T.btensor4()
    # dimensions: (batch, time, notes, onOrArtic) with 0:on, 1:artic
    self.output_mat = T.btensor4()

    self.epsilon = np.spacing(np.float32(1.0))

    def step_time(in_data, *other):
        other = list(other)
        split = -len(self.t_layer_sizes) if self.dropout else len(other)
        hiddens = other[:split]
        masks = [None] + other[split:] if self.dropout else []
        new_states = self.time_model.forward(in_data, prev_hiddens=hiddens, dropout=masks)
        return new_states

    def step_note(in_data, *other):
        other = list(other)
        split = -len(self.p_layer_sizes) if self.dropout else len(other)
        hiddens = other[:split]
        masks = [None] + other[split:] if self.dropout else []
        new_states = self.pitch_model.forward(in_data, prev_hiddens=hiddens, dropout=masks)
        return new_states

    # We generate an output for each input, so it doesn't make sense to use the last output as an input.
    # Note that we assume the sentinel start value is already present
    # TEMP CHANGE: NO SENTINEL
    input_slice = self.input_mat[:, 0:-1]
    n_batch, n_time, n_note, n_ipn = input_slice.shape

    # time_inputs is a matrix (time, batch/note, input_per_note)
    time_inputs = input_slice.transpose((1, 0, 2, 3)).reshape((n_time, n_batch*n_note, n_ipn))
    num_time_parallel = time_inputs.shape[1]

    # apply dropout
    if self.dropout > 0:
        time_masks = theano_lstm.MultiDropout([(num_time_parallel, shape) for shape in self.t_layer_sizes], self.dropout)
    else:
        time_masks = []

    time_outputs_info = [initial_state_with_taps(layer, num_time_parallel) for layer in self.time_model.layers]
    time_result, _ = theano.scan(fn=step_time, sequences=[time_inputs], non_sequences=time_masks, outputs_info=time_outputs_info)

    self.time_thoughts = time_result

    # Now time_result is a list of matrix [layer](time, batch/note, hidden_states) for each layer but we only care about
    # the hidden state of the last layer.
    # Transpose to be (note, batch/time, hidden_states)
    last_layer = get_last_layer(time_result)
    n_hidden = last_layer.shape[2]
    time_final = get_last_layer(time_result).reshape((n_time, n_batch, n_note, n_hidden)).transpose((2, 1, 0, 3)).reshape((n_note, n_batch*n_time, n_hidden))

    # note_choices_inputs represents the last chosen note. Starts with [0,0], doesn't include last note.
    # In (note, batch/time, 2) format
    # Shape of start is thus (1, N, 2), concatenated with all but last element of output_mat transformed to (x, N, 2)
    start_note_values = T.alloc(np.array(0, dtype=np.int8), 1, time_final.shape[1], 2)
    correct_choices = self.output_mat[:, 1:, 0:-1, :].transpose((2, 0, 1, 3)).reshape((n_note-1, n_batch*n_time, 2))
    note_choices_inputs = T.concatenate([start_note_values, correct_choices], axis=0)

    # Together, this and the output from the last LSTM goes to the new LSTM, but rotated, so that the batches in
    # one direction are the steps in the other, and vice versa.
    note_inputs = T.concatenate([time_final, note_choices_inputs], axis=2)
    num_timebatch = note_inputs.shape[1]

    # apply dropout
    if self.dropout > 0:
        pitch_masks = theano_lstm.MultiDropout([(num_timebatch, shape) for shape in self.p_layer_sizes], self.dropout)
    else:
        pitch_masks = []

    note_outputs_info = [initial_state_with_taps(layer, num_timebatch) for layer in self.pitch_model.layers]
    note_result, _ = theano.scan(fn=step_note, sequences=[note_inputs], non_sequences=pitch_masks, outputs_info=note_outputs_info)

    self.note_thoughts = note_result

    # Now note_result is a list of matrix [layer](note, batch/time, onOrArticProb) for each layer but we only care about
    # the hidden state of the last layer.
    # Transpose to be (batch, time, note, onOrArticProb)
    note_final = get_last_layer(note_result).reshape((n_note, n_batch, n_time, 2)).transpose(1, 2, 0, 3)

    # The cost of the entire procedure is the negative log likelihood of the events all happening.
    # For the purposes of training, if the outputted probability is P, then the likelihood of seeing a 1 is P, and
    # the likelihood of seeing 0 is (1-P). So the likelihood is (1-P)(1-x) + Px = 2Px - P - x + 1
    # Since they are all binary decisions, and are all probabilities given all previous decisions, we can just
    # multiply the likelihoods, or, since we are logging them, add the logs.

    # Note that we mask out the articulations for those notes that aren't played, because it doesn't matter
    # whether or not those are articulated.

    # The padright is there because self.output_mat[:,:,:,0] -> 3D matrix with (b,x,y), but we need 3d tensor with
    # (b,x,y,1) instead
    active_notes = T.shape_padright(self.output_mat[:, 1:, :, 0])
    mask = T.concatenate([T.ones_like(active_notes), active_notes], axis=3)

    loglikelihoods = mask * T.log(2*note_final*self.output_mat[:, 1:] - note_final - self.output_mat[:, 1:] + 1 + self.epsilon)

    self.cost = T.neg(T.sum(loglikelihoods))

    updates, _, _, _, _ = create_optimization_updates(self.cost, self.params, method="adadelta")
    self.update_fun = theano.function(
        inputs=[self.input_mat, self.output_mat],
        outputs=self.cost,
        updates=updates,
        allow_input_downcast=True)

    self.update_thought_fun = theano.function(
        inputs=[self.input_mat, self.output_mat],
        outputs=ensure_list(self.time_thoughts) + ensure_list(self.note_thoughts) + [self.cost],
        allow_input_downcast=True)
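A quick numeric check of the likelihood identity quoted in the comments above, (1-P)(1-x) + Px = 2Px - P - x + 1 for binary x. This is illustrative only and not part of the model code:

# For x = 1 both sides reduce to P; for x = 0 both reduce to 1 - P.
for P in (0.1, 0.8):
    for x in (0, 1):
        lhs = (1 - P) * (1 - x) + P * x
        rhs = 2 * P * x - P - x + 1
        assert round(lhs, 9) == round(rhs, 9)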
def setup_train(self):
    # dimensions: (batch, time, notes, input_data) with input_data as in architecture
    self.input_mat = T.btensor4()
    # dimensions: (batch, time, notes, onOrArtic) with 0:on, 1:artic
    self.output_mat = T.btensor4()

    self.epsilon = np.spacing(np.float32(1.0))
    print "model-setup-train::Trace-1"

    def step_time(in_data, *other):
        other = list(other)
        split = -len(self.t_layer_sizes) if self.dropout else len(other)
        hiddens = other[:split]
        masks = [None] + other[split:] if self.dropout else []
        new_states = self.time_model.forward(in_data, prev_hiddens=hiddens, dropout=masks)
        return new_states

    def step_note(in_data, *other):
        other = list(other)
        split = -len(self.p_layer_sizes) if self.dropout else len(other)
        hiddens = other[:split]
        masks = [None] + other[split:] if self.dropout else []
        new_states = self.pitch_model.forward(in_data, prev_hiddens=hiddens, dropout=masks)
        return new_states

    # We generate an output for each input, so it doesn't make sense to use the last output as an input.
    # Note that we assume the sentinel start value is already present
    # TEMP CHANGE: NO SENTINEL
    print "model-setup-train::Trace-2"
    input_slice = self.input_mat[:, 0:-1]
    n_batch, n_time, n_note, n_ipn = input_slice.shape

    # time_inputs is a matrix (time, batch/note, input_per_note)
    time_inputs = input_slice.transpose((1, 0, 2, 3)).reshape((n_time, n_batch*n_note, n_ipn))
    num_time_parallel = time_inputs.shape[1]

    # apply dropout
    if self.dropout > 0:
        time_masks = MultiDropout([(num_time_parallel, shape) for shape in self.t_layer_sizes], self.dropout)
    else:
        time_masks = []
    print "model-setup-train::Trace-3"

    time_outputs_info = [initial_state_with_taps(layer, num_time_parallel) for layer in self.time_model.layers]
    time_result, _ = theano.scan(fn=step_time, sequences=[time_inputs], non_sequences=time_masks, outputs_info=time_outputs_info)
    print "model-setup-train::Trace-4"

    self.time_thoughts = time_result

    # Now time_result is a list of matrix [layer](time, batch/note, hidden_states) for each layer but we only care about
    # the hidden state of the last layer.
    # Transpose to be (note, batch/time, hidden_states)
    last_layer = get_last_layer(time_result)
    n_hidden = last_layer.shape[2]
    time_final = get_last_layer(time_result).reshape((n_time, n_batch, n_note, n_hidden)).transpose((2, 1, 0, 3)).reshape((n_note, n_batch*n_time, n_hidden))

    # note_choices_inputs represents the last chosen note. Starts with [0,0], doesn't include last note.
    # In (note, batch/time, 2) format
    # Shape of start is thus (1, N, 2), concatenated with all but last element of output_mat transformed to (x, N, 2)
    start_note_values = T.alloc(0, 1, time_final.shape[1], 2)
    correct_choices = self.output_mat[:, 1:, 0:-1, :].transpose((2, 0, 1, 3)).reshape((n_note-1, n_batch*n_time, 2))
    note_choices_inputs = T.concatenate([start_note_values, correct_choices], axis=0)
    print "model-setup-train::Trace-5"

    # Together, this and the output from the last LSTM goes to the new LSTM, but rotated, so that the batches in
    # one direction are the steps in the other, and vice versa.
    note_inputs = T.concatenate([time_final, note_choices_inputs], axis=2)
    num_timebatch = note_inputs.shape[1]

    # apply dropout
    if self.dropout > 0:
        pitch_masks = MultiDropout([(num_timebatch, shape) for shape in self.p_layer_sizes], self.dropout)
    else:
        pitch_masks = []
    print "model-setup-train::Trace-6"

    note_outputs_info = [initial_state_with_taps(layer, num_timebatch) for layer in self.pitch_model.layers]
    note_result, _ = theano.scan(fn=step_note, sequences=[note_inputs], non_sequences=pitch_masks, outputs_info=note_outputs_info)

    self.note_thoughts = note_result

    # Now note_result is a list of matrix [layer](note, batch/time, onOrArticProb) for each layer but we only care about
    # the hidden state of the last layer.
    # Transpose to be (batch, time, note, onOrArticProb)
    note_final = get_last_layer(note_result).reshape((n_note, n_batch, n_time, 2)).transpose(1, 2, 0, 3)
    print "model-setup-train::Trace-7"

    # The cost of the entire procedure is the negative log likelihood of the events all happening.
    # For the purposes of training, if the outputted probability is P, then the likelihood of seeing a 1 is P, and
    # the likelihood of seeing 0 is (1-P). So the likelihood is (1-P)(1-x) + Px = 2Px - P - x + 1
    # Since they are all binary decisions, and are all probabilities given all previous decisions, we can just
    # multiply the likelihoods, or, since we are logging them, add the logs.

    # Note that we mask out the articulations for those notes that aren't played, because it doesn't matter
    # whether or not those are articulated.

    # The padright is there because self.output_mat[:,:,:,0] -> 3D matrix with (b,x,y), but we need 3d tensor with
    # (b,x,y,1) instead
    active_notes = T.shape_padright(self.output_mat[:, 1:, :, 0])
    mask = T.concatenate([T.ones_like(active_notes), active_notes], axis=3)

    loglikelihoods = mask * T.log(2*note_final*self.output_mat[:, 1:] - note_final - self.output_mat[:, 1:] + 1 + self.epsilon)
    print "model-setup-train::Trace-8"

    self.cost = T.neg(T.sum(loglikelihoods))
    print "model-setup-train::Trace-9"

    updates, _, _, _, _ = create_optimization_updates(self.cost, self.params, method="adadelta")
    print "model-setup-train::Trace-10"
    self.update_fun = theano.function(
        inputs=[self.input_mat, self.output_mat],
        outputs=self.cost,
        updates=updates,
        allow_input_downcast=True)

    self.update_thought_fun = theano.function(
        inputs=[self.input_mat, self.output_mat],
        outputs=ensure_list(self.time_thoughts) + ensure_list(self.note_thoughts) + [self.cost],
        allow_input_downcast=True)
def LMmulcloss(self, kth, x, y, label, nextwords):
    # multiple label loss + language model loss
    # nextwords = START + words + END
    hidden = self.hidden_k(x, self.w, self.dicw, kth)
    print "hidden type : " + str(hidden.type)
    size = y.ndim
    y = T.addbroadcast(y, size - 1)
    embedding = T.sum(hidden * y, 0) / T.addbroadcast(
        T.cast(T.sum(y, 0), 'int16'), size - 2)
    # embedding = T.sum(hidden*y,0)/ T.addbroadcast(T.sum(y,0), size-2)
    print "embedding type : " + str(embedding.type)
    mulloss = (0. - T.sum(
        T.log(1. / (1. + T.exp(
            0. - (T.dot(embedding, self.w["mulw"]) + self.w["mulb"]) * label))))
    ) / embedding.shape[0]
    if self.hsoftmax:
        # pos language model
        hshape = self.hshape
        newhidden = hidden[:, :, :hidden.shape[2] / 2].reshape(
            (hidden.shape[0] * hidden.shape[1], hidden.shape[2] / 2))
        smax_group = T.nnet.h_softmax(newhidden, newhidden.shape[0],
                                      self.wordnum, hshape[0], hshape[1],
                                      self.w["posLMw1"], self.b["posLMb1"],
                                      self.w["posLMw2"], self.w["posLMb2"],
                                      nextwords[2:].ravel())
        losslist = T.neg(T.log(smax_group.reshape(nextwords[2:].shape)))
        mask = T.cast(T.neq(nextwords[2:], self.padding_id),
                      theano.config.floatX)
        losslist = losslist * mask
        posLMloss = T.cast(T.mean(T.sum(losslist, axis=0)),
                           theano.config.floatX)
        # neg language model
        newhidden = hidden[:, :, hidden.shape[2] / 2:].reshape(
            (hidden.shape[0] * hidden.shape[1], hidden.shape[2] / 2))
        smax_group = T.nnet.h_softmax(newhidden, newhidden.shape[0],
                                      self.wordnum, hshape[0], hshape[1],
                                      self.w["negLMw1"], self.b["negLMb1"],
                                      self.w["negLMw2"], self.w["negLMb2"],
                                      nextwords[:-2].ravel())
        losslist = T.neg(T.log(smax_group.reshape(nextwords[:-2].shape)))
        mask = T.cast(T.neq(nextwords[:-2], self.padding_id),
                      theano.config.floatX)
        losslist = losslist * mask
        negLMloss = T.cast(T.mean(T.sum(losslist, axis=0)),
                           theano.config.floatX)
    else:

        def categorical_loss(ihidden, words, w, b):
            scores = T.dot(ihidden, w) + b
            prep = T.exp(scores) / T.sum(T.exp(scores), 1).dimshuffle(0, 'x')
            loss = T.nnet.categorical_crossentropy(prep, words)
            return loss

        # newhidden = hidden.reshape((hidden.shape[0]*hidden.shape[1], hidden.shape[2]))
        # pos language model
        # prep = T.exp(T.dot(newhidden[:,:newhidden.shape[1]/2],self.w["posLMw"])+self.w["posLMb"])/T.sum(T.exp(T.dot(newhidden[:,:newhidden.shape[1]/2],self.w["posLMw"])+self.w["posLMb"]), 1).dimshuffle(0,'x')
        scores = T.dot(hidden[:, :, :hidden.shape[2] / 2],
                       self.w["posLMw"]) + self.w["posLMb"]
        scores = scores.reshape(
            (scores.shape[0] * scores.shape[1], scores.shape[2]))
        prep = T.exp(scores) / T.sum(T.exp(scores),
                                     scores.ndim - 1).dimshuffle((0, 'x'))
        # (len*batch)
        losslist = T.nnet.categorical_crossentropy(prep, nextwords[2:].ravel())
        losslist = losslist.reshape(nextwords[2:].shape)
        # losslist, _ = theano.scan(fn = categorical_loss, sequences = [hidden[:,:,:hidden.shape[2]/2], nextwords[2:]], outputs_info = None,
        #                           non_sequences = [self.w["posLMw"], self.w["posLMb"]])
        mask = T.cast(T.neq(nextwords[2:], self.padding_id),
                      theano.config.floatX)
        losslist = losslist * mask
        posLMloss = T.cast(T.mean(T.sum(losslist, axis=0)),
                           theano.config.floatX)
        # neg language model
        # prep = T.exp(T.dot(newhidden[:,newhidden.shape[1]/2:],self.w["negLMw"])+self.w["negLMb"])/T.sum(T.exp(T.dot(newhidden[:,newhidden.shape[1]/2:],self.w["negLMw"])+self.w["negLMb"]), 1).dimshuffle(0,'x')
        scores = T.dot(hidden[:, :, hidden.shape[2] / 2:],
                       self.w["negLMw"]) + self.w["negLMb"]
        scores = scores.reshape(
            (scores.shape[0] * scores.shape[1], scores.shape[2]))
        prep = T.exp(scores) / T.sum(T.exp(scores),
                                     scores.ndim - 1).dimshuffle((0, 'x'))
        # (len*batch)
        losslist = T.nnet.categorical_crossentropy(prep,
                                                   nextwords[0:-2].ravel())
        losslist = losslist.reshape(nextwords[0:-2].shape)
        # losslist, _ = theano.scan(fn = categorical_loss, sequences = [hidden[:,:,hidden.shape[2]/2:], nextwords[:-2]], outputs_info = None,
        #                           non_sequences = [self.w["negLMw"], self.w["negLMb"]])
        mask = T.cast(T.neq(nextwords[0:-2], self.padding_id),
                      theano.config.floatX)
        losslist = losslist * mask
        negLMloss = T.cast(T.mean(T.sum(losslist, axis=0)),
                           theano.config.floatX)
    return mulloss, posLMloss, negLMloss
def expit(v):
    # Logistic sigmoid: 1 / (1 + exp(-v)).
    return tt.inv(1. + tt.exp(tt.neg(v)))
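A quick equivalence check, not from the original source: since `expit` is the logistic function, it should match `tt.nnet.sigmoid` numerically (assumes Theano and NumPy are available and `expit` is defined as above):

import numpy as np
import theano
import theano.tensor as tt

v = tt.vector('v')
check = theano.function([v], [expit(v), tt.nnet.sigmoid(v)], allow_input_downcast=True)
a, b = check(np.array([-2., 0., 2.]))
print(np.allclose(a, b))  # True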
def neg(x):
    return T.neg(x)
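A tiny usage sketch, illustrative only (assumes Theano imported as `theano`/`T` and NumPy available): `T.neg` is elementwise negation, equivalent to the unary minus on a tensor.

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
negate = theano.function([x], neg(x), allow_input_downcast=True)
print(negate(np.array([1., -2., 3.])))  # [-1.  2. -3.]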