def pick_neg_log(self, pred, gold):
    # TODO make this a static function in both classes
    if not isinstance(gold, (int, np.int64)):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
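# Hedged usage sketch for pick_neg_log: with an integer gold it is the
# negative log-likelihood of the picked class; with a vector gold it is
# cross-entropy against that distribution. The 3-class values below are
# made-up illustrations, not outputs of the models in this file.
import dynet
dynet.renew_cg()
pred = dynet.softmax(dynet.inputVector([2.0, 0.5, -1.0]))  # predicted distribution
hard_loss = -dynet.log(dynet.pick(pred, 0))                # integer gold label 0
soft_gold = [0.7, 0.2, 0.1]                                # soft (distilled) gold
soft_loss = -dynet.sum_elems(
    dynet.cmult(dynet.inputVector(soft_gold), dynet.log(pred)))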
def predict(self, feature_vector, task_ids, train=False, soft_labels=False,
            temperature=None, dropout_rate=0.0, orthogonality_weight=0.0,
            domain_id=None):
    dynet.renew_cg()  # new graph
    feature_vector = feature_vector.toarray()
    feature_vector = np.squeeze(feature_vector, axis=0)

    # TODO this takes too long; can we speed this up somehow?
    input = dynet.inputVector(feature_vector)
    for i in range(self.h_layers):
        if train:  # add some noise
            input = dynet.noise(input, self.noise_sigma)
            input = dynet.dropout(input, dropout_rate)
        input = self.layers[i](input)

    outputs = []
    for task_id in task_ids:
        output = self.output_layers_dict[task_id](
            input, soft_labels=soft_labels, temperature=temperature)
        outputs.append(output)

    constraint, adv_loss = 0, 0
    if orthogonality_weight != 0:
        # put the orthogonality constraint either directly on the
        # output layer or on the hidden layer if it's an MLP
        F0_layer = self.output_layers_dict["F0"]
        F1_layer = self.output_layers_dict["F1"]
        F0_param = F0_layer.W_mlp if self.add_hidden else F0_layer.W
        F1_param = F1_layer.W_mlp if self.add_hidden else F1_layer.W
        F0_W = dynet.parameter(F0_param)
        F1_W = dynet.parameter(F1_param)
        # calculate the matrix product of the two task matrices
        matrix_product = dynet.transpose(F0_W) * F1_W
        # take the squared Frobenius norm by squaring
        # every element and then summing them
        squared_frobenius_norm = dynet.sum_elems(dynet.square(matrix_product))
        constraint += squared_frobenius_norm

    if domain_id is not None:
        # flip the gradient when back-propagating through here
        adv_input = dynet.flip_gradient(input)  # last state
        adv_output = self.adv_layer(adv_input)
        adv_loss = self.pick_neg_log(adv_output, domain_id)
    return outputs, constraint, adv_loss
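# A minimal standalone sketch of the orthogonality penalty computed in
# predict above: the squared Frobenius norm of F0_W^T * F1_W pushes the
# two task-specific weight matrices toward orthogonal subspaces. The
# parameter shapes (16x32) are illustrative assumptions.
import dynet
model = dynet.ParameterCollection()
p_F0 = model.add_parameters((16, 32))  # hypothetical F0 output weights
p_F1 = model.add_parameters((16, 32))  # hypothetical F1 output weights
dynet.renew_cg()
F0_W = dynet.parameter(p_F0)
F1_W = dynet.parameter(p_F1)
penalty = dynet.sum_elems(dynet.square(dynet.transpose(F0_W) * F1_W))
# would enter the objective as: loss + orthogonality_weight * penalty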
def get_coverage(self, a_t, prev_coverage, training=True):
    if not self.coverage:
        if not training:
            return None
        return dy.scalarInput(0), None
    coverage = a_t + prev_coverage
    if training:
        # coverage loss: sum_i min(a_t[i], c_t[i]) against the coverage
        # *before* adding a_t (See et al., 2017); comparing against the
        # updated coverage would always select a_t and yield a constant
        return (
            dy.sum_elems(
                dy.min_dim(dy.concatenate([a_t, prev_coverage], d=1), d=1)),
            coverage,
        )
    return coverage
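# A hedged sketch of driving get_coverage across decoder steps: the
# coverage vector accumulates attention mass, and each step is penalized
# by sum_i min(a_t[i], c_t[i]) (See et al., 2017). The attention vectors
# are fabricated inputs, not real decoder output.
import dynet as dy
dy.renew_cg()
attention_steps = [dy.inputVector([0.7, 0.2, 0.1]),
                   dy.inputVector([0.1, 0.6, 0.3])]
coverage = dy.inputVector([0.0, 0.0, 0.0])  # nothing covered yet
cov_loss = dy.scalarInput(0)
for a_t in attention_steps:
    stacked = dy.concatenate([a_t, coverage], d=1)  # source_len x 2 matrix
    cov_loss = cov_loss + dy.sum_elems(dy.min_dim(stacked, d=1))
    coverage = a_t + coverage                       # update after scoring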
def __call__(self, x, soft_labels=False, temperature=None, train=False):
    if self.mlp:
        act = self.mlp_activation
        x_in = act(self.W_mlp * x + self.b_mlp)
    else:
        x_in = x
    logits = self.W * x_in + self.b
    if soft_labels and temperature:
        # calculate the soft labels smoothed with the temperature
        # see Distilling the Knowledge in a Neural Network (Hinton et al., 2015)
        elems = dynet.exp(logits / temperature)
        return dynet.cdiv(elems, dynet.sum_elems(elems))
    if self.act:
        return self.act(logits)
    return logits
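# The temperature-smoothed branch above is algebraically a softmax over
# logits / temperature; a hedged equivalence check with arbitrary values
# (a temperature > 1 softens the distribution for distillation):
import dynet
dynet.renew_cg()
logits = dynet.inputVector([3.0, 1.0, 0.2])
temperature = 2.0
elems = dynet.exp(logits / temperature)
manual = dynet.cdiv(elems, dynet.sum_elems(elems))
builtin = dynet.softmax(logits / temperature)
# manual.npvalue() and builtin.npvalue() should agree up to float error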
def get_loss_and_prediction(self, input, targets, epsilon=1e-10):
    # (epsilon is unused; kept for signature compatibility)
    layers = self.compute_output_layer(input)
    # build a multi-hot target vector over the output labels
    ys = dy.vecInput(self.dim_out)
    ys.set([1 if i in targets else 0 for i in range(self.dim_out)])
    loss = dy.binary_log_loss(layers[-1], ys)
    # predict every label whose probability exceeds 0.5
    output = layers[-1].value()
    res = {i for i, v in enumerate(output) if v > 0.5}
    return dy.sum_elems(loss), res
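# Self-contained sketch of the multi-label setup behind
# get_loss_and_prediction: sigmoid outputs scored with binary_log_loss
# against a multi-hot target, then thresholded at 0.5. The layer sizes
# and target set are made-up assumptions.
import dynet as dy
model = dy.ParameterCollection()
pW = model.add_parameters((4, 8))  # hypothetical 8-dim input, 4 labels
pb = model.add_parameters((4,))
dy.renew_cg()
x = dy.inputVector([0.1] * 8)
probs = dy.logistic(dy.parameter(pW) * x + dy.parameter(pb))
gold = dy.inputVector([1, 0, 1, 0])  # multi-hot encoding of target set {0, 2}
loss = dy.binary_log_loss(probs, gold)
predicted = {i for i, v in enumerate(probs.value()) if v > 0.5}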
def __call__(self, x, soft_labels=False, temperature=None):
    if self.mlp:
        W_mlp = dynet.parameter(self.W_mlp)
        b_mlp = dynet.parameter(self.b_mlp)
        act = self.mlp_activation
        x_in = act(W_mlp * x + b_mlp)
    else:
        x_in = x
    # from params to expressions
    W = dynet.parameter(self.W)
    b = dynet.parameter(self.b)
    # the small constant guards against log(0) in downstream losses
    logits = (W * x_in + b) + dynet.scalarInput(1e-15)
    if soft_labels and temperature:
        # calculate the soft labels smoothed with the temperature
        # see Distilling the Knowledge in a Neural Network (Hinton et al., 2015)
        elems = dynet.exp(logits / temperature)
        return dynet.cdiv(elems, dynet.sum_elems(elems))
    return self.act(logits)
def get_loss(self, input, targets, epsilon=1e-10):
    # (epsilon is unused; kept for signature compatibility)
    layers = self.compute_output_layer(input)
    # build a multi-hot target vector over the output labels
    ys = dy.vecInput(self.dim_out)
    ys.set([1 if i in targets else 0 for i in range(self.dim_out)])
    loss = dy.binary_log_loss(layers[-1], ys)
    return dy.sum_elems(loss)
def __call__(self, x, soft_labels=False, temperature=None, train=False):
    if self.mlp:
        W_mlp = dynet.parameter(self.W_mlp)
        b_mlp = dynet.parameter(self.b_mlp)
        act = self.mlp_activation
        x_in = act(W_mlp * x + b_mlp)
    else:
        x_in = x
    # from params to expressions
    W = dynet.parameter(self.W)
    b = dynet.parameter(self.b)
    logits = W * x_in + b
    if soft_labels and temperature:
        # calculate the soft labels smoothed with the temperature
        # see Distilling the Knowledge in a Neural Network (Hinton et al., 2015)
        elems = dynet.exp(logits / temperature)
        return dynet.cdiv(elems, dynet.sum_elems(elems))
    if self.act:
        return self.act(logits)
    return logits
def pick_neg_log(self, pred, gold):
    if hasattr(gold, "__len__"):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
def predict(self, word_indices, char_indices, task_id, train=False,
            soft_labels=False, temperature=None, orthogonality_weight=0.0,
            domain_id=None):
    """
    Predict tags for a sentence represented as char+word embeddings.
    :param domain_id: predict adversarial loss if a domain id is provided
    """
    dynet.renew_cg()  # new graph

    char_emb = []
    rev_char_emb = []
    wfeatures = [self.wembeds[w] for w in word_indices]

    if self.c_in_dim > 0:
        # get representation for words
        for chars_of_token in char_indices:
            char_feats = [self.cembeds[c] for c in chars_of_token]
            # use last state as word representation
            f_char, b_char = self.char_rnn.predict_sequence(char_feats, char_feats)
            last_state = f_char[-1]
            rev_last_state = b_char[-1]
            char_emb.append(last_state)
            rev_char_emb.append(rev_last_state)
        features = [
            dynet.concatenate([w, c, rev_c])
            for w, c, rev_c in zip(wfeatures, char_emb, rev_char_emb)
        ]
    else:
        features = wfeatures

    if train:  # only do at training time
        features = [dynet.noise(fe, self.noise_sigma) for fe in features]

    output_expected_at_layer = self.h_layers - 1

    # go through layers
    prev = features
    prev_rev = features
    num_layers = self.h_layers
    constraint = 0
    adv_loss = 0
    for i in range(num_layers):
        predictor = self.predictors["inner"][i]
        forward_sequence, backward_sequence = predictor.predict_sequence(
            prev, prev_rev)
        if i > 0 and self.activation:
            # activation between LSTM layers
            forward_sequence = [self.activation(s) for s in forward_sequence]
            backward_sequence = [self.activation(s) for s in backward_sequence]

        if i == output_expected_at_layer:
            concat_layer = [
                dynet.concatenate([f, b])
                for f, b in zip(forward_sequence, reversed(backward_sequence))
            ]
            if train and self.noise_sigma > 0.0:
                concat_layer = [
                    dynet.noise(fe, self.noise_sigma) for fe in concat_layer
                ]

            if task_id not in ["src", "trg"]:
                output_predictor = self.predictors["output_layers_dict"][task_id]
                output = output_predictor.predict_sequence(
                    concat_layer, soft_labels=soft_labels,
                    temperature=temperature)
            else:
                # one src example for all three outputs
                output = []  # in this case it is a list
                for t_id in self.task_ids:
                    output_predictor = self.predictors["output_layers_dict"][t_id]
                    output_t = output_predictor.predict_sequence(
                        concat_layer, soft_labels=soft_labels,
                        temperature=temperature)
                    output.append(output_t)

            if orthogonality_weight != 0 and task_id != "Ft":
                # put the orthogonality constraint either directly on the
                # output layer or on the hidden layer if it's an MLP;
                # apply it only between F0 and F1
                builder = self.predictors["output_layers_dict"]["F0"].network_builder
                task_param = builder.W_mlp if self.add_hidden else builder.W
                task_W = dynet.parameter(task_param)
                builder = self.predictors["output_layers_dict"]["F1"].network_builder
                other_param = builder.W_mlp if self.add_hidden else builder.W
                other_task_W = dynet.parameter(other_param)
                # calculate the matrix product of the two task matrices
                matrix_product_1 = dynet.transpose(task_W) * other_task_W
                # take the squared Frobenius norm by squaring
                # every element and then summing them
                squared_frobenius_norm = dynet.sum_elems(
                    dynet.square(matrix_product_1))
                constraint = squared_frobenius_norm

            if domain_id is not None:
                # flip the gradient when back-propagating through here
                adv_input = dynet.flip_gradient(concat_layer[-1])  # last state
                adv_output = self.adv_layer(adv_input)
                adv_loss = self.pick_neg_log(adv_output, domain_id)

            # output is a list if task_id == 'src'
            return output, constraint, adv_loss
        prev = forward_sequence
        prev_rev = backward_sequence
    raise Exception("output layer was never reached; check h_layers")
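# Minimal sketch of the gradient-reversal trick used for the adversarial
# loss in both predict methods: dynet.flip_gradient is the identity on
# the forward pass but negates gradients on the backward pass, so the
# shared encoder is trained to *fool* the domain classifier (Ganin &
# Lempitsky, 2015). Layer sizes and the domain id are illustrative.
import dynet
model = dynet.ParameterCollection()
pW = model.add_parameters((2, 16))  # hypothetical 2-domain classifier
pb = model.add_parameters((2,))
dynet.renew_cg()
hidden = dynet.inputVector([0.5] * 16)           # stand-in for the last LSTM state
adv_input = dynet.flip_gradient(hidden)          # identity forward, -gradient backward
adv_probs = dynet.softmax(dynet.parameter(pW) * adv_input + dynet.parameter(pb))
adv_loss = -dynet.log(dynet.pick(adv_probs, 0))  # domain_id = 0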
def pick_neg_log(self, pred, gold):
    if not isinstance(gold, int):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))