def selection_by_tree(self, tree, mode, idx=0):
    """Score the candidate actions for ``tree`` and return a softmax over them.

    Candidates are obtained from ``self._select_by_tree``. If the first
    call (third argument ``True``) yields no pairs, the search is either
    repeated with ``False`` (when ``opt['allow_partial']`` is off) or
    abandoned with a message (when it is on).

    :param tree: tree object forwarded to ``_select_by_tree``
    :param mode: forwarded unchanged to ``_select_by_tree``
    :param idx: index into ``self.history``; only read when
        ``opt['use_history']`` is set
    :return: ``(softmax expression, pairs)`` or ``(None, None)`` on early stop
    """
    candidates, pairs = self._select_by_tree(tree, mode, True)
    if len(pairs) == 0:
        if self.opt['allow_partial']:
            print('early stop! discard {} / {}.'.format(
                len(tree.V), len(tree.terms)))
            return None, None
        candidates, pairs = self._select_by_tree(tree, mode, False)

    params = self.model_parameters
    single_layer = self.opt['one_layer']
    W1_rl = dy.parameter(params['W1_rl'])
    b1_rl = dy.parameter(params['b1_rl'])
    if not single_layer:
        W2_rl = dy.parameter(params['W2_rl'])
        b2_rl = dy.parameter(params['b2_rl'])

    # One row per candidate: stack the candidate vectors as columns, then
    # transpose.
    features = dy.transpose(dy.concatenate_cols(candidates))

    if self.opt['use_history']:
        # The history output is pushed through the layer(s) and the result
        # is multiplied by the candidate rows.
        projected = dy.rectify(W1_rl * self.history[idx].output() + b1_rl)
        if not single_layer:
            projected = dy.rectify(W2_rl * projected + b2_rl)
        pr = features * projected
    elif single_layer:
        pr = features * W1_rl + b1_rl
    else:
        pr = dy.rectify(features * W2_rl + b2_rl) * W1_rl + b1_rl

    # Flatten to a vector with one entry per candidate pair.
    pr = dy.reshape(pr, (len(pairs), ))
    return dy.softmax(pr), pairs
def associate_parameters(self):
    """Wrap the stored ``_U``/``_V``/``_W`` parameters with ``dy.parameter``.

    The results are stored as ``self.U``, ``self.V`` and ``self.W``. When
    ``self.encoder_type`` is ``'attention'``, ``_P`` is wrapped into
    ``self.P`` as well.
    """
    for name in ('U', 'V', 'W'):
        setattr(self, name, dy.parameter(getattr(self, '_' + name)))
    if self.encoder_type == 'attention':
        self.P = dy.parameter(self._P)
def predict(self, feature_vector, task_ids, train=False, soft_labels=False,
            temperature=None, dropout_rate=0.0, orthogonality_weight=0.0,
            domain_id=None):
    """Run the feed-forward network on one example for the given tasks.

    :param feature_vector: object exposing ``toarray()``; the result is
        squeezed along axis 0 before being fed to the network
    :param task_ids: keys into ``self.output_layers_dict``
    :param train: when true, noise and dropout are applied before each
        hidden layer
    :param soft_labels: forwarded to every output layer
    :param temperature: forwarded to every output layer
    :param dropout_rate: dropout probability used at training time
    :param orthogonality_weight: when non-zero, the squared Frobenius norm
        of ``F0_W^T * F1_W`` is returned as the constraint
    :param domain_id: when given, an adversarial loss is computed on the
        gradient-flipped last hidden representation
    :return: ``(outputs, constraint, adv_loss)``; the last two are ``0``
        when not requested
    """
    dynet.renew_cg()  # new graph

    dense = np.squeeze(feature_vector.toarray(), axis=0)
    # TODO this takes too long; can we speed this up somehow?
    hidden = dynet.inputVector(dense)

    for layer_index in range(self.h_layers):
        if train:
            # Regularise only at training time: noise first, then dropout.
            hidden = dynet.noise(hidden, self.noise_sigma)
            hidden = dynet.dropout(hidden, dropout_rate)
        hidden = self.layers[layer_index](hidden)

    outputs = [
        self.output_layers_dict[task_id](
            hidden, soft_labels=soft_labels, temperature=temperature)
        for task_id in task_ids
    ]

    constraint, adv_loss = 0, 0
    if orthogonality_weight != 0:
        # The constraint sits on the hidden layer of the output MLP when
        # there is one, otherwise directly on the output layer.
        F0_layer = self.output_layers_dict["F0"]
        F1_layer = self.output_layers_dict["F1"]
        F0_param = F0_layer.W_mlp if self.add_hidden else F0_layer.W
        F1_param = F1_layer.W_mlp if self.add_hidden else F1_layer.W
        F0_W = dynet.parameter(F0_param)
        F1_W = dynet.parameter(F1_param)
        # Squared Frobenius norm: square every entry of the product, sum.
        product = dynet.transpose(F0_W) * F1_W
        constraint += dynet.sum_elems(dynet.square(product))

    if domain_id is not None:
        # Flip the gradient when back-propagating through here.
        adv_output = self.adv_layer(dynet.flip_gradient(hidden))
        adv_loss = self.pick_neg_log(adv_output, domain_id)

    return outputs, constraint, adv_loss
def compute_output_layer(self, input):
    """Propagate ``input`` through every layer and return all activations.

    Each entry of ``self.parameters`` is a ``(weights, bias)`` pair. The
    final layer uses ``dy.logistic``; all earlier layers use
    ``self.activation``.

    :param input: expression fed to the first layer
    :return: list starting with ``input`` followed by each layer's output
    """
    activations = [input]
    last_index = len(self.parameters) - 1
    for layer_index, pair in enumerate(self.parameters):
        W = dy.parameter(pair[0])
        b = dy.parameter(pair[1])
        squash = dy.logistic if layer_index == last_index else self.activation
        activations.append(squash(W * activations[-1] + b))
    return activations
def generate(self, num, limit=40, beam=3):
    """Generate ``num`` strings with a sampling-based beam search.

    :param num: number of strings to produce
    :param limit: hypotheses reaching this many characters are finalised
    :param beam: number of samples drawn per state, number of states kept
        per step, and number of finished hypotheses required to stop
    :return: list with the lowest-cost finished hypothesis of each search
    """
    dy.renew_cg()
    W = dy.parameter(self.W)
    b = dy.parameter(self.b)

    def cost_of(entry):
        # Every tuple keeps its accumulated negative log-probability last.
        return entry[-1]

    results = []
    for _ in range(num):
        # Initialize the LSTM state with the EOW token.
        root = self.lstm.initial_state()
        root = root.add_input(self.lookup[self.c2i[EOW]])
        frontier = [('', root, 0)]
        finished = []

        while len(finished) < beam and len(frontier) > 0:
            expanded = []
            for prefix, state, cost in frontier:
                # Cut off when the character limit is reached.
                if len(prefix) >= limit:
                    finished.append((prefix, cost))
                    continue

                scores = dy.affine_transform([b, W, state.output()])
                distribution = dy.softmax(scores).npvalue()

                # Draw ``beam`` samples from the current distribution.
                for _sample in range(beam):
                    char_index = sample_softmax(distribution)
                    char = self.i2c[char_index]
                    new_cost = cost - np.log(distribution[char_index])
                    if char == EOW:
                        # End of word: the hypothesis is complete.
                        finished.append((prefix, new_cost))
                    else:
                        expanded.append((
                            prefix + char,
                            state.add_input(self.lookup[char_index]),
                            new_cost,
                        ))

            # Keep only the cheapest ``beam`` states for the next step.
            expanded.sort(key=cost_of)
            frontier = expanded[:beam]

        finished.sort(key=cost_of)
        results.append(finished[0][0])
    return results
def get_top_k_paths(self, all_paths, relation_index, threshold):
    """Rank the paths for every relation by their predicted score.

    Each path is embedded with ``get_path_embedding``, passed through the
    classification layer(s) and a softmax. For every relation the paths
    are then sorted by descending score.

    :param all_paths: sequence of paths to score
    :param relation_index: collection with one entry per relation; only
        its length is used
    :param threshold: minimum score for a path to be kept, or ``None`` to
        keep every path
    :return: one list per relation holding ``(path, score)`` tuples in
        descending score order; every list is empty when ``all_paths`` is
        empty
    """
    num_relations = len(relation_index)

    # np.vstack raises ValueError on an empty list, so an empty input is
    # answered directly with "no paths for any relation".
    if len(all_paths) == 0:
        return [[] for _ in range(num_relations)]

    builder = self.builder
    model_parameters = self.model_parameters
    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']
    has_hidden = self.num_hidden_layers == 1

    path_scores = []
    for i, path in enumerate(all_paths):
        if i % 1000 == 0:
            # Start a fresh graph every 1000 paths; the parameter
            # expressions are re-created for the new graph.
            dy.renew_cg()
            W1 = dy.parameter(model_parameters['W1'])
            b1 = dy.parameter(model_parameters['b1'])
            W2 = None
            b2 = None
            if has_hidden:
                W2 = dy.parameter(model_parameters['W2'])
                b2 = dy.parameter(model_parameters['b2'])

        path_embedding = get_path_embedding(builder, lemma_lookup, pos_lookup,
                                            dep_lookup, dir_lookup, path)

        if self.use_xy_embeddings:
            # Zero vectors stand in for the x and y word embeddings.
            zero_word = dy.inputVector([0.0] * self.lemma_embeddings_dim)
            path_embedding = dy.concatenate(
                [zero_word, path_embedding, zero_word])

        h = W1 * path_embedding + b1
        if has_hidden:
            h = W2 * dy.tanh(h) + b2

        path_scores.append(dy.softmax(h).npvalue().T)

    # Rows are paths, columns are relations.
    path_scores = np.vstack(path_scores)

    top_paths = []
    for relation in range(num_relations):
        # Negating the column makes argsort return descending order.
        order = np.argsort(-path_scores[:, relation])
        top_paths.append([
            (all_paths[index], path_scores[index, relation])
            for index in order
            if threshold is None or path_scores[index, relation] >= threshold
        ])
    return top_paths
def associate_parameters(self):
    """Wrap every stored ``_<name>`` parameter with ``dy.parameter``.

    The resulting expressions are stored on ``self`` under the same name
    without the leading underscore.
    """
    names = ('Wd', 'bd', 'Wa', 'Ua', 'va', 'Wr', 'Ur', 'Vr', 'Wo')
    for name in names:
        setattr(self, name, dy.parameter(getattr(self, '_' + name)))
def do_cpu():
    """Run the benchmark graph once: a seven-factor product of ``cpW``.

    The squared distance of the product to itself is evaluated forward
    and then backward.
    """
    C.renew_cg()
    base = C.parameter(cpW)
    product = base
    # Six multiplications give the seven-factor product base * ... * base.
    for _ in range(6):
        product = product * base
    distance = C.squared_distance(product, product)
    distance.value()
    distance.backward()
def select_action(tree, policy, choose_max=False, return_prob=False,
                  mode='train'):
    """Pick the next action for ``tree`` from the policy's distribution.

    :param tree: tree object forwarded to ``policy.selection_by_tree``
    :param policy: object providing ``selection_by_tree``,
        ``model_parameters``, ``history``, ``saved_actions`` and
        ``update_history``
    :param choose_max: take the arg-max action instead of sampling
    :param return_prob: return the full candidate list and probabilities
        instead of the third value described below
    :param mode: forwarded to ``policy.selection_by_tree``
    :return: when the policy yields no pairs, a tuple of ``None`` values
        (four if ``return_prob`` else three). Otherwise
        ``(pair, prob, pairs, probs)`` if ``return_prob``, else
        ``(pair, prob, mean of p * log p)``
    :raises Exception: any error raised while evaluating or sampling the
        distribution is re-raised after the diagnostics have run
    """
    prob, pairs = policy.selection_by_tree(tree, mode)
    if pairs is None:
        if return_prob:
            return None, None, None, None
        return None, None, None

    # Turn numpy floating-point warnings into exceptions so that a broken
    # distribution is caught here.
    with np.errstate(all='raise'):
        try:
            prob_v = prob.npvalue()
            if choose_max:
                idx = np.argmax(prob_v)
            else:
                idx = np.random.choice(range(len(prob_v)),
                                       p=prob_v / np.sum(prob_v))
        except Exception:
            # Hand the parameters, history and distribution to check_error,
            # then propagate the original error. Without the re-raise
            # execution would continue with ``idx`` unbound and fail with
            # an unrelated UnboundLocalError.
            for para in policy.model_parameters:
                check_error(para, dy.parameter(policy.model_parameters[para]))
            check_error('history', policy.history.output())
            check_error('pr', prob)
            raise

    action = prob[idx]
    policy.saved_actions[-1].append(action)
    policy.update_history(pairs[idx])
    if return_prob:
        return pairs[idx], prob_v[idx], pairs, prob_v
    return pairs[idx], prob_v[idx], dy.mean_elems(dy.cmult(prob, dy.log(prob)))
def train_fake(self, input, targets, epsilon = 1e-10):
    """Accumulate the negative log-likelihood of ``targets``.

    The LSTM is initialised with ``input`` and a zero vector of size
    ``self.dim_lstm``. ``targets[0]`` is fed first without contributing
    to the loss; every later target is scored before being fed.

    :param input: expression used as the first initial state
    :param targets: sequence of target indices
    :param epsilon: constant added to the scores before the softmax
    :return: summed loss expression
    """
    state = self.lstm.initial_state([input, dy.zeros(self.dim_lstm)])
    loss = dy.zeros(1)
    W = dy.parameter(self.h2o)
    b = dy.parameter(self.b)

    first, rest = targets[0], targets[1:]
    state = state.add_input(self.lu[first])
    for target in rest:
        scores = W * state.output() + b + epsilon
        loss += dy.pickneglogsoftmax(scores, target)
        state = state.add_input(self.lu[target])
    return loss
def __call__(self, x, soft_labels=False, temperature=None):
    """Apply the (optional) hidden MLP layer and the output layer to ``x``.

    :param x: input expression
    :param soft_labels: request temperature-smoothed probabilities
    :param temperature: smoothing temperature; soft labels are only
        produced when this is truthy as well
    :return: smoothed probabilities when soft labels are requested,
        otherwise ``self.act`` applied to the logits
    """
    hidden = x
    if self.mlp:
        W_mlp = dynet.parameter(self.W_mlp)
        b_mlp = dynet.parameter(self.b_mlp)
        hidden = self.mlp_activation(W_mlp * x + b_mlp)

    # from params to expressions
    W = dynet.parameter(self.W)
    b = dynet.parameter(self.b)
    logits = (W * hidden + b) + dynet.scalarInput(1e-15)

    if not (soft_labels and temperature):
        return self.act(logits)

    # Soft labels smoothed with the temperature, see
    # "Distilling the Knowledge in a Neural Network".
    exponentials = dynet.exp(logits / temperature)
    return dynet.cdiv(exponentials, dynet.sum_elems(exponentials))
def __call__(self, x, soft_labels=False, temperature=None, train=False):
    """Apply the (optional) hidden MLP layer and the output layer to ``x``.

    :param x: input expression
    :param soft_labels: request temperature-smoothed probabilities
    :param temperature: smoothing temperature; soft labels are only
        produced when this is truthy as well
    :param train: accepted but not read by this method
    :return: smoothed probabilities when soft labels are requested;
        otherwise ``self.act(logits)`` if an activation is set, else the
        raw logits
    """
    hidden = x
    if self.mlp:
        W_mlp = dynet.parameter(self.W_mlp)
        b_mlp = dynet.parameter(self.b_mlp)
        hidden = self.mlp_activation(W_mlp * x + b_mlp)

    # from params to expressions
    W = dynet.parameter(self.W)
    b = dynet.parameter(self.b)
    logits = W * hidden + b

    if soft_labels and temperature:
        # Soft labels smoothed with the temperature, see
        # "Distilling the Knowledge in a Neural Network".
        exponentials = dynet.exp(logits / temperature)
        return dynet.cdiv(exponentials, dynet.sum_elems(exponentials))

    return self.act(logits) if self.act else logits
def train_batch(self, words):
    """Build the mean per-word loss expression for a batch of words.

    Each word is converted with ``self.word_to_indices``. Every index is
    fed to the LSTM and the following index is scored as the target. The
    per-word sum is divided by the index count, the batch sum by the
    number of words.

    :param words: sequence of words
    :return: loss expression for the batch
    """
    W = dy.parameter(self.W)
    b = dy.parameter(self.b)

    batch_losses = []
    for word in words:
        step_losses = []
        indices = self.word_to_indices(word)
        state = self.lstm.initial_state()
        for current, following in zip(indices, indices[1:]):
            state = state.add_input(self.lookup[current])
            scores = dy.affine_transform([b, W, state.output()])
            step_losses.append(dy.pickneglogsoftmax(scores, following))
        batch_losses.append(dy.esum(step_losses) / len(indices))
    return dy.esum(batch_losses) / len(words)
def do_cpu():
    """Time a seven-factor 1000x1000 matrix product in DyNet.

    Forward and backward passes are both timed; the elapsed seconds are
    printed under the label "CPU time:".
    """
    import _dynet as C

    C.init()
    collection = C.Model()
    weights = collection.add_parameters((1000, 1000))

    started = time.time()
    C.renew_cg()
    base = C.parameter(weights)
    product = base
    # Six multiplications give the seven-factor product base * ... * base.
    for _ in range(6):
        product = product * base
    distance = C.squared_distance(product, product)
    distance.value()
    distance.backward()
    print("CPU time:", time.time() - started)
def do_cpu():
    """Benchmark a chained 1000x1000 matrix product after ``_dynet.init()``.

    Builds the seven-factor product, evaluates the squared distance of the
    result to itself forward and backward, and prints the wall-clock time
    prefixed with "CPU time:".
    """
    import _dynet as C

    C.init()
    cm = C.Model()
    cpW = cm.add_parameters((1000, 1000))

    start = time.time()
    C.renew_cg()
    factor = C.parameter(cpW)
    chained = factor
    multiplications_left = 6  # seven factors in total
    while multiplications_left:
        chained = chained * factor
        multiplications_left -= 1
    z = C.squared_distance(chained, chained)
    z.value()
    z.backward()
    print("CPU time:", time.time() - start)
def do_gpu():
    """Time a seven-factor 1000x1000 matrix product with GPU:0 requested.

    The ``--dynet-devices GPU:0`` arguments are appended to ``sys.argv``
    before ``_dynet.init()`` is called. The elapsed seconds are printed
    under the label "GPU time:".
    """
    import sys
    import _dynet as G

    sys.argv.extend(['--dynet-devices', 'GPU:0'])
    G.init()
    collection = G.Model()
    weights = collection.add_parameters((1000, 1000))

    started = time.time()
    G.renew_cg()
    base = G.parameter(weights)
    product = base
    # Six multiplications give the seven-factor product base * ... * base.
    for _ in range(6):
        product = product * base
    distance = G.squared_distance(product, product)
    distance.value()
    distance.backward()
    print("GPU time:", time.time() - started)
def do_gpu():
    """Benchmark a chained 1000x1000 matrix product with GPU:0 requested.

    ``--dynet-devices GPU:0`` is added to ``sys.argv`` ahead of
    ``_dynet.init()``. The forward/backward wall-clock time is printed
    prefixed with "GPU time:".
    """
    import sys
    import _dynet as G

    for argument in ('--dynet-devices', 'GPU:0'):
        sys.argv.append(argument)
    G.init()
    gm = G.Model()
    gpW = gm.add_parameters((1000, 1000))

    start = time.time()
    G.renew_cg()
    factor = G.parameter(gpW)
    chained = factor
    multiplications_left = 6  # seven factors in total
    while multiplications_left:
        chained = chained * factor
        multiplications_left -= 1
    z = G.squared_distance(chained, chained)
    z.value()
    z.backward()
    print("GPU time:", time.time() - start)
def associate_parameters(self):
    """Wrap every entry of ``_Ws`` and ``_bs`` with ``dy.parameter``.

    The resulting lists are stored as ``self.Ws`` and ``self.bs``.
    """
    self.Ws = list(map(dy.parameter, self._Ws))
    self.bs = list(map(dy.parameter, self._bs))
def process_one_instance(builder, model, model_parameters, instance,
                         path_cache, update=True, dropout=0.0,
                         x_y_vectors=None, num_hidden_layers=0):
    """
    Classify a single term-pair from its count-weighted average path embedding
    :param builder: the LSTM builder
    :param model: the LSTM model (not read by this function)
    :param model_parameters: the model parameters
    :param instance: a Counter object with paths; the empty path is added
     to it in place when it holds no paths
    :param path_cache: the cache for path embeddings
    :param update: whether to update the lemma embeddings
    :param dropout: word dropout rate
    :param x_y_vectors: the current word vectors for x and y
    :param num_hidden_layers The number of hidden layers for the term-pair classification network
    :return: the softmax over the classification network's output
    """
    has_hidden = num_hidden_layers == 1

    W1 = dy.parameter(model_parameters['W1'])
    b1 = dy.parameter(model_parameters['b1'])
    if has_hidden:
        W2 = dy.parameter(model_parameters['W2'])
        b2 = dy.parameter(model_parameters['b2'])

    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']

    # A pair without any path is represented by the empty path.
    if len(instance) == 0:
        instance[EMPTY_PATH] = 1

    # Average of the path embeddings, weighted by their counts.
    num_paths = sum(instance.itervalues())
    weighted_embeddings = []
    for path, count in instance.iteritems():
        embedding = get_path_embedding_from_cache(
            path_cache, builder, lemma_lookup, pos_lookup, dep_lookup,
            dir_lookup, path, update, dropout)
        weighted_embeddings.append(embedding * count)
    input_vec = dy.esum(weighted_embeddings) * (1.0 / num_paths)

    # Concatenate x and y embeddings around the path vector.
    if x_y_vectors is not None:
        x_vector = dy.lookup(lemma_lookup, x_y_vectors[0])
        y_vector = dy.lookup(lemma_lookup, x_y_vectors[1])
        input_vec = dy.concatenate([x_vector, input_vec, y_vector])

    h = W1 * input_vec + b1
    if has_hidden:
        h = W2 * dy.tanh(h) + b2

    return dy.softmax(h)
def predict_greedy(self, encoder, input_seq):
    """Decode ``input_seq`` greedily with the attention decoder.

    :param encoder: object providing ``encode_batch``
    :param input_seq: input symbols
    :return: ``(predicted symbols without the last one, attention rows)``;
        attention rows are only collected when ``self.plot`` is set.
        An empty ``input_seq`` returns a bare ``[]``
    """
    dn.renew_cg()
    for name in ('readout', 'bias', 'w_c', 'u_a', 'v_a', 'w_a'):
        setattr(self, name, dn.parameter(self.params[name]))

    attention_rows = []

    # NOTE(review): this early return is a bare list while the normal
    # return is a 2-tuple; confirm callers handle both shapes.
    if len(input_seq) == 0:
        return []

    # encode input sequence
    encoded, _input_masks = encoder.encode_batch([input_seq])

    # initialize the decoder rnn
    state = self.decoder_rnn.initial_state()

    # The first decoder input is the BEGIN_SEQ embedding concatenated with
    # the special padding vector.
    feedback = dn.concatenate([
        self.output_lookup[self.y2int[common.BEGIN_SEQ]],
        self.init_lookup[0],
    ])

    predicted = []
    step = 0
    while self.max_prediction_len is None or step < self.max_prediction_len:
        state = state.add_input(feedback)

        # perform attention step
        context, alphas = self.attend(encoded, state.output())
        if self.plot:
            attention_rows.append(alphas.vec_value())

        # h = readout * attention_output_vector + bias
        h = dn.affine_transform([self.bias, self.readout, context])

        # TODO: understand why diverse needs tanh before softmax
        if self.diverse:
            h = dn.tanh(h)

        # greedy choice: most probable output symbol
        best_index = np.argmax(dn.softmax(h).npvalue())
        symbol = self.int2y[best_index]
        predicted.append(symbol)

        if symbol == common.END_SEQ:
            break

        # feedback for the next step: chosen symbol plus attention vector
        feedback = dn.concatenate([self.output_lookup[best_index], context])
        step += 1

    # Drop the final element (the end-of-sequence symbol when decoding
    # stopped on it).
    return predicted[:-1], attention_rows
def associate_parameters(self):
    """Wrap ``_Ws``/``_Us``/``_bs`` with ``dy.parameter`` and create zero states.

    ``self.hf_0`` and ``self.hb_0`` are each set to a fresh
    ``dy.zeroes`` expression of size ``self.hid_dim``.
    """
    for name in ('Ws', 'Us', 'bs'):
        setattr(self, name, dy.parameter(getattr(self, '_' + name)))
    self.hf_0 = dy.zeroes(self.hid_dim)
    self.hb_0 = dy.zeroes(self.hid_dim)
def predict_beamsearch(self, encoder, input_seq):
    """Decode ``input_seq`` with beam search over the attention decoder.

    :param encoder: object providing ``encode_batch``
    :param input_seq: input symbols
    :return: ``(n-best list of (sequence, probability), attention rows)``;
        attention rows are only collected when ``self.plot`` is set.
        An empty ``input_seq`` returns a bare ``[]``
    """
    # NOTE(review): this early return is a bare list while the normal
    # return is a 2-tuple; confirm callers handle both shapes.
    if len(input_seq) == 0:
        return []

    dn.renew_cg()
    for name in ('readout', 'bias', 'w_c', 'u_a', 'v_a', 'w_a'):
        setattr(self, name, dn.parameter(self.params[name]))

    attention_rows = []

    # encode input sequence
    encoded, _input_masks = encoder.encode_batch([input_seq])

    # complete sequences and their probabilities
    completed = []

    # initialize the decoder rnn
    initial_state = self.decoder_rnn.initial_state()

    # Hypotheses are (sequence, probability, decoder state, attn_vector).
    # Only the previous step's beam is ever read, so a single list suffices.
    previous_beam = [
        ([common.BEGIN_SEQ], 1.0, initial_state, self.init_lookup[0])
    ]
    step = 0

    # Expand another step while the maximum length is not reached and
    # there are still hypotheses to expand.
    while ((self.max_prediction_len is None)
           or (step < self.max_prediction_len)) and len(previous_beam) > 0:

        candidates = []
        for seq, seq_prob, decoder_state, attn_vector in previous_beam:
            last_symbol = seq[-1]

            # finished sequences cannot be expanded
            if last_symbol == common.END_SEQ:
                continue

            # expand from the last symbol of the hypothesis
            try:
                last_embedding = self.output_lookup[self.y2int[last_symbol]]
            except KeyError:
                # not a known symbol
                print('impossible to expand, key error: ' + str(last_symbol))
                continue

            new_state = decoder_state.add_input(
                dn.concatenate([last_embedding, attn_vector]))

            # perform attention step
            context, alphas = self.attend(encoded, new_state.output())

            # save attention weights for plotting
            # TODO: add attention weights properly to allow building the attention matrix for the best path
            if self.plot:
                attention_rows.append(alphas.vec_value())

            # h = readout * attention_output_vector + bias
            h = dn.affine_transform([self.bias, self.readout, context])

            # TODO: understand why diverse needs tanh before softmax
            if self.diverse:
                h = dn.tanh(h)
            probabilities = dn.softmax(h).npvalue()

            # TODO: maybe should choose nbest from all expansions and not only from nbest of each hypothesis?
            at_max_length = ((self.max_prediction_len is not None)
                             and (step == self.max_prediction_len - 1))
            for index in common.argmax(probabilities, self.beam_size):
                extended = seq + [self.int2y[index]]
                extended_prob = seq_prob * probabilities[index]
                if extended[-1] == common.END_SEQ or at_max_length:
                    # TODO: add to final states only if fits in k best?
                    # Complete sequence or maximum length: store it without
                    # its first and last symbols.
                    completed.append((extended[1:-1], extended_prob))
                else:
                    candidates.append(
                        (extended, extended_prob, new_state, context))

        # keep the most probable expansions from all hypotheses
        candidate_probs = np.array([entry[1] for entry in candidates])
        kept = common.argmax(candidate_probs, self.beam_size)
        previous_beam = [candidates[position] for position in kept]
        step += 1

    # get nbest results from final states found in search
    completed_probs = np.array([entry[1] for entry in completed])
    best = common.argmax(completed_probs, self.beam_size)
    nbest_seqs = [completed[position] for position in best]

    return nbest_seqs, attention_rows
def compute_decoder_batch_loss(self, encoded_inputs, input_masks,
                               output_word_ids, output_masks, batch_size):
    """Build the total decoder loss for one batch of output sequences.

    :param encoded_inputs: encoder outputs, forwarded to ``self.attend``
    :param input_masks: input masks, forwarded to ``self.attend``
    :param output_word_ids: per-timestep lists of target ids
    :param output_masks: per-timestep mask lists (may be empty/None);
        a step is masked when its last mask entry is not 1
    :param batch_size: number of sequences in the batch
    :return: loss expression summed over time steps and batch
    """
    for name in ('readout', 'bias', 'w_c', 'u_a', 'v_a', 'w_a'):
        setattr(self, name, dn.parameter(self.params[name]))

    # initialize the decoder rnn
    start_state = self.decoder_rnn.initial_state()

    # initial "input feeding" vectors to feed decoder - 3*h
    init_input_feeding = dn.lookup_batch(self.init_lookup, [0] * batch_size)

    # initial feedback embeddings: the begin-sequence symbol for every item
    begin_ids = [self.y2int[common.BEGIN_SEQ]] * batch_size
    init_feedback = dn.lookup_batch(self.output_lookup, begin_ids)

    state = start_state.add_input(
        dn.concatenate([init_feedback, init_input_feeding]))

    # loss per timestep
    step_losses = []
    for step, step_word_ids in enumerate(output_word_ids):
        # attention context for each sequence in the batch
        context, _alphas = self.attend(encoded_inputs, state.output(),
                                       input_masks)

        # output scores: h = readout * attention_output_vector + bias
        h = dn.affine_transform([self.bias, self.readout, context])

        if self.diverse:
            # encourage diversity by punishing highly confident predictions
            # TODO: support batching - esp. w.r.t. scalar inputs
            soft = dn.softmax(dn.tanh(h))
            picked_nll = dn.pick_batch(-dn.log(soft), step_word_ids)
            confidence_term = dn.log(
                dn.scalarInput(1) - dn.pick_batch(soft, step_word_ids))
            step_loss = (picked_nll - confidence_term
                         - dn.log(dn.scalarInput(4)))
        else:
            # plain negative log-likelihood for this timestep
            step_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)

        # mask the loss if at least one sentence is shorter
        if output_masks and output_masks[step][-1] != 1:
            mask = dn.inputVector(output_masks[step])
            # noinspection PyArgumentList
            mask = dn.reshape(mask, (1, ), batch_size)
            step_loss = step_loss * mask

        # input feeding: the attention vector joins the target embeddings
        # as the next decoder input
        feedback = dn.lookup_batch(self.output_lookup, step_word_ids)
        state = state.add_input(dn.concatenate([feedback, context]))

        step_losses.append(step_loss)

    # sum the loss over the time steps and batch
    return dn.sum_batches(dn.esum(step_losses))
def associate_parameters(self):
    """Wrap ``_W`` and ``_b`` with ``dy.parameter`` as ``self.W``/``self.b``."""
    for name in ('W', 'b'):
        setattr(self, name, dy.parameter(getattr(self, '_' + name)))
def predict(self, word_indices, char_indices, task_id, train=False,
            soft_labels=False, temperature=None, orthogonality_weight=0.0,
            domain_id=None):
    """
    predict tags for a sentence represented as char+word embeddings
    :param word_indices: indices into the word embeddings
    :param char_indices: per-token lists of indices into the char embeddings
    :param task_id: output layer to use; "src" and "trg" produce one
        output per entry of ``self.task_ids``
    :param train: apply noise to the features (and to the final layer
        when ``noise_sigma`` is positive)
    :param soft_labels: forwarded to the output predictors
    :param temperature: forwarded to the output predictors
    :param orthogonality_weight: when non-zero (and ``task_id`` is not
        "Ft"), return the squared Frobenius norm of ``F0_W^T * F1_W``
    :param domain_id: Predict adversarial loss if domain id is provided.
    :return: ``(output, constraint, adv_loss)``
    """
    dynet.renew_cg()  # new graph

    word_features = [self.wembeds[w] for w in word_indices]

    if self.c_in_dim > 0:
        # Character-level word representation: the last state of each
        # direction of the char RNN.
        forward_char_states = []
        backward_char_states = []
        for chars_of_token in char_indices:
            char_feats = [self.cembeds[c] for c in chars_of_token]
            f_char, b_char = self.char_rnn.predict_sequence(
                char_feats, char_feats)
            forward_char_states.append(f_char[-1])
            backward_char_states.append(b_char[-1])

        features = [
            dynet.concatenate([w, c, rev_c])
            for w, c, rev_c in zip(word_features, forward_char_states,
                                   backward_char_states)
        ]
    else:
        features = word_features

    if train:  # only do at training time
        features = [dynet.noise(fe, self.noise_sigma) for fe in features]

    num_layers = self.h_layers
    output_layer_index = self.h_layers - 1

    # go through layers
    prev = features
    prev_rev = features
    constraint = 0
    adv_loss = 0

    for layer_index in range(0, num_layers):
        predictor = self.predictors["inner"][layer_index]
        forward_sequence, backward_sequence = predictor.predict_sequence(
            prev, prev_rev)

        if layer_index > 0 and self.activation:
            # activation between LSTM layers
            forward_sequence = [self.activation(s) for s in forward_sequence]
            backward_sequence = [self.activation(s) for s in backward_sequence]

        if layer_index != output_layer_index:
            prev = forward_sequence
            prev_rev = backward_sequence
            continue

        # Final layer: join both directions position by position.
        concat_layer = [
            dynet.concatenate([f, b])
            for f, b in zip(forward_sequence, reversed(backward_sequence))
        ]

        if train and self.noise_sigma > 0.0:
            concat_layer = [
                dynet.noise(fe, self.noise_sigma) for fe in concat_layer
            ]

        output_layers = self.predictors["output_layers_dict"]
        if task_id not in ["src", "trg"]:
            output = output_layers[task_id].predict_sequence(
                concat_layer, soft_labels=soft_labels,
                temperature=temperature)
        else:
            # one src example for all three outputs; output is a list here
            output = [
                self.predictors["output_layers_dict"][t_id].predict_sequence(
                    concat_layer, soft_labels=soft_labels,
                    temperature=temperature)
                for t_id in self.task_ids
            ]

        if orthogonality_weight != 0 and task_id != "Ft":
            # The constraint sits on the hidden layer of the output MLP
            # when there is one, otherwise directly on the output layer.
            # It is only applied between F0 and F1.
            builder = self.predictors["output_layers_dict"]["F0"].network_builder
            task_param = builder.W_mlp if self.add_hidden else builder.W
            task_W = dynet.parameter(task_param)

            builder = self.predictors["output_layers_dict"]["F1"].network_builder
            other_param = builder.W_mlp if self.add_hidden else builder.W
            other_task_W = dynet.parameter(other_param)

            # Squared Frobenius norm: square every entry of the product
            # and sum them.
            product = dynet.transpose(task_W) * other_task_W
            constraint = dynet.sum_elems(dynet.square(product))

        if domain_id is not None:
            # flip the gradient when back-propagating through here
            adv_input = dynet.flip_gradient(concat_layer[-1])  # last state
            adv_output = self.adv_layer(adv_input)
            adv_loss = self.pick_neg_log(adv_output, domain_id)

        # output is list if task_id = 'src'
        return output, constraint, adv_loss

    raise Exception("oops should not be here")
def get_graph(self, embedding):
    """Build a fresh graph computing ``U * tanh(W * embedding)``.

    :param embedding: array-like passed to ``dy.inputTensor``
    :return: the resulting expression
    """
    dy.renew_cg()
    w = dy.parameter(self.pW)
    u = dy.parameter(self.pU)
    hidden = dy.tanh(w * dy.inputTensor(embedding))
    return u * hidden