def attend(self, input_mat, state, w1dt, w2, v, coverage):
    # decoder-state component of the MLP attention score
    w2dt = w2 * dy.concatenate(list(state.s()))
    if coverage:
        # add the coverage feature to the (precomputed) encoder component
        w1dt = w1dt + self.w_cov * dy.transpose(coverage)
    # attention weights over source positions
    a_t = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    a_t = dy.softmax(a_t)
    # return the weights and the context vector
    return a_t, (input_mat * a_t)

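# A minimal, self-contained sketch of the same MLP (Bahdanau-style) attention
# pattern used by attend() above. Every size and parameter name here is an
# illustrative assumption, not taken from the surrounding model.
import dynet as dy
import numpy as np

enc_dim, dec_dim, att_dim, seq_len = 8, 6, 5, 4  # toy sizes

m = dy.ParameterCollection()
W1 = m.add_parameters((att_dim, enc_dim))  # plays the role of the matrix behind w1dt
W2 = m.add_parameters((att_dim, dec_dim))  # maps the decoder state
v = m.add_parameters((1, att_dim))

dy.renew_cg()
input_mat = dy.inputTensor(np.random.randn(enc_dim, seq_len))  # encoder states as columns
s = dy.inputTensor(np.random.randn(dec_dim))                   # current decoder state
w1dt = W1 * input_mat  # typically precomputed once per input sequence
w2dt = W2 * s
a_t = dy.softmax(dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt))))
context = input_mat * a_t  # attention-weighted sum of encoder columns
print(a_t.npvalue().shape, context.npvalue().shape)
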
def __call__(self, h, s):
    # hT -> ((L, h_dim), B), s -> ((s_dim, L), B)
    if len(h.dim()[0]) == 2:
        L = h.dim()[0][1]
        # append a constant 1-feature so U can absorb bias terms
        if self.h_bias:
            s = dy.concatenate(
                [s, dy.inputTensor(np.ones((1, L), dtype=np.float32))])
        if self.s_bias:
            h = dy.concatenate(
                [h, dy.inputTensor(np.ones((1, L), dtype=np.float32))])
    else:
        if self.h_bias:
            s = dy.concatenate(
                [s, dy.inputTensor(np.ones((1, ), dtype=np.float32))])
        if self.s_bias:
            h = dy.concatenate(
                [h, dy.inputTensor(np.ones((1, ), dtype=np.float32))])
    hT = dy.transpose(h)
    lin = self.U * s  # ((h_dim*n_label, L), B)
    if self.n_label > 1:
        lin = dy.reshape(lin, (self.h_dim, self.n_label))
    blin = hT * lin
    if self.n_label == 1:
        return blin
    else:
        return dy.transpose(blin)

def get_score(self, word, context):
    # Loss for a (word, context) pair under the negative sampling objective.
    objective = dy.logistic(
        dy.transpose(self.context_embeddings[context]) *
        self.word_embeddings[word])
    negative_sample = np.random.choice(
        self.context_size, self.num_sampled, replace=False,
        p=self.context_fre)
    for context_prime in negative_sample:
        objective *= dy.logistic(
            -(dy.transpose(self.context_embeddings[context_prime]) *
              self.word_embeddings[word]))
    loss = -dy.log(objective)
    return loss

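# Caveat on the product form above: multiplying num_sampled + 1 sigmoids and
# taking a single log at the end underflows easily. A mathematically
# equivalent, numerically safer sketch (a drop-in variant against the same
# attributes, which are assumptions here) accumulates log-sigmoid terms:
def get_score_stable(self, word, context):
    # -log sig(c.w) - sum_{c'} log sig(-c'.w): same objective, summed in log space
    pos_score = (dy.transpose(self.context_embeddings[context]) *
                 self.word_embeddings[word])
    loss = -dy.log(dy.logistic(pos_score))
    negative_sample = np.random.choice(
        self.context_size, self.num_sampled, replace=False,
        p=self.context_fre)
    for context_prime in negative_sample:
        neg_score = (dy.transpose(self.context_embeddings[context_prime]) *
                     self.word_embeddings[word])
        loss += -dy.log(dy.logistic(-neg_score))
    return loss
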
def __call__(self, h, s):
    # hT -> ((L, h_dim), B), s -> ((s_dim, L), B)
    hT = dy.transpose(h)
    lin = self.U * s  # ((h_dim*n_label, L), B)
    if self.n_label > 1:
        lin = dy.reshape(lin, (self.h_dim, self.n_label))
    blin = hT * lin
    if self.n_label == 1:
        return blin + (hT * self.B if self.bias else 0)
    else:
        return dy.transpose(blin) + (
            self.V * dy.concatenate([h, s]) + self.B if self.bias else 0)

def __call__(self, h, s):
    if self.h_bias:
        if len(h.dim()[0]) == 2:
            h = dy.concatenate([
                h,
                dy.inputTensor(np.ones((1, h.dim()[0][1]), dtype=np.float32))
            ])
        else:
            h = dy.concatenate(
                [h, dy.inputTensor(np.ones((1, ), dtype=np.float32))])
    if self.s_bias:
        if len(s.dim()[0]) == 2:
            s = dy.concatenate([
                s,
                dy.inputTensor(np.ones((1, s.dim()[0][1]), dtype=np.float32))
            ])
        else:
            s = dy.concatenate(
                [s, dy.inputTensor(np.ones((1, ), dtype=np.float32))])
    lin = self.U * s
    if self.n_label > 1:
        lin = dy.reshape(lin, (self.h_dim, self.n_label))
    blin = dy.transpose(h) * lin
    return blin

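# A stripped-down, runnable illustration of the score this layer computes in
# the single-label case with both bias features on. The sizes and the
# standalone setup are assumptions; the real layer wires U, h_bias, and
# s_bias through its constructor.
import dynet as dy
import numpy as np

h_dim, s_dim = 4, 3
m = dy.ParameterCollection()
U = m.add_parameters((h_dim + 1, s_dim + 1))  # extra row/col absorb the bias features

dy.renew_cg()
h = dy.inputTensor(np.random.randn(h_dim))
s = dy.inputTensor(np.random.randn(s_dim))
ones = dy.inputTensor(np.ones((1, ), dtype=np.float32))
h1 = dy.concatenate([h, ones])  # h_bias
s1 = dy.concatenate([s, ones])  # s_bias
score = dy.transpose(h1) * (U * s1)  # biaffine score h1^T U s1
print(score.npvalue())
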
def selection_by_tree(self, tree, mode, idx=0):
    input_layers, pairs = self._select_by_tree(tree, mode, True)
    if len(pairs) == 0:
        if not self.opt['allow_partial']:
            input_layers, pairs = self._select_by_tree(tree, mode, False)
        else:
            print('early stop! discard {} / {}.'.format(
                len(tree.V), len(tree.terms)))
            return None, None
    W1_rl = dy.parameter(self.model_parameters['W1_rl'])
    b1_rl = dy.parameter(self.model_parameters['b1_rl'])
    if not self.opt['one_layer']:
        W2_rl = dy.parameter(self.model_parameters['W2_rl'])
        b2_rl = dy.parameter(self.model_parameters['b2_rl'])
    # pr = W2_rl * dy.rectify(W1_rl * dy.concatenate_to_batch(input_layers) + b1_rl) + b2_rl
    # (V x N)x160 160x50 50x60 60x1
    input_layers = dy.concatenate_cols(input_layers)
    input_layers = dy.transpose(input_layers)
    if not self.opt['one_layer']:
        if self.opt['use_history']:
            pr = input_layers * dy.rectify(
                W2_rl * dy.rectify(W1_rl * self.history[idx].output() + b1_rl)
                + b2_rl)
        else:
            pr = dy.rectify(input_layers * W2_rl + b2_rl) * W1_rl + b1_rl
    else:
        if self.opt['use_history']:
            pr = input_layers * dy.rectify(
                W1_rl * self.history[idx].output() + b1_rl)
        else:
            pr = input_layers * W1_rl + b1_rl
    pr = dy.reshape(pr, (len(pairs), ))  # (#actions, )
    return dy.softmax(pr), pairs

def predict(self,
            feature_vector,
            task_ids,
            train=False,
            soft_labels=False,
            temperature=None,
            dropout_rate=0.0,
            orthogonality_weight=0.0,
            domain_id=None):
    dynet.renew_cg()  # new graph
    feature_vector = feature_vector.toarray()
    feature_vector = np.squeeze(feature_vector, axis=0)

    # self.input = dynet.vecInput(self.vocab_size)
    # self.input.set(feature_vector)
    # TODO this takes too long; can we speed this up somehow?
    input = dynet.inputVector(feature_vector)
    for i in range(self.h_layers):
        if train:  # add some noise
            input = dynet.noise(input, self.noise_sigma)
            input = dynet.dropout(input, dropout_rate)
        input = self.layers[i](input)
    outputs = []
    for task_id in task_ids:
        output = self.output_layers_dict[task_id](
            input, soft_labels=soft_labels, temperature=temperature)
        outputs.append(output)

    constraint, adv_loss = 0, 0
    if orthogonality_weight != 0:
        # put the orthogonality constraint either directly on the
        # output layer or on the hidden layer if it's an MLP
        F0_layer = self.output_layers_dict["F0"]
        F1_layer = self.output_layers_dict["F1"]
        F0_param = F0_layer.W_mlp if self.add_hidden else F0_layer.W
        F1_param = F1_layer.W_mlp if self.add_hidden else F1_layer.W
        F0_W = dynet.parameter(F0_param)
        F1_W = dynet.parameter(F1_param)
        # calculate the matrix product of the task matrix with both others
        matrix_product = dynet.transpose(F0_W) * F1_W
        # take the squared Frobenius norm by squaring
        # every element and then summing them
        squared_frobenius_norm = dynet.sum_elems(dynet.square(matrix_product))
        constraint += squared_frobenius_norm
        # print('Constraint with first matrix:', squared_frobenius_norm.value())

    if domain_id is not None:
        # flip the gradient when back-propagating through here
        adv_input = dynet.flip_gradient(input)  # last state
        adv_output = self.adv_layer(adv_input)
        adv_loss = self.pick_neg_log(adv_output, domain_id)
        # print('Adversarial loss:', avg_adv_loss.value())
    return outputs, constraint, adv_loss

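# The orthogonality penalty above is easy to sanity-check in isolation.
# A tiny sketch with random stand-in matrices (shapes assumed):
import dynet
import numpy as np

dynet.renew_cg()
F0_W = dynet.inputTensor(np.random.randn(10, 5))
F1_W = dynet.inputTensor(np.random.randn(10, 5))
# ||F0_W^T F1_W||_F^2: zero exactly when every column of F0_W is
# orthogonal to every column of F1_W
penalty = dynet.sum_elems(dynet.square(dynet.transpose(F0_W) * F1_W))
print(penalty.value())
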
def set_initial_states(self, x):
    self.xt_embs = [dy.lookup(self.F, x_t) for x_t in x]

    if self.encoder_type == 'bow':
        self.W_enc = self.W * dy.average(self.xt_embs)
    elif self.encoder_type == 'attention':
        # smoothed embeddings: column i sums a window of radius q around i
        self.xb = dy.concatenate([
            dy.esum(self.xt_embs[max(i - self.q, 0):min(len(x), i + self.q + 1)])
            / self.q for i in range(len(x))
        ], d=1)
        self.xt = dy.transpose(dy.concatenate(self.xt_embs, d=1))

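# To make the windowed sum concrete, a toy version of the xb construction
# above; the radius q, embedding size, and sequence length are made-up
# numbers for illustration only.
import dynet as dy
import numpy as np

q, dim, n = 1, 3, 5
dy.renew_cg()
embs = [dy.inputTensor(np.random.randn(dim)) for _ in range(n)]
# column i holds the (scaled) sum of embeddings within q positions of i
xb = dy.concatenate(
    [dy.esum(embs[max(i - q, 0):min(n, i + q + 1)]) / q for i in range(n)],
    d=1)
print(xb.npvalue().shape)  # (dim, n)
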
def __call__(self, x, h_matrix, noprob=False):
    s_t = x
    # repeatedly attend over h_matrix, feeding the context back into the query
    for i in range(self.layers - 1):
        e_t = self.V[i] * dy.tanh(self.W1[i] * h_matrix + self.W2[i] * s_t)
        a_t = dy.softmax(dy.transpose(e_t))
        c_t = h_matrix * a_t
        s_t = dy.concatenate([x, c_t])
    e_t = (self.V[-1] * dy.tanh(self.W1[-1] * h_matrix + self.W2[-1] * s_t)
           + self.B1 * h_matrix + self.B2 * s_t)
    if len(h_matrix.dim()[0]) > 1:
        e_t = dy.reshape(e_t,
                         (self.V[-1].dim()[0][0] * h_matrix.dim()[0][1], ))
    if not noprob:
        p_t = dy.softmax(e_t)
        return p_t
    else:
        return e_t

def main():
    parser = argparse.ArgumentParser(
        description='Convolutional Neural Networks for Sentence Classification in DyNet')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--train_x_path', type=str, default='./data/train_x.txt',
                        help='File path of train x data [default: `./data/train_x.txt`]')
    parser.add_argument('--train_y_path', type=str, default='./data/train_y.txt',
                        help='File path of train y data [default: `./data/train_y.txt`]')
    parser.add_argument('--valid_x_path', type=str, default='./data/valid_x.txt',
                        help='File path of valid x data [default: `./data/valid_x.txt`]')
    parser.add_argument('--valid_y_path', type=str, default='./data/valid_y.txt',
                        help='File path of valid y data [default: `./data/valid_y.txt`]')
    parser.add_argument('--n_epochs', type=int, default=10,
                        help='Number of epochs [default: 10]')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='Mini batch size [default: 64]')
    parser.add_argument('--win_sizes', type=int, nargs='*', default=[3, 4, 5],
                        help='Window sizes of filters [default: [3, 4, 5]]')
    parser.add_argument('--num_fil', type=int, default=100,
                        help='Number of filters in each window size [default: 100]')
    parser.add_argument('--s', type=float, default=3.0,
                        help='L2 norm constraint on w [default: 3.0]')
    parser.add_argument('--dropout_prob', type=float, default=0.5,
                        help='Dropout probability [default: 0.5]')
    parser.add_argument('--v_strategy', type=str, default='static',
                        help='Embedding strategy. '
                             'rand: Random initialization. '
                             'static: Load pretrained embeddings and do not update during the training. '
                             'non-static: Load pretrained embeddings and update during the training. '
                             '[default: static]')
    parser.add_argument('--alloc_mem', type=int, default=4096,
                        help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    N_EPOCHS = args.n_epochs
    WIN_SIZES = args.win_sizes
    BATCH_SIZE = args.batch_size
    EMB_DIM = 300
    OUT_DIM = 1
    L2_NORM_LIM = args.s
    NUM_FIL = args.num_fil
    DROPOUT_PROB = args.dropout_prob
    V_STRATEGY = args.v_strategy
    ALLOC_MEM = args.alloc_mem

    if V_STRATEGY in ['rand', 'static', 'non-static']:
        NUM_CHA = 1
    else:
        NUM_CHA = 2

    # FILE paths
    W2V_PATH = './GoogleNews-vectors-negative300.bin'
    TRAIN_X_PATH = args.train_x_path
    TRAIN_Y_PATH = args.train_y_path
    VALID_X_PATH = args.valid_x_path
    VALID_Y_PATH = args.valid_y_path

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load pretrained embeddings
    pretrained_model = gensim.models.KeyedVectors.load_word2vec_format(
        W2V_PATH, binary=True)
    vocab = pretrained_model.wv.vocab.keys()
    w2v = pretrained_model.wv

    # Build dataset ====================================================
    w2c = build_w2c(TRAIN_X_PATH, vocab=vocab)
    w2i, i2w = build_w2i(TRAIN_X_PATH, w2c, unk='unk')
    train_x, train_y = build_dataset(TRAIN_X_PATH, TRAIN_Y_PATH, w2i, unk='unk')
    valid_x, valid_y = build_dataset(VALID_X_PATH, VALID_Y_PATH, w2i, unk='unk')

    train_x, train_y = sort_data_by_length(train_x, train_y)
    valid_x, valid_y = sort_data_by_length(valid_x, valid_y)

    VOCAB_SIZE = len(w2i)
    print('VOCAB_SIZE:', VOCAB_SIZE)

    V_init = init_V(w2v, w2i)

    with open(os.path.join(RESULTS_DIR, './w2i.dump'), 'wb') as f_w2i, \
            open(os.path.join(RESULTS_DIR, './i2w.dump'), 'wb') as f_i2w:
        pickle.dump(w2i, f_w2i)
        pickle.dump(i2w, f_i2w)

    # Build model ======================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    # V1
    V1 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    if V_STRATEGY in ['static', 'non-static', 'multichannel']:
        V1.init_from_array(V_init)
    if V_STRATEGY in ['static', 'multichannel']:
        V1_UPDATE = False
    else:  # 'rand', 'non-static'
        V1_UPDATE = True
    make_emb_zero(V1, [w2i['<s>'], w2i['</s>']], EMB_DIM)

    # V2
    if V_STRATEGY == 'multichannel':
        V2 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
        V2.init_from_array(V_init)
        V2_UPDATE = True
        make_emb_zero(V2, [w2i['<s>'], w2i['</s>']], EMB_DIM)

    layers = [
        CNNText(model, EMB_DIM, WIN_SIZES, NUM_CHA, NUM_FIL, dy.tanh,
                DROPOUT_PROB),
        Dense(model, 3 * NUM_FIL, OUT_DIM, dy.logistic)
    ]

    # Train model ======================================================
    n_batches_train = math.ceil(len(train_x) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_x) / BATCH_SIZE)

    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        pred_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(train_x[start:end], w2i, max(WIN_SIZES)).T
            t = np.array(train_y[start:end])

            sen_len = x.shape[0]

            if V_STRATEGY in ['rand', 'static', 'non-static']:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)

            # Forward prop
            y = forwards(layers, x_embs, test=False)
            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))

            loss_all_train.append(mb_loss.value())
            pred_all_train.extend(list(binary_pred(y.npvalue().flatten())))

            # Backward prop
            mb_loss.backward()
            trainer.update()

            # L2 norm constraint
            layers[1].scale_W(L2_NORM_LIM)

            # Make padding embs zero
            if V_STRATEGY in ['rand', 'non-static']:
                make_emb_zero(V1, [w2i['<s>'], w2i['</s>']], EMB_DIM)
            elif V_STRATEGY in ['multichannel']:
                make_emb_zero(V2, [w2i['<s>'], w2i['</s>']], EMB_DIM)

        # Valid
        loss_all_valid = []
        pred_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(valid_x[start:end], w2i, max(WIN_SIZES)).T
            t = np.array(valid_y[start:end])

            sen_len = x.shape[0]

            if V_STRATEGY in ['rand', 'static', 'non-static']:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)

            # Forward prop
            y = forwards(layers, x_embs, test=True)
            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))

            loss_all_valid.append(mb_loss.value())
            pred_all_valid.extend(list(binary_pred(y.npvalue().flatten())))

        print('EPOCH: %d, Train Loss: %.3f (F1: %.3f, Acc: %.3f), '
              'Valid Loss: %.3f (F1: %.3f, Acc: %.3f), Time: %.3f[s]' % (
                  epoch + 1,
                  np.mean(loss_all_train),
                  f1_score(train_y, pred_all_train),
                  accuracy_score(train_y, pred_all_train),
                  np.mean(loss_all_valid),
                  f1_score(valid_y, pred_all_valid),
                  accuracy_score(valid_y, pred_all_valid),
                  time.time() - start_time,
              ))

        # Save model ===================================================
        if V_STRATEGY in ['rand', 'static', 'non-static']:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)),
                    [V1] + layers)
        else:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)),
                    [V1, V2] + layers)

def __call__(self, x=None, t=None, test=False):
    if test:
        tt_embs = [dy.lookup(self.E, t_t) for t_t in t]

        if self.encoder_type == 'bow':
            # Neural language model
            tt_c = dy.concatenate(tt_embs)
            h = dy.tanh(self.U * tt_c)

            # Output with softmax
            y_t = dy.softmax(self.V * h + self.W_enc)

        elif self.encoder_type == 'attention':
            ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]

            # Neural language model
            tt_c = dy.concatenate(tt_embs)
            h = dy.tanh(self.U * tt_c)

            # Attention
            ttp_c = dy.concatenate(ttp_embs)
            p = dy.softmax(self.xt * self.P * ttp_c)  # Attention weight
            enc = self.xb * p  # Context vector

            # Output with softmax
            y_t = dy.softmax(self.V * h + self.W * enc)

        return y_t
    else:
        xt_embs = [dy.lookup(self.F, x_t) for x_t in x]
        tt_embs = [dy.lookup(self.E, t_t) for t_t in t]

        y = []
        if self.encoder_type == 'bow':
            # BoW
            enc = dy.average(xt_embs)
            W_enc = self.W * enc
            for i in range(len(t) - self.c + 1):
                # Neural language model
                tt_c = dy.concatenate(tt_embs[i:i + self.c])
                h = dy.tanh(self.U * tt_c)

                # Output without softmax
                y_t = self.V * h + W_enc
                y.append(y_t)

        elif self.encoder_type == 'attention':
            xb = dy.concatenate([
                dy.esum(xt_embs[max(i - self.q, 0):min(len(x), i + self.q + 1)])
                / self.q for i in range(len(x))
            ], d=1)
            xt = dy.transpose(dy.concatenate(xt_embs, d=1))
            ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]

            for i in range(len(t) - self.c + 1):
                # Neural language model
                tt_c = dy.concatenate(tt_embs[i:i + self.c])
                h = dy.tanh(self.U * tt_c)

                # Attention
                ttp_c = dy.concatenate(ttp_embs[i:i + self.c])  # Window-sized embedding
                p = dy.softmax(xt * self.P * ttp_c)  # Attention weight
                enc = xb * p  # Context vector

                # Output without softmax
                y_t = self.V * h + self.W * enc
                y.append(y_t)

        return y

def __call__(self, x, tm1s=None, test=False):
    if test:
        # Initial states
        s_tm1 = tm1s[0]
        c_tm1 = tm1s[1]
        w_tm1 = x

        # GRU
        s_t = self.GRUBuilder.initial_state().set_s([s_tm1]).add_input(
            dy.concatenate([w_tm1, c_tm1])).output()

        # Attention
        e_t = dy.pick(
            self.va * dy.tanh(
                dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
        a_t = dy.softmax(e_t)
        c_t = dy.esum([
            dy.cmult(a_t_i, h_i)
            for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
        ])
        # c_t = self.hp*a_t  # memory error?

        # Output
        r_t = dy.concatenate_cols([
            Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
            for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
        ])

        # Maxout
        m_t = dy.max_dim(r_t, d=1)

        y_t = dy.softmax(self.Wo * m_t)
        return s_t, c_t, y_t
    else:
        w_embs = x

        # Initial states
        s_tm1 = self.s_0
        c_tm1 = self.c_0
        GRU = self.GRUBuilder.initial_state().set_s([s_tm1])

        y = []
        for w_tm1 in w_embs:
            # GRU
            GRU = GRU.add_input(dy.concatenate([w_tm1, c_tm1]))
            s_t = GRU.output()

            # Attention
            e_t = dy.pick(
                self.va * dy.tanh(
                    dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
            a_t = dy.softmax(e_t)
            c_t = dy.esum([
                dy.cmult(a_t_i, h_i)
                for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
            ])
            # c_t = self.hp*a_t  # memory error?

            # Output
            r_t = dy.concatenate_cols([
                Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
            ])

            # Maxout
            m_t = dy.max_dim(r_t, d=1)

            y_t = self.Wo * m_t
            y.append(y_t)

            # t -> tm1
            s_tm1 = s_t
            c_tm1 = c_t

        return y

def main():
    parser = argparse.ArgumentParser(
        description='Convolutional Neural Networks for Sentence Classification in DyNet')
    parser.add_argument('--gpu', type=int, default=-1,
                        help='GPU ID to use. For cpu, set -1 [default: -1]')
    parser.add_argument('--model_file', type=str, default='./model',
                        help='Model to use for prediction [default: ./model]')
    parser.add_argument('--input_file', type=str, default='./data/valid_x.txt',
                        help='Input file path [default: ./data/valid_x.txt]')
    parser.add_argument('--output_file', type=str, default='./pred_y.txt',
                        help='Output file path [default: ./pred_y.txt]')
    parser.add_argument('--w2i_file', type=str, default='./w2i.dump',
                        help='Word2Index file path [default: ./w2i.dump]')
    parser.add_argument('--i2w_file', type=str, default='./i2w.dump',
                        help='Index2Word file path [default: ./i2w.dump]')
    parser.add_argument('--alloc_mem', type=int, default=1024,
                        help='Amount of memory to allocate [mb] [default: 1024]')
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    MODEL_FILE = args.model_file
    INPUT_FILE = args.input_file
    OUTPUT_FILE = args.output_file
    W2I_FILE = args.w2i_file
    I2W_FILE = args.i2w_file
    ALLOC_MEM = args.alloc_mem

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load model
    model = dy.Model()
    pretrained_model = dy.load(MODEL_FILE, model)
    if len(pretrained_model) == 3:
        V1, layers = pretrained_model[0], pretrained_model[1:]
        MULTICHANNEL = False
    else:
        V1, V2, layers = (pretrained_model[0], pretrained_model[1],
                          pretrained_model[2:])
        MULTICHANNEL = True

    EMB_DIM = V1.shape()[0]
    WIN_SIZES = layers[0].win_sizes

    # Load test data
    with open(W2I_FILE, 'rb') as f_w2i, open(I2W_FILE, 'rb') as f_i2w:
        w2i = pickle.load(f_w2i)
        i2w = pickle.load(f_i2w)

    max_win = max(WIN_SIZES)
    test_X, _, _ = build_dataset(INPUT_FILE, w2i=w2i, unksym='unk')
    test_X = [[0] * max_win + instance_x + [0] * max_win
              for instance_x in test_X]

    # Pred
    pred_y = []
    for instance_x in tqdm(test_X):
        # Create a new computation graph
        dy.renew_cg()
        associate_parameters(layers)

        sen_len = len(instance_x)

        if MULTICHANNEL:
            x_embs1 = dy.concatenate(
                [dy.lookup(V1, x_t, update=False) for x_t in instance_x], d=1)
            x_embs2 = dy.concatenate(
                [dy.lookup(V2, x_t, update=False) for x_t in instance_x], d=1)
            x_embs1 = dy.transpose(x_embs1)
            x_embs2 = dy.transpose(x_embs2)
            x_embs = dy.concatenate([x_embs1, x_embs2], d=2)
        else:
            x_embs = dy.concatenate(
                [dy.lookup(V1, x_t, update=False) for x_t in instance_x], d=1)
            x_embs = dy.transpose(x_embs)
            x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))

        y = f_props(layers, x_embs, train=False)
        pred_y.append(str(int(binary_pred(y.value()))))

    with open(OUTPUT_FILE, 'w') as f:
        f.write('\n'.join(pred_y))

def __call__(self, s_t, h_matrix):
    # MLP attention: score each column of h_matrix against the query s_t
    e_t = self.v * dy.tanh(self.W1 * h_matrix + self.W2 * s_t)
    a_t = dy.softmax(dy.transpose(e_t))
    c_t = h_matrix * a_t  # context vector: attention-weighted sum of columns
    return c_t

def predict(self,
            word_indices,
            char_indices,
            task_id,
            train=False,
            soft_labels=False,
            temperature=None,
            orthogonality_weight=0.0,
            domain_id=None):
    """
    Predict tags for a sentence represented as char+word embeddings.
    :param domain_id: Predict adversarial loss if domain id is provided.
    """
    dynet.renew_cg()  # new graph

    char_emb = []
    rev_char_emb = []

    wfeatures = [self.wembeds[w] for w in word_indices]

    if self.c_in_dim > 0:
        # get representation for words
        for chars_of_token in char_indices:
            char_feats = [self.cembeds[c] for c in chars_of_token]
            # use last state as word representation
            f_char, b_char = self.char_rnn.predict_sequence(
                char_feats, char_feats)
            last_state = f_char[-1]
            rev_last_state = b_char[-1]
            char_emb.append(last_state)
            rev_char_emb.append(rev_last_state)

        features = [
            dynet.concatenate([w, c, rev_c])
            for w, c, rev_c in zip(wfeatures, char_emb, rev_char_emb)
        ]
    else:
        features = wfeatures

    if train:  # only do at training time
        features = [dynet.noise(fe, self.noise_sigma) for fe in features]

    output_expected_at_layer = self.h_layers
    output_expected_at_layer -= 1

    # go through layers
    prev = features
    prev_rev = features
    num_layers = self.h_layers
    constraint = 0
    adv_loss = 0
    for i in range(0, num_layers):
        predictor = self.predictors["inner"][i]
        forward_sequence, backward_sequence = predictor.predict_sequence(
            prev, prev_rev)
        if i > 0 and self.activation:
            # activation between LSTM layers
            forward_sequence = [self.activation(s) for s in forward_sequence]
            backward_sequence = [self.activation(s) for s in backward_sequence]

        if i == output_expected_at_layer:
            concat_layer = [
                dynet.concatenate([f, b]) for f, b in zip(
                    forward_sequence, reversed(backward_sequence))
            ]
            if train and self.noise_sigma > 0.0:
                concat_layer = [
                    dynet.noise(fe, self.noise_sigma) for fe in concat_layer
                ]

            if task_id not in ["src", "trg"]:
                output_predictor = self.predictors["output_layers_dict"][task_id]
                output = output_predictor.predict_sequence(
                    concat_layer,
                    soft_labels=soft_labels,
                    temperature=temperature)
            else:
                # one src example for all three outputs
                output = []  # in this case it is a list
                for t_id in self.task_ids:
                    output_predictor = self.predictors["output_layers_dict"][t_id]
                    output_t = output_predictor.predict_sequence(
                        concat_layer,
                        soft_labels=soft_labels,
                        temperature=temperature)
                    output.append(output_t)

            if orthogonality_weight != 0 and task_id != "Ft":
                # put the orthogonality constraint either directly on the
                # output layer or on the hidden layer if it's an MLP;
                # use orthogonality_weight only between F0 and F1
                builder = self.predictors["output_layers_dict"]["F0"].network_builder
                task_param = builder.W_mlp if self.add_hidden else builder.W
                task_W = dynet.parameter(task_param)

                builder = self.predictors["output_layers_dict"]["F1"].network_builder
                other_param = builder.W_mlp if self.add_hidden else builder.W
                other_task_W = dynet.parameter(other_param)

                # calculate the matrix product of the task matrix with the other
                matrix_product_1 = dynet.transpose(task_W) * other_task_W

                # take the squared Frobenius norm by squaring
                # every element and then summing them
                squared_frobenius_norm = dynet.sum_elems(
                    dynet.square(matrix_product_1))
                constraint = squared_frobenius_norm
                # print('Constraint with first matrix:', squared_frobenius_norm.value())

            if domain_id is not None:
                # flip the gradient when back-propagating through here
                adv_input = dynet.flip_gradient(concat_layer[-1])  # last state
                adv_output = self.adv_layer(adv_input)
                adv_loss = self.pick_neg_log(adv_output, domain_id)
                # print('Adversarial loss:', avg_adv_loss.value())

            # output is a list if task_id == 'src'
            return output, constraint, adv_loss

        prev = forward_sequence
        prev_rev = backward_sequence
    raise Exception("oops should not be here")