def __call__(self, h, s):
    if self.h_bias:
        if len(h.dim()[0]) == 2:
            h = dy.concatenate([
                h,
                dy.inputTensor(np.ones((1, h.dim()[0][1]), dtype=np.float32))
            ])
        else:
            h = dy.concatenate(
                [h, dy.inputTensor(np.ones((1, ), dtype=np.float32))])
    if self.s_bias:
        if len(s.dim()[0]) == 2:
            s = dy.concatenate([
                s,
                dy.inputTensor(np.ones((1, s.dim()[0][1]), dtype=np.float32))
            ])
        else:
            s = dy.concatenate(
                [s, dy.inputTensor(np.ones((1, ), dtype=np.float32))])
    lin = self.U * s
    if self.n_label > 1:
        lin = dy.reshape(lin, (self.h_dim, self.n_label))
    blin = dy.transpose(h) * lin
    return blin
def __call__(self, h, s):
    # hT -> ((L, h_dim), B), s -> ((s_dim, L), B)
    if len(h.dim()[0]) == 2:
        L = h.dim()[0][1]
        if self.h_bias:
            s = dy.concatenate(
                [s, dy.inputTensor(np.ones((1, L), dtype=np.float32))])
        if self.s_bias:
            h = dy.concatenate(
                [h, dy.inputTensor(np.ones((1, L), dtype=np.float32))])
    else:
        if self.h_bias:
            s = dy.concatenate(
                [s, dy.inputTensor(np.ones((1, ), dtype=np.float32))])
        if self.s_bias:
            h = dy.concatenate(
                [h, dy.inputTensor(np.ones((1, ), dtype=np.float32))])
    hT = dy.transpose(h)
    lin = self.U * s  # ((h_dim*n_label, L), B)
    if self.n_label > 1:
        lin = dy.reshape(lin, (self.h_dim, self.n_label))
    blin = hT * lin
    if self.n_label == 1:
        return blin
    else:
        return dy.transpose(blin)
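# The two __call__ variants above implement a biaffine scorer. Below is a
# minimal, self-contained sketch (hypothetical dimensions, not the original
# class) of the bias trick they rely on: appending a constant 1 through
# dy.inputTensor turns the bilinear form h^T U s into an affine one without
# separate bias parameters.
import numpy as np
import dynet as dy

m = dy.Model()
p_U = m.add_parameters((5, 4))  # (h_dim + 1) x (s_dim + 1), with h_dim=4, s_dim=3
dy.renew_cg()
U = dy.parameter(p_U)
h = dy.inputTensor(np.random.rand(4).astype(np.float32))
s = dy.inputTensor(np.random.rand(3).astype(np.float32))
h1 = dy.concatenate([h, dy.inputTensor(np.ones((1,), dtype=np.float32))])
s1 = dy.concatenate([s, dy.inputTensor(np.ones((1,), dtype=np.float32))])
score = dy.transpose(h1) * (U * s1)  # 1x1 biaffine score
print(score.value())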
def __call__(self, input_exp, hidden_exp, mask=None):
    # two kinds of dropouts
    if self.idrop > 0.:
        input_exp = dy.dropout(input_exp, self.idrop)
    input_exp_g = input_exp_t = input_exp
    hidden_exp_g = hidden_exp_t = hidden_exp["H"]
    if self.gdrop > 0.:
        input_exp_g = dy.cmult(input_exp, self.masks[0])
        hidden_exp_g = dy.cmult(hidden_exp_g, self.masks[1])
        input_exp_t = dy.cmult(input_exp, self.masks[2])
        hidden_exp_t = dy.cmult(hidden_exp_t, self.masks[3])
    rzt = dy.affine_transform([
        self.iparams["brz"], self.iparams["x2rz"], input_exp_g,
        self.iparams["h2rz"], hidden_exp_g
    ])
    rzt = dy.logistic(rzt)
    rt, zt = dy.pick_range(rzt, 0, self.n_hidden), dy.pick_range(
        rzt, self.n_hidden, 2 * self.n_hidden)
    h_reset = dy.cmult(rt, hidden_exp_t)
    ht = dy.affine_transform([
        self.iparams["bh"], self.iparams["x2h"], input_exp_t,
        self.iparams["h2h"], h_reset
    ])
    ht = dy.tanh(ht)
    # the first term uses the original hidden state
    hidden = dy.cmult(zt, hidden_exp["H"]) + dy.cmult((1. - zt), ht)
    # mask: if 0, pass the previous hidden state through unchanged
    if mask is not None:
        mask_array = np.asarray(mask).reshape((1, -1))
        m1 = dy.inputTensor(mask_array, True)        # 1.0 for real words
        m0 = dy.inputTensor(1.0 - mask_array, True)  # 1.0 for padding words (mask=0)
        hidden = hidden * m1 + hidden_exp["H"] * m0
    return {"H": hidden}
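# A hypothetical sketch of how the gdrop masks referenced above (self.masks)
# could be built: one fixed Bernoulli keep-mask per sequence (variational
# dropout), scaled by 1/(1-p), wrapped with dy.inputTensor and reused at every
# time step via dy.cmult. Names and dimensions here are illustrative only.
import numpy as np
import dynet as dy

def make_drop_mask(dim, p):
    keep = (np.random.rand(dim) > p).astype(np.float32) / (1.0 - p)
    return dy.inputTensor(keep)

# e.g. masks = [make_drop_mask(n_input, gdrop), make_drop_mask(n_hidden, gdrop),
#               make_drop_mask(n_input, gdrop), make_drop_mask(n_hidden, gdrop)]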
def get_features(self, words, train=False, update=True):
    """ get feature representations """
    # word embeddings
    wfeatures = np.array([
        self.get_w_repr(word, train=train, update=update) for word in words
    ])
    lex_features = []
    if self.dictionary and not self.type_constraint:
        ## add lexicon features
        lex_features = np.array([self.get_lex_repr(word) for word in words])
    # char embeddings
    if self.c_in_dim > 0:
        cfeatures = [self.get_c_repr(word, train=train) for word in words]
        if len(lex_features) > 0:
            lex_features = dynet.inputTensor(lex_features)
            features = [
                dynet.concatenate([w, c, l])
                for w, c, l in zip(wfeatures, cfeatures, lex_features)
            ]
        else:
            features = [
                dynet.concatenate([w, c])
                for w, c in zip(wfeatures, cfeatures)
            ]
    else:
        features = wfeatures
    if train:  # only do at training time
        features = [dynet.noise(fe, self.noise_sigma) for fe in features]
    return features
def evaluate_network(self, x_np, apply_final_activation=True, dropout=False):
    """
    return an expression that is the result of feeding the input through the
    entire network, except the last activation
    """
    # self.check_input_size(x_np)
    n_stages = self.n_layers - 1
    # conversion is skipped for x_np that are already
    # _dynet._tensorInputExpression or _dynet._vecInputExpression
    if type(x_np) == np.ndarray:
        print("ndarray")
        x = dy.vecInput(len(x_np))
        x.set(x_np)
    elif type(x_np) == list:
        print("list")
        x = dy.inputTensor(x_np, batched=True)
    else:
        x = x_np
    final_activation = self.output_activation if apply_final_activation else (lambda v: v)
    activation = self.hidden_activation
    for i, W, b in zip(range(n_stages), self.params["W"], self.params["b"]):
        if i == n_stages - 1:  # final layer
            activation = final_activation
        x = activation(W * x + b)
        if dropout:
            x = dy.dropout(x, DROPOUT_RATE)
    return x
def GCN(self, A, H):
    '''
    GCN: H_2 = RELU(D^{-0.5} * A * D^{-0.5} * H * W_GCN)
    '''
    D = np.diag(np.power(np.sum(A, axis=0), -0.5))
    C = np.dot(np.dot(D, A), D)
    H_2 = dy.rectify(dy.inputTensor(C) * H * self.W_GCN)
    return H_2
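# A small numeric check (hypothetical 3-node graph) of the symmetric
# normalization D^{-0.5} A D^{-0.5} computed above; A is assumed here to
# already contain self-loops, as in the adjacency matrices built elsewhere
# in these snippets.
import numpy as np

A = np.array([[1., 1., 0.],
              [1., 1., 1.],
              [0., 1., 1.]])
D = np.diag(np.power(A.sum(axis=0), -0.5))
C = np.dot(np.dot(D, A), D)  # each entry A_ij scaled by 1/sqrt(d_i * d_j)
print(C)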
def attend(self, encoded_inputs, h_t, input_masks=None):
    # encoded_inputs dimension is: seq len x 2*h x batch size,
    # h_t dimension is h x batch size (for bilstm encoder)
    if len(encoded_inputs) == 1:
        # no need to attend if only one input state, compute output directly
        h_output = dn.tanh(self.w_c * dn.concatenate([h_t, encoded_inputs[0]]))
        # return trivial alphas (all 1's since one input gets all attention)
        if input_masks:
            # if batching
            alphas = dn.inputTensor([1] * len(input_masks[0]), batched=True)
        else:
            alphas = dn.inputTensor([1], batched=True)
        return h_output, alphas

    # iterate through input states to compute attention scores
    # scores = [v_a * dn.tanh(w_a * h_t + u_a * h_input) for h_input in blstm_outputs]
    w_a_h_t = self.w_a * h_t
    scores = [
        self.v_a * dn.tanh(dn.affine_transform([w_a_h_t, self.u_a, h_input]))
        for h_input in encoded_inputs
    ]
    concatenated = dn.concatenate(scores)
    if input_masks:
        # if batching, multiply attention scores with input masks to zero-out
        # scores for padded inputs
        dn_masks = dn.inputTensor(input_masks, batched=True)
        concatenated = dn.cmult(concatenated, dn_masks)

    # normalize scores
    alphas = dn.softmax(concatenated)

    # compute context vector with weighted sum for each sequence in the batch
    bo = dn.concatenate_cols(encoded_inputs)
    c = bo * alphas
    # c = dn.esum([h_input * dn.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])

    # compute output vector using current decoder state and context vector
    h_output = dn.tanh(self.w_c * dn.concatenate([h_t, c]))
    return h_output, alphas
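# Minimal sketch (hypothetical sizes, not the original class) of the masking
# step above: the stacked attention scores are multiplied elementwise by a
# batched 0/1 mask built with dy.inputTensor(..., batched=True), zeroing the
# scores of padded positions before softmax normalization.
import numpy as np
import dynet as dy

dy.renew_cg()
scores = dy.inputTensor(np.random.rand(4, 2).astype(np.float32), batched=True)  # 4 states, batch of 2
masks = dy.inputTensor(np.array([[1., 1.], [1., 1.], [1., 0.], [1., 0.]],
                                dtype=np.float32), batched=True)
alphas = dy.softmax(dy.cmult(scores, masks))
print(alphas.npvalue().shape)  # (4, 2)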
def tree2graph(tree):
    '''
    Return the upper-triangular adjacency matrix of the tree.
    '''
    G_np = np.zeros((len(tree.sentence), len(tree.sentence)))
    for i in range(len(tree.sentence)):
        for j in range(i, len(tree.sentence)):
            if i == j:
                G_np[i][j] = 1
            label, crossing = tree.span_labels(i, j)
            label = label[::-1]
            if len(label) > 0:
                G_np[i, j] = 1
    G = dy.inputTensor(G_np)
    return G
def get_embeddings(self, word_inds, tag_inds, is_train=False, train_bert_embedding=None):
    if is_train:
        self.char_lstm.set_dropout(self.dropout)
    else:
        self.char_lstm.disable_dropout()

    embeddings = []
    for idx, (w, t) in enumerate(zip(word_inds, tag_inds)):
        if w > 2:
            count = self.vocab.word_freq_list[w]
            if not count or (is_train and np.random.rand() <
                             self.unk_param / (self.unk_param + count)):
                w = 0

        tag_embedding = self.tag_embeddings[t]
        chars = list(self.vocab.i2w[w]) if w > 2 else [self.vocab.i2w[w]]
        char_lstm_outputs = self.char_lstm.transduce([
            self.char_embeddings[self.vocab.c2i[char]]
            for char in [Vocabulary.START] + chars + [Vocabulary.STOP]
        ])
        char_embedding = dy.concatenate([
            char_lstm_outputs[-1][:self.char_lstm_dim],
            char_lstm_outputs[0][self.char_lstm_dim:]
        ])
        word_embedding = self.word_embeddings[w]
        embs = [tag_embedding, char_embedding, word_embedding]
        if train_bert_embedding is not None:
            if w != 0:
                embs.append(dy.inputTensor(train_bert_embedding[idx]))
            else:
                embs.append(dy.zeros(768))
        embeddings.append(dy.concatenate(embs))
    return embeddings
def calculate_loss(self, sents):
    dy.renew_cg()
    losses = []
    for sent in sents:
        features, t_features, feat_reconstruct = self.get_features_for_tagging(sent, True)
        gold_tags = [tag for chars, word, feats, tag in sent]
        cur_loss = self.crf_module.negative_log_loss(features, t_features, gold_tags)
        if self.autoencoder:
            autoencoder_loss = [
                dy.binary_log_loss(reconstruct, dy.inputTensor(feats))
                for reconstruct, (chars, word, feats, tag) in zip(feat_reconstruct, sent)
            ]
        else:
            # remove autoencoder loss
            autoencoder_loss = [dy.scalarInput(0)]
        losses.append(cur_loss + (dy.esum(autoencoder_loss) / self.featsize))
    return dy.esum(losses)
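# Hedged sketch of the autoencoder term above: dy.binary_log_loss compares a
# sigmoid reconstruction against the 0/1 feature vector wrapped with
# dy.inputTensor (the 4-dimensional feats here is a made-up stand-in).
import numpy as np
import dynet as dy

dy.renew_cg()
feats = np.array([1., 0., 1., 0.], dtype=np.float32)
reconstruct = dy.logistic(dy.inputTensor(np.random.randn(4).astype(np.float32)))
loss = dy.binary_log_loss(reconstruct, dy.inputTensor(feats))
print(loss.value())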
def main():
    parser = argparse.ArgumentParser(
        description='Convolutional Neural Networks for Sentence Classification in DyNet')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--train_x_path', type=str, default='./data/train_x.txt',
                        help='File path of train x data [default: `./data/train_x.txt`]')
    parser.add_argument('--train_y_path', type=str, default='./data/train_y.txt',
                        help='File path of train y data [default: `./data/train_y.txt`]')
    parser.add_argument('--valid_x_path', type=str, default='./data/valid_x.txt',
                        help='File path of valid x data [default: `./data/valid_x.txt`]')
    parser.add_argument('--valid_y_path', type=str, default='./data/valid_y.txt',
                        help='File path of valid y data [default: `./data/valid_y.txt`]')
    parser.add_argument('--n_epochs', type=int, default=10,
                        help='Number of epochs [default: 10]')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='Mini batch size [default: 64]')
    parser.add_argument('--win_sizes', type=int, nargs='*', default=[3, 4, 5],
                        help='Window sizes of filters [default: [3, 4, 5]]')
    parser.add_argument('--num_fil', type=int, default=100,
                        help='Number of filters in each window size [default: 100]')
    parser.add_argument('--s', type=float, default=3.0,
                        help='L2 norm constraint on w [default: 3.0]')
    parser.add_argument('--dropout_prob', type=float, default=0.5,
                        help='Dropout probability [default: 0.5]')
    parser.add_argument('--v_strategy', type=str, default='static',
                        help='Embedding strategy. '
                             'rand: Random initialization. '
                             'static: Load pretrained embeddings and do not update during the training. '
                             'non-static: Load pretrained embeddings and update during the training. '
                             '[default: static]')
    parser.add_argument('--alloc_mem', type=int, default=4096,
                        help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    N_EPOCHS = args.n_epochs
    WIN_SIZES = args.win_sizes
    BATCH_SIZE = args.batch_size
    EMB_DIM = 300
    OUT_DIM = 1
    L2_NORM_LIM = args.s
    NUM_FIL = args.num_fil
    DROPOUT_PROB = args.dropout_prob
    V_STRATEGY = args.v_strategy
    ALLOC_MEM = args.alloc_mem

    if V_STRATEGY in ['rand', 'static', 'non-static']:
        NUM_CHA = 1
    else:
        NUM_CHA = 2

    # FILE paths
    W2V_PATH = './GoogleNews-vectors-negative300.bin'
    TRAIN_X_PATH = args.train_x_path
    TRAIN_Y_PATH = args.train_y_path
    VALID_X_PATH = args.valid_x_path
    VALID_Y_PATH = args.valid_y_path

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load pretrained embeddings
    pretrained_model = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)
    vocab = pretrained_model.wv.vocab.keys()
    w2v = pretrained_model.wv

    # Build dataset ===========================================================
    w2c = build_w2c(TRAIN_X_PATH, vocab=vocab)
    w2i, i2w = build_w2i(TRAIN_X_PATH, w2c, unk='unk')
    train_x, train_y = build_dataset(TRAIN_X_PATH, TRAIN_Y_PATH, w2i, unk='unk')
    valid_x, valid_y = build_dataset(VALID_X_PATH, VALID_Y_PATH, w2i, unk='unk')

    train_x, train_y = sort_data_by_length(train_x, train_y)
    valid_x, valid_y = sort_data_by_length(valid_x, valid_y)

    VOCAB_SIZE = len(w2i)
    print('VOCAB_SIZE:', VOCAB_SIZE)

    V_init = init_V(w2v, w2i)

    with open(os.path.join(RESULTS_DIR, './w2i.dump'), 'wb') as f_w2i, \
            open(os.path.join(RESULTS_DIR, './i2w.dump'), 'wb') as f_i2w:
        pickle.dump(w2i, f_w2i)
        pickle.dump(i2w, f_i2w)

    # Build model =============================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    # V1
    V1 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    if V_STRATEGY in ['static', 'non-static', 'multichannel']:
        V1.init_from_array(V_init)
    if V_STRATEGY in ['static', 'multichannel']:
        V1_UPDATE = False
    else:  # 'rand', 'non-static'
        V1_UPDATE = True
    make_emb_zero(V1, [w2i['<s>'], w2i['</s>']], EMB_DIM)

    # V2
    if V_STRATEGY == 'multichannel':
        V2 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
        V2.init_from_array(V_init)
        V2_UPDATE = True
        make_emb_zero(V2, [w2i['<s>'], w2i['</s>']], EMB_DIM)

    layers = [
        CNNText(model, EMB_DIM, WIN_SIZES, NUM_CHA, NUM_FIL, dy.tanh, DROPOUT_PROB),
        Dense(model, 3 * NUM_FIL, OUT_DIM, dy.logistic)
    ]

    # Train model =============================================================
    n_batches_train = math.ceil(len(train_x) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_x) / BATCH_SIZE)

    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        pred_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(train_x[start:end], w2i, max(WIN_SIZES)).T
            t = np.array(train_y[start:end])
            sen_len = x.shape[0]

            if V_STRATEGY in ['rand', 'static', 'non-static']:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)

            # Forward prop
            y = forwards(layers, x_embs, test=False)
            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))
            loss_all_train.append(mb_loss.value())
            pred_all_train.extend(list(binary_pred(y.npvalue().flatten())))

            # Backward prop
            mb_loss.backward()
            trainer.update()

            # L2 norm constraint
            layers[1].scale_W(L2_NORM_LIM)

            # Make padding embs zero
            if V_STRATEGY in ['rand', 'non-static']:
                make_emb_zero(V1, [w2i['<s>'], w2i['</s>']], EMB_DIM)
            elif V_STRATEGY in ['multichannel']:
                make_emb_zero(V2, [w2i['<s>'], w2i['</s>']], EMB_DIM)

        # Valid
        loss_all_valid = []
        pred_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(valid_x[start:end], w2i, max(WIN_SIZES)).T
            t = np.array(valid_y[start:end])
            sen_len = x.shape[0]

            if V_STRATEGY in ['rand', 'static', 'non-static']:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)

            # Forward prop
            y = forwards(layers, x_embs, test=True)
            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))
            loss_all_valid.append(mb_loss.value())
            pred_all_valid.extend(list(binary_pred(y.npvalue().flatten())))

        print('EPOCH: %d, Train Loss:: %.3f (F1:: %.3f, Acc:: %.3f), '
              'Valid Loss:: %.3f (F1:: %.3f, Acc:: %.3f), Time:: %.3f[s]' % (
                  epoch + 1,
                  np.mean(loss_all_train),
                  f1_score(train_y, pred_all_train),
                  accuracy_score(train_y, pred_all_train),
                  np.mean(loss_all_valid),
                  f1_score(valid_y, pred_all_valid),
                  accuracy_score(valid_y, pred_all_valid),
                  time.time() - start_time,
              ))

        # Save model ==========================================================
        if V_STRATEGY in ['rand', 'static', 'non-static']:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)),
                    [V1] + layers)
        else:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)),
                    [V1, V2] + layers)
def get_features_for_tagging(self, sentence, training):
    word_feats = [
        dy.affine_transform([
            self.feat_b,
            self.feat_w,
            dy.inputTensor(feats.reshape(self.featsize, 1)),
        ])
        for chars, word, feats, tag in sentence
    ]
    zero_feats = [
        dy.inputTensor(np.zeros(shape=(FEAT_OUT_SIZE, 1)))
        for chars, word, feats, tag in sentence
    ]

    # Non-linear transform for soft gazetteer features
    if self.feat_func == "tanh":
        word_feats = [dy.tanh(feat) for feat in word_feats]
    elif self.feat_func == "relu":
        word_feats = [dy.rectify(feat) for feat in word_feats]

    # Soft gazetteer features at the LSTM level
    if self.lstm_feats:
        cur_feats = word_feats
    else:
        cur_feats = zero_feats
    word_reps = [
        dy.concatenate([self.cnn.encode(chars, training), self.word_embeds[word], enc_feat])
        for enc_feat, (chars, word, feats, tag) in zip(cur_feats, sentence)
    ]
    contexts = self.word_lstm.transduce(word_reps)

    # Soft gazetteer features at the CRF level
    if self.crf_feats:
        cur_feats = word_feats
    else:
        cur_feats = zero_feats
    features = [
        dy.affine_transform([
            self.context_to_emit_b,
            self.context_to_emit_w,
            dy.concatenate([context, feats]),
        ])
        for context, feats in zip(contexts, cur_feats)
    ]
    t_features = [
        dy.reshape(
            dy.affine_transform([
                self.context_to_trans_b,
                self.context_to_trans_w,
                dy.concatenate([context, feats]),
            ]),
            (self.num_tags, self.num_tags),
        )
        for context, feats in zip(contexts, cur_feats)
    ]

    # Autoencoder feature reconstruction
    if self.lstm_feats:
        feat_reconstruct = [
            dy.logistic(
                dy.affine_transform(
                    [self.feat_reconstruct_b, self.feat_reconstruct_w, context]))
            for context in contexts
        ]
    else:
        feat_reconstruct = [
            dy.inputTensor(np.zeros(shape=(self.featsize,))) for context in contexts
        ]
    return features, t_features, feat_reconstruct
def generator(encoder, decoder, params_encoder, params_decoder, sentence, env,
              first, previous):
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]

    # encode each world state with the character encoder
    sc_vector = []
    for i, world in enumerate(_state(env)):
        sc = char_encoder.initial_state()
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)

    # encode the input sentence
    s = encoder.initial_state()
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]
    sentence = sentence + ' <end>'
    sentence = [
        vocab.index(c) if c in vocab else vocab.index('<unknown>')
        for c in sentence.split()
    ]
    s_vector = []
    generate = []
    for word in sentence:
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)

    # decode, attending over both the sentence and the world-state encodings
    _s = decoder.initial_state(s.s())
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    input_word = "<start>"
    _lookup = params_decoder["lookup"]
    repeat = 0
    while True:
        dy_env = dy.inputTensor(get_state_embed3(env))
        repeat += 1
        word = vocab_out.index(input_word)
        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(
            dy.concatenate([
                char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                for x in sc_vector
            ]))
        encode_state = dy_sc_vector * weight_char
        encode_output = dy_s_vector * weight
        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)

        # pick the highest-scoring word that leads to a valid world state
        top = 0
        while True:
            top += 1
            if top == 50:
                top = 1
                break
            prediction = np.argsort(probs.vec_value())[-top]
            if vocab_out[prediction] == '<end>':
                break
            if vocab_out[prediction] == '<start>':
                continue
            new_env = str(execute(env, [vocab_out[prediction]]))
            if new_env == 'None':
                continue
            break
        prediction = np.argsort(probs.vec_value())[-top]
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            break
        if repeat >= 10:
            break
        generate.append(input_word)
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return generate, previous
def do_one_sentence(encoder, decoder, params_encoder, params_decoder, sentence,
                    output, env, first, previous):
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]

    # encode each world state with the character encoder
    sc_vector = []
    for i, world in enumerate(_state(env)):
        sc = char_encoder.initial_state()
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)

    # encode the input sentence
    s = encoder.initial_state()
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]
    sentence = sentence + ' <end>'
    sentence = [
        vocab.index(c) if c in vocab else vocab.index('<unknown>')
        for c in sentence.split(' ')
    ]
    loss = []
    generate = []
    s_vector = []
    for word in sentence:
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)

    # decode with teacher forcing against the gold output sequence
    _s = decoder.initial_state(s.s())
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    index = 1
    input_word = "<start>"
    _lookup = params_decoder["lookup"]
    while True:
        dy_env = dy.inputTensor(get_state_embed3(env))
        word = vocab_out.index(input_word)
        gt_y = vocab_out.index(output[index])
        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(
            dy.concatenate([
                char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                for x in sc_vector
            ]))
        encode_output = dy_s_vector * weight
        encode_state = dy_sc_vector * weight_char
        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)
        prediction = np.argsort(probs.npvalue())[-1]
        if vocab_out[prediction] == '<start>':
            prediction = np.argsort(probs.npvalue())[-2]
        generate.append(vocab_out[prediction])
        loss.append(-dy.log(dy.pick(probs, gt_y)))
        if output[index] == '<end>':
            break
        index += 1
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            continue
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
    loss = dy.esum(loss)
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return loss, generate, previous
def get_graph(self, embedding):
    dy.renew_cg()
    w = dy.parameter(self.pW)
    u = dy.parameter(self.pU)
    return u * dy.tanh(w * dy.inputTensor(embedding))