def build_model(first_level, model, emb_doc, doc_labels, w_param, b_param):
    """
    Runs the model for training, calculating the loss.
    @params:
        first_level is I, O, or P,
        model is the LSTM model,
        emb_doc is a numpy array of embeddings for one document,
        doc_labels is a list of the labels associated with emb_doc,
        w_param is a Dynet parameter multiplied with the layer output,
        b_param is a Dynet parameter added to the product of output and w_param.
    @returns: the sum of the errors computed for the document
    """
    dy.renew_cg()
    s = model.initial_state()
    i = dy.vecInput(200)
    o = dy.vecInput(200)
    p = dy.vecInput(200)
    si = s.add_input(i)
    so = s.add_input(o)
    sp = s.add_input(p)
    loss = []
    for wdemb, label in zip(emb_doc, doc_labels):
        x = dy.inputVector(wdemb)
        # noise for student model; dy.noise returns a new expression, so the
        # result must be reassigned for the noise to take effect
        x = dy.noise(x, 0.5)
        if first_level == 'I':
            s2 = si.add_input(x)
        elif first_level == 'O':
            s2 = so.add_input(x)
        else:
            s2 = sp.add_input(x)
        loss.append(dy.pickneglogsoftmax((w_param * s2.output()) + b_param, label))
    return dy.esum(loss)
def predict(self, word_indices, char_indices, task_id, train=False):
    """
    predict tags for a sentence represented as char+word embeddings
    """
    # word embeddings
    wfeatures = [self.wembeds[w] for w in word_indices]

    # char embeddings
    if self.c_in_dim > 0:
        char_emb = []
        rev_char_emb = []
        # get representation for words
        for chars_of_token in char_indices:
            char_feats = [self.cembeds[c] for c in chars_of_token]
            # use last state as word representation
            f_char, b_char = self.char_rnn.predict_sequence(char_feats, char_feats)
            last_state = f_char[-1]
            rev_last_state = b_char[-1]
            char_emb.append(last_state)
            rev_char_emb.append(rev_last_state)
        features = [dynet.concatenate([w, c, rev_c])
                    for w, c, rev_c in zip(wfeatures, char_emb, rev_char_emb)]
    else:
        features = wfeatures

    if train:  # only do at training time
        features = [dynet.noise(fe, self.noise_sigma) for fe in features]

    output_expected_at_layer = self.predictors["task_expected_at"][task_id]
    output_expected_at_layer -= 1

    # go through layers; input is now the combination of word + char embeddings
    prev = features
    prev_rev = features
    num_layers = self.h_layers
    for i in range(0, num_layers):
        predictor = self.predictors["inner"][i]
        forward_sequence, backward_sequence = predictor.predict_sequence(prev, prev_rev)
        if i > 0 and self.activation:
            # activation between LSTM layers
            forward_sequence = [self.activation(s) for s in forward_sequence]
            backward_sequence = [self.activation(s) for s in backward_sequence]

        if i == output_expected_at_layer:
            output_predictor = self.predictors["output_layers_dict"][task_id]
            concat_layer = [dynet.concatenate([f, b])
                            for f, b in zip(forward_sequence, reversed(backward_sequence))]
            if train and self.noise_sigma > 0.0:
                concat_layer = [dynet.noise(fe, self.noise_sigma) for fe in concat_layer]
            output = output_predictor.predict_sequence(concat_layer)
            return output

        prev = forward_sequence
        prev_rev = backward_sequence

    raise Exception("oops should not be here")
    return None
def build_tagging_graph_lvl1(words, tags, builders):
    dy.renew_cg()
    f_init, b_init = [b.initial_state() for b in builders]

    wembs = [E[w] for w in words]
    wembs = [dy.noise(we, 0.1) for we in wembs]

    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]
    # fw_rnn_hidden_outs = [x.value() for x in fw]
    # bw_rnn_hidden_outs = [x.value() for x in bw]
    # print("Transducing")
    # fw_rnn_hidden_outs = f_init.transduce(wembs)
    # bw_rnn_hidden_outs = b_init.transduce(reversed(wembs))

    if MLP:
        H = dy.parameter(pH)
        O = dy.parameter(pO)
    else:
        O = dy.parameter(pO)

    errs = []
    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        if MLP:
            r_t = O * (dy.tanh(H * f_b))
        else:
            r_t = O * f_b
        err = dy.pickneglogsoftmax(r_t, t)
        errs.append(err)
    return {'err': dy.esum(errs), 'fw': fw, 'bw': bw}
def build_tagging_graph(self, words, tags):
    dy.renew_cg()
    f_init, b_init = [b.initial_state() for b in self.first_layer_builders]

    wembs = [self.E[w] for w in words]
    wembs = [dy.noise(we, 0.1) for we in wembs]

    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

    errs = []
    output_from_first_layer = [dy.concatenate([f, b]) for f, b in zip(fw, reversed(bw))]

    f_init, b_init = [b.initial_state() for b in self.second_layer_builders]
    fw = [x.output() for x in f_init.add_inputs(output_from_first_layer)]
    bw = [x.output() for x in b_init.add_inputs(reversed(output_from_first_layer))]

    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        r_t = self.pO * f_b
        err = dy.pickneglogsoftmax(r_t, t)
        errs.append(err)
    return dy.esum(errs)
def build_tagging_graph(words, tags, builders):
    dy.renew_cg()
    f_init, b_init = [b.initial_state() for b in builders]

    wembs = [E[w] for w in words]
    wembs = [dy.noise(we, 0.1) for we in wembs]

    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

    if MLP:
        H = dy.parameter(pH)
        O = dy.parameter(pO)
    else:
        O = dy.parameter(pO)

    errs = []
    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        if MLP:
            r_t = O * (dy.tanh(H * f_b))
        else:
            r_t = O * f_b
        err = dy.pickneglogsoftmax(r_t, t)
        errs.append(err)
    return dy.esum(errs)
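# --- Illustrative usage (not from the original source) ---
# A minimal sketch of how a loss graph like build_tagging_graph above is
# typically driven; `train_data`, `builders`, and `trainer` are assumed to be
# created elsewhere (e.g. trainer = dy.SimpleSGDTrainer(model)).
import random

def train_epoch(train_data, builders, trainer):
    random.shuffle(train_data)
    for sent in train_data:
        words = [w for w, _ in sent]   # word ids
        tags = [t for _, t in sent]    # tag ids
        loss = build_tagging_graph(words, tags, builders)
        loss.value()      # run the forward pass
        loss.backward()   # run the backward pass
        trainer.update()  # update parameters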
def embed(self, x):
    if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
        batch_size = x.batch_size() if xnmt.batcher.is_batched(x) else 1
        self.word_id_mask = [set(np.random.choice(self.vocab_size,
                                                  int(self.vocab_size * self.word_dropout),
                                                  replace=False))
                             for _ in range(batch_size)]
    # single mode
    if not xnmt.batcher.is_batched(x):
        if self.train and self.word_id_mask and x in self.word_id_mask[0]:
            ret = dy.zeros((self.emb_dim,))
        else:
            ret = self.embeddings[x]
            if self.fix_norm is not None:
                ret = dy.cdiv(ret, dy.l2_norm(ret))
                if self.fix_norm != 1:
                    ret *= self.fix_norm
    # minibatch mode
    else:
        ret = self.embeddings.batch(x)
        if self.fix_norm is not None:
            ret = dy.cdiv(ret, dy.l2_norm(ret))
            if self.fix_norm != 1:
                ret *= self.fix_norm
        if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(x.batch_size())):
            dropout_mask = dy.inputTensor(
                np.transpose([[0.0] * self.emb_dim if x[i] in self.word_id_mask[i] else [1.0] * self.emb_dim
                              for i in range(x.batch_size())]),
                batched=True)
            ret = dy.cmult(ret, dropout_mask)
    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret
def build_tagging_graph(self, words):
    dy.renew_cg()
    # Initialize the LSTMs
    f_init = self.fwdRNN.initial_state()
    b_init = self.bwdRNN.initial_state()
    cf_init = self.cFwdRNN.initial_state()
    cb_init = self.cBwdRNN.initial_state()

    # Get the word vectors, a 128-dim vector expression for each word.
    if self.hp.dynamic:
        wembs = [self.dynamic_rep(w, cf_init, cb_init) for w in words]
    else:
        wembs = [self.word_and_char_rep(w, cf_init, cb_init) for w in words]
    if self.hp.noise > 0:
        wembs = [dy.noise(we, self.hp.noise) for we in wembs]

    # Feed word vectors into biLSTM
    fw_exps = f_init.transduce(wembs)
    bw_exps = b_init.transduce(reversed(wembs))

    # biLSTM states
    bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]

    # Feed each biLSTM state to an MLP
    return [self.pO * (dy.tanh(self.pH * x)) for x in bi_exps]
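# --- Illustrative usage (not from the original source) ---
# Sketch of consuming the per-word score expressions returned above: pick the
# gold tag's negative log-softmax for a training loss, or argmax the scores
# for prediction. `tagger`, `words`, and `gold_tags` are assumed to exist.
import dynet as dy
import numpy as np

scores = tagger.build_tagging_graph(words)
train_loss = dy.esum([dy.pickneglogsoftmax(s, t) for s, t in zip(scores, gold_tags)])
pred_tags = [np.argmax(s.npvalue()) for s in scores]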
def build_tagging_graph_for_chars(self, words):
    # self.lstm = dy.LSTMBuilder(NUM_LAYERS, INPUT_DIM, HIDDEN_DIM, model)
    wembs = self.convert_words_to_vecs(words)
    # wembs = [self.E[w] for w in words]
    wembs = [dy.noise(we, 0.1) for we in wembs]

    f_init, b_init = [b.initial_state() for b in self.char_flow_first_layer]
    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]
    output_from_first_layer = [dy.concatenate([f, b]) for f, b in zip(fw, reversed(bw))]

    f_init, b_init = [b.initial_state() for b in self.char_flow_second_layer]
    fw = [x.output() for x in f_init.add_inputs(output_from_first_layer)]
    bw = [x.output() for x in b_init.add_inputs(reversed(output_from_first_layer))]

    vector_result = []
    for f, b in zip(fw, reversed(bw)):
        f_b = dy.concatenate([f, b])
        vector_result.append(f_b)
    return vector_result
def embed(self, x: Union[batchers.Batch, numbers.Integral]) -> dy.Expression:
    if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
        batch_size = x.batch_size() if batchers.is_batched(x) else 1
        self.word_id_mask = [set(np.random.choice(self.vocab_size,
                                                  int(self.vocab_size * self.word_dropout),
                                                  replace=False))
                             for _ in range(batch_size)]
    emb_e = dy.parameter(self.embeddings)
    # single mode
    if not batchers.is_batched(x):
        if self.train and self.word_id_mask and x in self.word_id_mask[0]:
            ret = dy.zeros((self.emb_dim,))
        else:
            ret = dy.pick(emb_e, index=x)
            if self.fix_norm is not None:
                ret = dy.cdiv(ret, dy.l2_norm(ret))
                if self.fix_norm != 1:
                    ret *= self.fix_norm
    # minibatch mode
    else:
        ret = dy.pick_batch(emb_e, x)
        if self.fix_norm is not None:
            ret = dy.cdiv(ret, dy.l2_norm(ret))
            if self.fix_norm != 1:
                ret *= self.fix_norm
        if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(x.batch_size())):
            dropout_mask = dy.inputTensor(
                np.transpose([[0.0] * self.emb_dim if x[i] in self.word_id_mask[i] else [1.0] * self.emb_dim
                              for i in range(x.batch_size())]),
                batched=True)
            ret = dy.cmult(ret, dropout_mask)
    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret
def get_embeddings(self, words, is_train):
    if is_train:
        self.char_lstm.set_dropout(self.dropout)
    else:
        self.char_lstm.disable_dropout()

    embeddings = []
    for pos, word in enumerate(words):
        count = self.word_vocab.count(word)
        if not count:
            word = UNK
        chars = list(word)
        char_lstm_outputs = self.char_lstm.transduce([
            self.char_embeddings[self.char_vocab.index_or_unk(char, UNK)]
            for char in chars
        ])
        char_embedding = dy.concatenate([
            char_lstm_outputs[-1][:self.char_lstm_dim],
            char_lstm_outputs[0][self.char_lstm_dim:]
        ])
        word_embedding = self.word_embeddings[self.word_vocab.index(word)]
        pos_embedding = self.pos_embeddings[pos]
        embeddings.append(dy.concatenate([char_embedding, word_embedding, pos_embedding]))
    embeddings = [dy.noise(e, 0.1) for e in embeddings]
    return embeddings
def get_embeddings(self, words, is_train):
    if is_train:
        self.char_lstm.set_dropout(self.dropout)
    else:
        self.char_lstm.disable_dropout()

    embeddings = []
    for word in [START] + words + [STOP]:
        count = self.word_vocab.count(word)
        if not count or (is_train and np.random.rand() < 1 / (1 + count)):
            word = UNK
        chars = list(word) if word not in (START, STOP) else [word]
        char_lstm_outputs = self.char_lstm.transduce([
            self.char_embeddings[self.char_vocab.index_or_unk(char, UNK)]
            for char in [START] + chars + [STOP]
        ])
        char_embedding = dy.concatenate([
            char_lstm_outputs[-1][:self.char_lstm_dim],
            char_lstm_outputs[0][self.char_lstm_dim:]
        ])
        word_embedding = self.word_embeddings[self.word_vocab.index(word)]
        embeddings.append(dy.concatenate([char_embedding, word_embedding]))
    embeddings = [dy.noise(e, 0.1) for e in embeddings]
    return embeddings
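# --- Illustrative breakdown (not from the original source) ---
# The UNK-replacement rule used above, in isolation: a word seen `count` times
# in training is mapped to UNK with probability 1/(1+count), so rare words are
# replaced by UNK often and frequent words almost never. The helper name is
# hypothetical.
import numpy as np

def maybe_unk(word, count, is_train, UNK="<UNK>"):
    if not count or (is_train and np.random.rand() < 1.0 / (1 + count)):
        return UNK
    return word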
def build_tagging_graph(words, tags, builders, topic):
    dy.renew_cg()
    f_init, b_init = [b.initial_state() for b in builders]

    # embeddings
    wembs = [E[w] for w in words]
    wembs = [dy.noise(we, 0.1) for we in wembs]

    # bilstm
    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

    # MLP for tag prediction
    H = dy.parameter(pH)
    O = dy.parameter(pO)
    errs = []
    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        r_t = O * (dy.tanh(H * f_b))
        # r_t = O * f_b
        err = dy.pickneglogsoftmax(r_t, t)
        errs.append(err)

    # add an extra layer with MLP to predict topic
    if TOPIC:
        # aux_layer = dy.reshape(dy.parameter(topic_olayer) * dy.parameter(topic_hlayer), (5000, 1)) * f_b[-1]
        aux_layer = dy.parameter(topic_olayer) * (dy.tanh(dy.parameter(topic_hlayer) * f_b))
        aux_loss = dy.pickneglogsoftmax(aux_layer, topic)
        errs.append(aux_loss)
    return dy.esum(errs)
def build_tagging_graph(self, words, tags, builders):
    """
    Builds the graph for a single sentence.
    :param words:
    :param tags:
    :param builders:
    :return:
    """
    dy.renew_cg()
    f_init, b_init = [b.initial_state() for b in builders]

    wembs = [self.params["E"][w] for w in words]
    wembs = [dy.noise(we, 0.1) for we in wembs]

    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

    if self.use_mlp:
        H = dy.parameter(self.params["H"])
        O = dy.parameter(self.params["O"])
    else:
        O = dy.parameter(self.params["O"])

    errs = []
    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        if self.use_mlp:
            r_t = O * (dy.tanh(H * f_b))
        else:
            r_t = O * f_b
        err = dy.pickneglogsoftmax(r_t, t)
        errs.append(err)
    return dy.esum(errs)
def build_tagging_graph(self, words, tags):
    dy.renew_cg()
    # self.lstm = dy.LSTMBuilder(NUM_LAYERS, INPUT_DIM, HIDDEN_DIM, model)
    wembs = self.convert_words_to_vecs(words)
    wembs = [dy.noise(we, 0.1) for we in wembs]

    f_init, b_init = [b.initial_state() for b in self.first_layer]
    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]
    output_from_first_layer = [dy.concatenate([f, b]) for f, b in zip(fw, reversed(bw))]

    f_init, b_init = [b.initial_state() for b in self.second_layer]
    fw = [x.output() for x in f_init.add_inputs(output_from_first_layer)]
    bw = [x.output() for x in b_init.add_inputs(reversed(output_from_first_layer))]

    errs = []
    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        r_t = self.pO * f_b
        err = dy.pickneglogsoftmax(r_t, t)
        errs.append(err)
    return dy.esum(errs)
def __call__(self, p, train=True):
    """
    Args:
        p: DyNet parameter (not expression)
        train: only apply noise if True

    Returns:
        DyNet expression with weight noise applied if self.std > 0
    """
    p_expr = dy.parameter(p)
    if self.std > 0.0 and train:
        p_expr = dy.noise(p_expr, self.std)
    return p_expr
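# --- Illustrative usage (not from the original source) ---
# Sketch of applying the weight-noise wrapper above. Only __call__ is shown in
# the snippet, so the class name `WeightNoise` and its constructor (storing
# self.std) are assumptions made here for illustration.
import dynet as dy

pc = dy.ParameterCollection()
W = pc.add_parameters((10, 20))

noiser = WeightNoise(std=0.1)     # hypothetical constructor storing self.std
dy.renew_cg()
W_train = noiser(W, train=True)   # expression with Gaussian noise added
W_eval = noiser(W, train=False)   # plain dy.parameter(W), no noise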
def build_tagging_graph_old(words, tags, template, builders, train=True):
    dy.renew_cg()
    if train and args.lstm_dropout is not None and args.lstm_dropout > 0:
        for b in builders:
            b.set_dropouts(args.lstm_dropout, args.lstm_dropout)
    f_init, b_init = [b.initial_state() for b in builders]

    wembs = [dy.lookup(pEmbedding, w) for w in words]
    if train:
        # Add noise in training as a regularizer
        wembs = [dy.noise(we, args.train_noise) for we in wembs]

    fw_states = [x for x in f_init.add_inputs(wembs)]
    bw_states = [x for x in b_init.add_inputs(reversed(wembs))]
    fw = [x.output() for x in fw_states]
    bw = [x.output() for x in bw_states]

    O = dy.parameter(pOutput)
    if args.mlp:
        H = dy.parameter(pHidden)

    errs = []
    pred_tags = []
    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        if args.mlp:
            f_b = dy.tanh(H * f_b)
        r_t = O * f_b
        if train:
            err = dy.pickneglogsoftmax(r_t, t)
            errs.append(err)
        else:
            out = dy.softmax(r_t)
            chosen = np.argmax(out.npvalue())
            pred_tags.append(vocab_tags.i2w[chosen])

    O_template = dy.parameter(pOutputTemplate)
    H_template = dy.parameter(pHiddenTemplate)
    f_bt = dy.concatenate([fw_states[-1].s()[0], bw_states[-1].s()[0]])
    f_bt = dy.tanh(H_template * f_bt)
    r_tt = O_template * f_bt
    pred_template = None
    if train:
        err = dy.pickneglogsoftmax(r_tt, template)
        errs.append(err)
    else:
        out = dy.softmax(r_tt)
        chosen = np.argmax(out.npvalue())
        pred_template = vocab_templates.i2w[chosen]
    return pred_tags, pred_template, errs
def embed(self, x: Union[batchers.Batch, numbers.Integral]) -> dy.Expression:
    """
    Embed a single word in a sentence.

    :param x: A word id.
    :return: Embedded word.
    """
    ret = self._embed_word(x, batchers.is_batched(x))
    ## Apply fix-norm normalization
    if self.fix_norm is not None:
        ret = dy.cdiv(ret, dy.l2_norm(ret)) * self.fix_norm
    ## Weight noise only when training
    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret
def get_base_embeddings(trainmode, unkdtokens, tg_start, sentence):
    sentlen = len(unkdtokens)

    if trainmode:
        emb_x = [dy.noise(v_x[tok], 0.1) for tok in unkdtokens]
    else:
        emb_x = [v_x[tok] for tok in unkdtokens]
    pos_x = [p_x[pos] for pos in sentence.postags]
    dist_x = [dy.scalarInput(i - tg_start + 1) for i in range(sentlen)]

    baseinp_x = [(w_i * dy.concatenate([emb_x[j], pos_x[j], dist_x[j]]) + b_i)
                 for j in range(sentlen)]

    if USE_WV:
        for j in range(sentlen):
            if unkdtokens[j] in wvs:
                nonupdatedwv = dy.nobackprop(e_x[unkdtokens[j]])
                baseinp_x[j] = baseinp_x[j] + w_e * nonupdatedwv + b_e

    embposdist_x = [dy.rectify(baseinp_x[j]) for j in range(sentlen)]

    if USE_DROPOUT:
        basefwdlstm.set_dropout(DROPOUT_RATE)
        baserevlstm.set_dropout(DROPOUT_RATE)

    bfinit = basefwdlstm.initial_state()
    basefwd = bfinit.transduce(embposdist_x)
    brinit = baserevlstm.initial_state()
    baserev = brinit.transduce(reversed(embposdist_x))

    basebi_x = [dy.rectify(w_bi * dy.concatenate([basefwd[eidx], baserev[sentlen - eidx - 1]]) + b_bi)
                for eidx in range(sentlen)]

    if USE_DEPS:
        dhead_x = [embposdist_x[dephead] for dephead in sentence.depheads]
        dheadp_x = [pos_x[dephead] for dephead in sentence.depheads]
        drel_x = [dr_x[deprel] for deprel in sentence.deprels]
        baseinp_x = [dy.rectify(w_di * dy.concatenate([dhead_x[j], dheadp_x[j], drel_x[j], basebi_x[j]]) + b_di)
                     for j in range(sentlen)]
        basebi_x = baseinp_x

    return basebi_x
def _build_computation_graph(self, words, train_mode=True):
    """
    Builds the computational graph.
    """
    dy.renew_cg()
    # turn parameters into expressions
    softmax_weight_exp = dy.parameter(self.softmax_weight)
    softmax_bias_exp = dy.parameter(self.softmax_bias)

    # initialize the RNNs
    f_init = self.fwd_word_rnn.initial_state()
    b_init = self.bwd_word_rnn.initial_state()
    # cf_init = self.fwd_char_rnn.initial_state()
    # cb_init = self.bwd_char_rnn.initial_state()

    # only use word-level for now
    word_reps = [self._word_rep(word) for word in words]
    if train_mode and self.add_word_noise:
        word_reps = [dy.noise(word_rep, 0.05) for word_rep in word_reps]

    # feed word vectors into biLSTM
    fw_exps = f_init.transduce(word_reps)
    bw_exps = b_init.transduce(reversed(word_reps))

    if self.pooling_method == "last":
        average_lstm = dy.concatenate([fw_exps[-1], bw_exps[-1]])
    else:
        bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]
        bi_exps = dy.concatenate(bi_exps, d=1)
        if self.pooling_method == "average":
            average_lstm = dy.mean_dim(bi_exps, d=1)
        elif self.pooling_method == "max":
            average_lstm = dy.max_dim(bi_exps, d=1)
        else:
            raise NotImplementedError

    if self.average_dropout is not None:
        average_lstm = dy.dropout(average_lstm, p=self.average_dropout)

    return softmax_weight_exp * average_lstm + softmax_bias_exp
def build_tagging_graph(words):
    dy.renew_cg()
    # parameters -> expressions
    H = dy.parameter(pH)
    O = dy.parameter(pO)

    # initialize the RNNs
    f_init = fwdRNN.initial_state()
    b_init = bwdRNN.initial_state()
    cf_init = cFwdRNN.initial_state()
    cb_init = cBwdRNN.initial_state()

    # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word.
    wembs = [word_rep(w, cf_init, cb_init) for w in words]
    wembs = [dy.noise(we, 0.2) for we in wembs]  # optional

    # feed word vectors into biLSTM
    fw_exps = f_init.transduce(wembs)
    bw_exps = b_init.transduce(reversed(wembs))
    # OR
    # fw_exps = []
    # s = f_init
    # for we in wembs:
    #     s = s.add_input(we)
    #     fw_exps.append(s.output())
    # bw_exps = []
    # s = b_init
    # for we in reversed(wembs):
    #     s = s.add_input(we)
    #     bw_exps.append(s.output())

    # biLSTM states
    bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]

    # feed each biLSTM state to an MLP
    exps = []
    for x in bi_exps:
        r_t = O * (dy.tanh(H * x))
        exps.append(r_t)
    return exps
def build_tagging_graph(self, words, tags):
    dy.renew_cg()
    prefix_indices = [p[0] for p in words]
    suffix_indices = [p[2] for p in words]
    words = [p[1] for p in words]

    wembs = []
    for w, p, s in zip(words, prefix_indices, suffix_indices):
        we = self.E[w]
        pe = self.preffix[p]
        se = self.suffix[s]
        wembs.append(dy.esum([we, pe, se]))

    f_init, b_init = [b.initial_state() for b in self.first_layer]
    # wembs = [self.Word_E[w] for w in words]
    wembs = [dy.noise(we, 0.1) for we in wembs]
    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]
    output_from_first_layer = [dy.concatenate([f, b]) for f, b in zip(fw, reversed(bw))]

    f_init, b_init = [b.initial_state() for b in self.second_layer]
    fw = [x.output() for x in f_init.add_inputs(output_from_first_layer)]
    bw = [x.output() for x in b_init.add_inputs(reversed(output_from_first_layer))]

    errs = []
    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        r_t = self.pO * f_b
        err = dy.pickneglogsoftmax(r_t, t)
        errs.append(err)
    return dy.esum(errs)
def update_batch(self, words_batch, tags_batch):
    dynet.renew_cg()
    length = max(len(words) for words in words_batch)

    word_ids = np.zeros((length, len(words_batch)), dtype='int32')
    for j, words in enumerate(words_batch):
        for i, word in enumerate(words):
            word_ids[i, j] = self.vw.w2i.get(word, self.UNK)
    tag_ids = np.zeros((length, len(words_batch)), dtype='int32')
    for j, tags in enumerate(tags_batch):
        for i, tag in enumerate(tags):
            tag_ids[i, j] = self.vt.w2i.get(tag, self.UNK)

    wembs = [dynet.lookup_batch(self._E, word_ids[i]) for i in range(length)]
    wembs = [dynet.noise(we, 0.1) for we in wembs]

    f_state = self._fwd_lstm.initial_state()
    b_state = self._bwd_lstm.initial_state()
    fw = [x.output() for x in f_state.add_inputs(wembs)]
    bw = [x.output() for x in b_state.add_inputs(reversed(wembs))]

    H = dynet.parameter(self._pH)
    O = dynet.parameter(self._pO)
    errs = []
    for i, (f, b) in enumerate(zip(fw, reversed(bw))):
        f_b = dynet.concatenate([f, b])
        r_t = O * (dynet.tanh(H * f_b))
        err = dynet.pickneglogsoftmax_batch(r_t, tag_ids[i])
        errs.append(dynet.sum_batches(err))

    sum_errs = dynet.esum(errs)
    squared = -sum_errs  # * sum_errs
    losses = sum_errs.scalar_value()
    sum_errs.backward()
    self._sgd.update()
    return losses
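# --- Illustrative usage (not from the original source) ---
# Hypothetical call to update_batch above: a mini-batch of two sentences with
# their gold tags; the `tagger` object and its vocabularies are assumed to
# exist and to contain these words and tags.
words_batch = [["the", "dog", "barks"], ["a", "cat", "sleeps"]]
tags_batch = [["DET", "NOUN", "VERB"], ["DET", "NOUN", "VERB"]]
batch_loss = tagger.update_batch(words_batch, tags_batch)  # scalar loss after one update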
def build_tagging_graph(words):
    lm_wembs = []
    if HASLM:
        lm_wembs = calc_lm_embdding(words)
    dy.renew_cg()

    H = dy.parameter(pH)
    O = dy.parameter(pO)

    f_init = fwdRNN.initial_state()
    b_init = bwdRNN.initial_state()
    cf_init = cFwdRNN.initial_state()
    cb_init = cBwdRNN.initial_state()

    wembs = [word_rep(w, cf_init, cb_init) for w in words]
    if HASLM:
        wembs1 = []
        for lmw, w in zip(lm_wembs, wembs):
            wv = w.value()
            wv.extend(lmw)
            wembs1.append(wv)
        wembs = [dy.inputTensor(w) for w in wembs1]
    wembs = [dy.noise(we, 0.1) for we in wembs]

    fw_exps = f_init.transduce(wembs)
    bw_exps = b_init.transduce(reversed(wembs))

    bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]

    exps = []
    for x in bi_exps:
        r_t = O * (dy.tanh(H * x))
        exps.append(r_t)
    return exps
def build_tagging_graph(self, words, tags):
    dy.renew_cg()
    words_for_char = [w[1] for w in words]
    words = [w[0] for w in words]

    f_init, b_init = [b.initial_state() for b in self.word_first_layer]
    wembs = [self.Word_E[w] for w in words]
    wembs = [dy.noise(we, 0.1) for we in wembs]
    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]
    output_from_first_layer = [dy.concatenate([f, b]) for f, b in zip(fw, reversed(bw))]

    f_init, b_init = [b.initial_state() for b in self.word_second_layer]
    fw = [x.output() for x in f_init.add_inputs(output_from_first_layer)]
    bw = [x.output() for x in b_init.add_inputs(reversed(output_from_first_layer))]

    errs = []
    char_lstm_vectors = self.build_tagging_graph_for_chars(words_for_char)
    for f, b, chars_vec, t in zip(fw, reversed(bw), char_lstm_vectors, tags):
        f_b = dy.concatenate([f, b])
        con_cat = dy.concatenate([f_b, chars_vec])
        r_t = self.pO * con_cat
        err = dy.pickneglogsoftmax(r_t, t)
        errs.append(err)
    return dy.esum(errs)
# Picking values from vector expressions
e = dy.pick(e1, k)               # k is an unsigned integer, e1 is a vector. returns e1[k]
e = e1[k]                        # same
e = dy.pickrange(e1, k, v)       # like python's e1[k:v] for lists. e1 is an Expression, k, v are integers.
e = e1[k:v]                      # same
e = dy.pickneglogsoftmax(e1, k)  # k is an unsigned integer. equivalent to: pick(-log(dy.softmax(e1)), k)

# Neural net stuff
dy.noise(e1, stddev)  # add noise to each element, drawn from a Gaussian with standard deviation stddev
dy.dropout(e1, p)     # apply dropout with probability p

# functions over lists of expressions
e = dy.esum([e1, e2, ...])              # sum
e = dy.average([e1, e2, ...])           # average
e = dy.concatenate_cols([e1, e2, ...])  # e1, e2, ... are column vectors. returns a matrix. (similar to np.hstack([e1, e2, ...]))
e = dy.concatenate([e1, e2, ...])       # concatenate
e = dy.affine_transform([e0, e1, e2, ...])  # e = e0 + ((e1*e2) + (e3*e4) ...)

## Loss functions
e = dy.squared_distance(e1, e2)
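# --- Illustrative usage (not from the original source) ---
# Minimal sketch exercising the noise/dropout/pick operations listed above;
# dimensions and values are arbitrary.
import dynet as dy

dy.renew_cg()
e1 = dy.inputVector([1.0, 2.0, 3.0, 4.0])
noisy = dy.noise(e1, 0.1)      # per-element Gaussian noise, stddev 0.1
dropped = dy.dropout(e1, 0.5)  # elements zeroed with probability 0.5
third = dy.pick(e1, 2)         # e1[2]
total = dy.esum([noisy, dropped])
print(total.npvalue())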
def build_tagging_graph(words, tags, template, builders, train=True, k=1):
    dy.renew_cg()
    if train and args.lstm_dropout is not None and args.lstm_dropout > 0:
        for b in builders:
            b.set_dropouts(args.lstm_dropout, args.lstm_dropout)
    f_init, b_init = [b.initial_state() for b in builders]

    wembs = [dy.lookup(pEmbedding, w) for w in words]
    if train:
        # Add noise in training as a regularizer
        wembs = [dy.noise(we, args.train_noise) for we in wembs]

    fw_states = [x for x in f_init.add_inputs(wembs)]
    bw_states = [x for x in b_init.add_inputs(reversed(wembs))]
    fw = [x.output() for x in fw_states]
    bw = [x.output() for x in bw_states]

    O = dy.parameter(pOutput)
    if args.mlp:
        H = dy.parameter(pHidden)

    errs = []
    pred_tags = []
    sorted_arg_topk = []
    final_topk = []
    sequences_topk = [(0.0, list())]
    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        if args.mlp:
            f_b = dy.tanh(H * f_b)
        r_t = O * f_b
        if train:
            err = dy.pickneglogsoftmax(r_t, t)
            errs.append(err)
        else:
            out = dy.log_softmax(r_t)
            chosen = np.argmax(out.npvalue())
            pred_tags.append(vocab_tags.i2w[chosen])

            all_sequences = list()
            for seq in sequences_topk:
                seq_score, seq_list = seq
                _scores = -out.npvalue()
                arg_topk = np.argsort(_scores)[:k]
                score_topk = _scores[arg_topk]
                for i in range(min(k, len(arg_topk))):
                    _list = list(seq_list)
                    _list.append(vocab_tags.i2w[arg_topk[i]])
                    score = seq_score + score_topk[i]
                    all_sequences.append((score, _list))
            sequences_topk = sorted(all_sequences)[:k]

    O_template = dy.parameter(pOutputTemplate)
    H_template = dy.parameter(pHiddenTemplate)
    f_bt = dy.concatenate([fw_states[-1].s()[0], bw_states[-1].s()[0]])
    f_bt = dy.tanh(H_template * f_bt)
    r_tt = O_template * f_bt
    pred_template = None
    if train:
        err = dy.pickneglogsoftmax(r_tt, template)
        errs.append(err)
    else:
        out = dy.log_softmax(r_tt)
        _scores = -out.npvalue()
        chosen = np.argmin(_scores)
        pred_template = vocab_templates.i2w[chosen]
        sorted_arg_topk = np.argsort(_scores)[:k]

        all_sequences_and_templates = []
        for template_id in sorted_arg_topk:
            _score = _scores[template_id]
            _template = vocab_templates.i2w[template_id]
            for seq_score, seq_list in sequences_topk:
                all_sequences_and_templates.append((_score + seq_score, seq_list, _template))
        final_topk = sorted(all_sequences_and_templates)[:k]

    return pred_tags, pred_template, errs, final_topk
def Predict(self, conll_path):
    with open(conll_path, 'r') as conllFP:
        for iSentence, sentence in enumerate(read_conll(conllFP, self.c2i)):
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

            for entry in conll_sentence:
                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
                evec = self.elookup[int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)))] \
                    if self.external_embedding is not None else None

                last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                rev_last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1]

                # char_state = dynet.noise(concatenate([last_state, rev_last_state]), 0.2)
                # morph_logit = self.charSeqPredictor.predict_sequence(char_state)
                # morphID = self.morphs.get(entry.feats)
                # morphErrs.append(self.pick_neg_log(morph_logit, morphID))
                # morph_emb = None
                # for i in morph_logit:
                #     morph_emb += i * self.mlookup(i)

                entry.vec = concatenate(filter(None, [wordvec, evec, last_state, rev_last_state]))
                entry.ch_vec = concatenate([dynet.noise(fe, 0.2)
                                            for fe in filter(None, [last_state, rev_last_state])])

                entry.lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None
                entry.rheadfov = None
                entry.rmodfov = None

            if self.blstmFlag:
                morcat_layer = [entry.ch_vec for entry in conll_sentence]
                morph_logits = self.charSeqPredictor.predict_sequence(morcat_layer)
                predicted_morph_idx = [np.argmax(o.value()) for o in morph_logits]
                predicted_morphs = [self.id2morph[idx] for idx in predicted_morph_idx]

                for builder in self.pos_builder:
                    builder.disable_dropout()
                lstm_forward = self.pos_builder[0].initial_state()
                lstm_backward = self.pos_builder[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = lstm_forward.output()
                    rentry.lstms[0] = lstm_backward.output()

                pos_embed = []
                concat_layer = [concatenate(entry.lstms) for entry in conll_sentence]
                outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
                predicted_posIDs = [np.argmax(o.value()) for o in outputFFlayer]
                predicted_postags = [self.id2pos[idx] for idx in predicted_posIDs]
                for predID, pred in zip(predicted_posIDs, outputFFlayer):
                    if self.gold_pos:
                        pos_embed.append(self.plookup[predID])
                    else:
                        pos_embed.append(soft_embed(pred.value(), self.plookup))

                for entry in conll_sentence:
                    entry.vec = concatenate(entry.lstms)

                for builder in self.dep_builders:
                    builder.disable_dropout()
                blstm_forward = self.dep_builders[0].initial_state()
                blstm_backward = self.dep_builders[1].initial_state()
                for entry, rentry, pembed, revpembed in zip(conll_sentence, reversed(conll_sentence),
                                                            pos_embed, reversed(pos_embed)):
                    blstm_forward = blstm_forward.add_input(concatenate([entry.vec, pembed]))
                    blstm_backward = blstm_backward.add_input(concatenate([rentry.vec, revpembed]))
                    entry.lstms[1] = blstm_forward.output()
                    rentry.lstms[0] = blstm_backward.output()

            scores, exprs = self.__evaluate(conll_sentence, True)
            heads = decoder.parse_proj(scores)

            # Multiple roots: heading to the previous "rooted" one
            rootCount = 0
            rootWid = -1
            for index, head in enumerate(heads):
                if head == 0:
                    rootCount += 1
                    if rootCount == 1:
                        rootWid = index
                    if rootCount > 1:
                        heads[index] = rootWid
                        rootWid = index

            for entry, head, pos, feats in zip(conll_sentence, heads, predicted_postags, predicted_morphs):
                entry.pred_parent_id = head
                entry.pred_relation = '_'
                entry.pred_pos = pos
                entry.pred_feats = feats

            dump = False

            if self.labelsFlag:
                for modifier, head in enumerate(heads[1:]):
                    scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier + 1)
                    conll_sentence[modifier + 1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]

            renew_cg()

            if not dump:
                yield sentence
def __applyNoise(self, exp, train):
    if self.__noise is None or not train:
        return exp
    return dynet.noise(exp, self.__noise)
def Train(self, conll_path):
    errors = 0
    batch = 0
    eloss = 0.0
    mloss = 0.0
    eerrors = 0
    etotal = 0
    start = time.time()

    with open(conll_path, 'r') as conllFP:
        shuffledData = list(read_conll(conllFP, self.c2i))
        random.shuffle(shuffledData)

        errs = []
        lerrs = []
        posErrs = []
        eeloss = 0.0

        for iSentence, sentence in enumerate(shuffledData):
            if iSentence % 500 == 0 and iSentence != 0:
                print "Processing sentence number: %d" % iSentence, ", Loss: %.2f" % (eloss / etotal), ", Time: %.2f" % (time.time() - start)
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
                lerrors = 0
                ltotal = 0

            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

            for entry in conll_sentence:
                c = float(self.wordsCount.get(entry.norm, 0))
                dropFlag = (random.random() < (c / (0.25 + c)))
                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None
                evec = None
                if self.external_embedding is not None:
                    evec = self.elookup[self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0))
                                        if (dropFlag or (random.random() < 0.5)) else 0]
                # entry.vec = concatenate(filter(None, [wordvec, evec]))

                last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                rev_last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1]

                entry.vec = concatenate([dynet.noise(fe, 0.2)
                                         for fe in filter(None, [wordvec, evec, last_state, rev_last_state])])

                entry.lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None
                entry.rheadfov = None
                entry.rmodfov = None

            if self.blstmFlag:
                lstm_forward = self.builders[0].initial_state()
                lstm_backward = self.builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = lstm_forward.output()
                    rentry.lstms[0] = lstm_backward.output()

                if self.bibiFlag:
                    for entry in conll_sentence:
                        entry.vec = concatenate(entry.lstms)
                    blstm_forward = self.bbuilders[0].initial_state()
                    blstm_backward = self.bbuilders[1].initial_state()
                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        blstm_forward = blstm_forward.add_input(entry.vec)
                        blstm_backward = blstm_backward.add_input(rentry.vec)
                        entry.lstms[1] = blstm_forward.output()
                        rentry.lstms[0] = blstm_backward.output()

            scores, exprs = self.__evaluate(conll_sentence, True)
            gold = [entry.parent_id for entry in conll_sentence]
            heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)

            if self.labelsFlag:
                for modifier, head in enumerate(gold[1:]):
                    rscores, rexprs = self.__evaluateLabel(conll_sentence, head, modifier + 1)
                    goldLabelInd = self.rels[conll_sentence[modifier + 1].relation]
                    wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd),
                                        key=itemgetter(1))[0]
                    if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                        lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])

            e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
            eerrors += e
            if e > 0:
                loss = [(exprs[h][i] - exprs[g][i]) for i, (h, g) in enumerate(zip(heads, gold)) if h != g]  # * (1.0/float(e))
                eloss += (e)
                mloss += (e)
                errs.extend(loss)
            etotal += len(conll_sentence)

            concat_layer = [concatenate(entry.lstms) for entry in conll_sentence]
            concat_layer = [dynet.noise(fe, 0.2) for fe in concat_layer]
            outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
            posIDs = [self.pos.get(entry.pos) for entry in conll_sentence]
            for pred, gold in zip(outputFFlayer, posIDs):
                posErrs.append(self.pick_neg_log(pred, gold))

            if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0 or len(posErrs) > 0:
                eeloss = 0.0
                if len(errs) > 0 or len(lerrs) > 0 or len(posErrs) > 0:
                    eerrs = (esum(errs + lerrs + posErrs))  # * (1.0/(float(len(errs))))
                    eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []
                    posErrs = []
                renew_cg()

        if len(errs) > 0:
            eerrs = (esum(errs + lerrs + posErrs))  # * (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()
            errs = []
            lerrs = []
            posErrs = []
            eeloss = 0.0
            renew_cg()

    self.trainer.update()
    print "Loss: %.2f" % (mloss / iSentence)
def predict(self, features, task_name, train=False):
    """
    Steps through the computation graph and obtains predictions for the
    provided input features.
    :param features: a list of word embeddings for every word in the sequence
    :param task_name: the name of the task that should be predicted
    :param train: if the model is training; apply noise in this case
    :return output: the output predictions
            penalty: the summed subspace penalty (0 if no constraint)
    """
    if train:
        # noise is added only at training time
        features = [dynet.noise(fe, self.noise_sigma) for fe in features]

    # only if we use cross-stitch do we have a layer for each task;
    # otherwise we just have one layer for all tasks
    num_layers = self.h_layers
    inputs = [features] * len(self.task_names)
    inputs_rev = [features] * len(self.task_names)
    target_task_id = self.task_names.index(task_name) if self.cross_stitch else 0

    # collect the forward and backward sequences for each task at every
    # layer for the layer connection units
    layer_forward_sequences = []
    layer_backward_sequences = []
    penalty = dynet.const_parameter(self.subspace_penalty)

    for i in range(0, num_layers):
        forward_sequences = []
        backward_sequences = []
        for j in range(num_task_layers):
            predictor = self.predictors['inner'][i][j]
            forward_sequence, backward_sequence = predictor.predict_sequence(inputs[j], inputs_rev[j])
            if i > 0 and self.activation:
                # activation between LSTM layers
                forward_sequence = [self.activation(s) for s in forward_sequence]
                backward_sequence = [self.activation(s) for s in backward_sequence]
            forward_sequences.append(forward_sequence)
            backward_sequences.append(backward_sequence)

            if self.num_subspaces == 2 and self.constraint_weight != 0:
                # returns a list per layer, i.e. here a list with one item
                lstm_parameters = predictor.builder.get_parameter_expressions()[0]
                # lstm parameters consist of these weights:
                # Wix, Wih, Wic, bi, Wox, Woh, Woc, bo, Wcx, Wch, bc
                for param_idx in range(len(lstm_parameters)):
                    if param_idx in self.constrain_matrices:
                        W = lstm_parameters[param_idx]
                        W_shape = np.array(W.value()).shape
                        if len(W_shape) < 2:
                            W_shape = [W_shape[0], 1]
                        # split matrix into its two subspaces
                        W_subspaces = dynet.reshape(
                            W, (self.num_subspaces,
                                W_shape[0] / float(self.num_subspaces),
                                W_shape[1]))
                        subspace_1, subspace_2 = W_subspaces[0], W_subspaces[1]
                        # calculate the matrix product of the two matrices
                        matrix_product = dynet.transpose(subspace_1) * subspace_2
                        # take the squared Frobenius norm by squaring
                        # every element and then summing them
                        squared_frobenius_norm = dynet.sum_elems(dynet.square(matrix_product))
                        penalty += squared_frobenius_norm

        if self.cross_stitch:
            # takes as input a list of input lists and produces a list of
            # outputs where the index indicates the task
            forward_sequences = self.predictors['cross_stitch'][i].stitch(forward_sequences)
            backward_sequences = self.predictors['cross_stitch'][i].stitch(backward_sequences)

        inputs = forward_sequences
        inputs_rev = backward_sequences
        layer_forward_sequences.append(forward_sequences)
        layer_backward_sequences.append(backward_sequences)

        if i == num_layers - 1:
            output_predictor = self.predictors['output_layers_dict'][task_name]

            # get the forward/backward states of all task layers
            task_forward_sequences = [layer_seq_list[target_task_id][-1]
                                      for layer_seq_list in layer_forward_sequences]
            task_backward_sequences = [layer_seq_list[target_task_id][0]
                                       for layer_seq_list in layer_backward_sequences]

            if num_layers > 1:
                forward_input = self.predictors['layer_stitch'][target_task_id].stitch(task_forward_sequences)
                backward_input = self.predictors['layer_stitch'][target_task_id].stitch(task_backward_sequences)
            else:
                forward_input = task_forward_sequences[0]
                backward_input = task_backward_sequences[0]

            concat_layer = dynet.concatenate([forward_input, backward_input])
            if train and self.noise_sigma > 0.0:
                concat_layer = dynet.noise(concat_layer, self.noise_sigma)

            output = []
            if 'sentiment' in task_name:
                # Multi-label
                for i in range(len(output_predictor)):
                    output.append(output_predictor[i](concat_layer))
            else:
                output.append(output_predictor(concat_layer))
            # output = output_predictor.predict_sequence(concat_layer)
            return output, penalty

    raise Exception('Error: This place should not be reached.')
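# --- Illustrative breakdown (not from the original source) ---
# The subspace orthogonality penalty used above, in isolation: split a weight
# matrix into two row-blocks and penalize the squared Frobenius norm of their
# product. Dimensions are arbitrary and chosen only for this sketch.
import dynet as dy

pc = dy.ParameterCollection()
W = pc.add_parameters((8, 4))  # 8 rows = 2 subspaces of 4 rows each

dy.renew_cg()
W_expr = dy.parameter(W)
W_subspaces = dy.reshape(W_expr, (2, 4, 4))
subspace_1, subspace_2 = W_subspaces[0], W_subspaces[1]
matrix_product = dy.transpose(subspace_1) * subspace_2
penalty = dy.sum_elems(dy.square(matrix_product))  # squared Frobenius norm
print(penalty.value())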