def generate(self, h_a, trg, maxlen=100):
    # pad the encoder states up to max_len
    h_a += [dy.zeros(self.hdim)] * (self.max_len - len(h_a))
    h_ak = dy.concatenate(h_a, 1)
    pre_attend = dy.parameter(self.pre_attend)
    context = h_ak * pre_attend
    prev_out = dy.zeros((self.hdim))
    outputs = []
    s = self.decoder_rnn.initial_state()
    for i in range(maxlen):
        attender = dy.parameter(self.attender)
        V = dy.parameter(self.v)
        tmp = dy.tanh(dy.colwise_add(context, V * prev_out))
        U = dy.parameter(self.u)
        attention_weights = dy.softmax(dy.transpose(U * tmp))
        emb = dy.concatenate([h_ak * attention_weights, prev_out])
        s = s.add_input(emb)
        prev_out = s.output()
        pre2 = dy.parameter(self.pred)
        scores = pre2 * prev_out
        act_value = np.argmax(scores.value())  # greedy choice of the next token id
        outputs.append(act_value)
        if act_value == 1:  # stop token id
            return outputs
    return outputs

def next_action(self, state, src_len, enc_len):
    if self.policy_learning is None:
        if state.has_been_read < src_len:
            return self.Action.READ
        else:
            return self.Action.WRITE
    else:
        # Sanity Check here:
        force_action = [self.Action.READ.value] if enc_len == 0 else None  # No writing at the beginning.
        force_action = [self.Action.WRITE.value] if enc_len == src_len else force_action  # No reading at the end.
        # Compose inputs from 3 states
        encoder_state = state.encoder_state.output()
        enc_dim = encoder_state.dim()
        context_state = state.context_state.as_vector() if state.context_state else dy.zeros(*enc_dim)
        output_embed = state.output_embed if state.output_embed else dy.zeros(*enc_dim)
        input_state = dy.concatenate([encoder_state, context_state, output_embed])
        # Sample / Calculate a single action
        action = self.policy_learning.sample_action(input_state,
                                                    predefined_actions=force_action,
                                                    argmax=not self.train)[0]
        return self.Action(action)

def extract_features(stack: List[int], buffer: List[int], bilstm_repr):
    """
    For 3 stack tokens and 1 buffer token, return the concatenation
    [stack_n-2, stack_n-1, stack_n, buffer_0].
    If the stack has only 2 elements (or fewer), substitute zero vectors:
    [zero-vector, stack_n-1, stack_n, buffer_0].
    """
    # Get the positions of the tokens in the sentence.
    stack_token_positions = stack[-NO_STACK_FEATURES:]
    buffer_tokens_positions = buffer[:NO_BUFFER_FEATURES]
    # Get the bilstm representations.
    stack_bilstms = [bilstm_repr[i] for i in stack_token_positions]
    buffer_bilstms = [bilstm_repr[i] for i in buffer_tokens_positions]
    # Add zero-valued vectors if there are not enough features.
    no_missing_stack_features = NO_STACK_FEATURES - len(stack_bilstms)
    stack_bilstms = [dy.zeros(BILSTM_STATE_SIZE)] * no_missing_stack_features + stack_bilstms
    no_missing_buffer_features = NO_BUFFER_FEATURES - len(buffer)
    buffer_bilstms += [dy.zeros(BILSTM_STATE_SIZE)] * no_missing_buffer_features
    # Put stack & buffer features together in a list.
    features_list = stack_bilstms + buffer_bilstms
    # Concatenate the feature vectors.
    features = dy.concatenate(features_list)
    return features

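# A minimal usage sketch for extract_features. The constants and data here are
# hypothetical: the real module defines NO_STACK_FEATURES, NO_BUFFER_FEATURES and
# BILSTM_STATE_SIZE elsewhere; 3 stack features and 1 buffer feature simply match
# the docstring, and BILSTM_STATE_SIZE = 4 is an arbitrary toy size.
import dynet as dy

NO_STACK_FEATURES, NO_BUFFER_FEATURES, BILSTM_STATE_SIZE = 3, 1, 4

dy.renew_cg()
bilstm_repr = [dy.random_normal((BILSTM_STATE_SIZE,)) for _ in range(5)]  # one vector per token
features = extract_features(stack=[0, 1], buffer=[2, 3, 4], bilstm_repr=bilstm_repr)
# The stack holds only 2 tokens, so one zero vector is prepended;
# the result should have dim ((16,), 1), i.e. (3 + 1) * BILSTM_STATE_SIZE.
print(features.dim())
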
def _vaswani_model_init(e):
    w_embs = [w2e[idx] for idx in e["tk_words"]]
    if cfg["use_postags"]:
        pos_embs = [pos2e[idx] for idx in e["tk_postags"]]
        i_embs = [dy.concatenate([w_embs[i], pos_embs[i]])
                  for i in xrange(len(e["tk_words"]))]
    else:
        i_embs = w_embs
    f_init = fwd.initial_state()
    b_init = bwd.initial_state()
    lm_init = lm.initial_state()
    f_hs = dy.concatenate_cols(f_init.transduce(i_embs))
    b_hs = dy.concatenate_cols(b_init.transduce(reversed(i_embs))[::-1])
    out_c1 = dy.rectify(c1_Wf * f_hs + c1_Wb * b_hs)
    aux_c2 = c2_Wc * out_c1
    m = {
        "aux_c2": aux_c2,
        "beam_lm_states": [lm_init],
        "beam_lm_hs": dy.zeros((cfg["lm_h_dim"], 1)),
        "idx": 0,
    }
    if cfg["accumulate_scores"]:
        m["acc_scores"] = dy.zeros((1, 1))
    return m

def _initialize_discourse_states(self):
    discourse_state = self.initial_discourse_state
    discourse_lstm_states = [lstm.initial_state([dy.zeros((lstm.spec[2],)),
                                                 dy.zeros((lstm.spec[2],))])
                             for lstm in self.discourse_lstms]
    return discourse_state, discourse_lstm_states

def helper():
    label_scores = []
    for x in lstm_outputs[1:-1]:
        label_score = self.f_label(x)
        label_scores.append(label_score)
    if use_crf:
        tags, viterbi_scores = viterbi_decoding(label_scores, gold)
        if is_train and tags != gold:
            gold_scores = forced_decoding(label_scores, gold)
            total_loss = viterbi_scores - gold_scores
        else:
            total_loss = dy.zeros(1)
    else:
        total_loss = dy.zeros(1)
        tags = []
        if is_train:
            losses = []
            for label_score, tag in zip(label_scores, gold):
                tag_index = self.tag_vocab.index(tag)
                loss = dy.pickneglogsoftmax(label_score, tag_index)
                losses.append(loss)
            total_loss = dy.esum(losses)
        else:
            label_scores = [dy.softmax(ls) for ls in label_scores]
            probs = [ls.npvalue() for ls in label_scores]
            for prob in probs:
                tag_index = np.argmax(prob)
                tag = self.tag_vocab.value(tag_index)
                tags.append(tag)
    return tags, total_loss

def initial_state(self):
    _init_h = dy.zeros((self.hidden_dim,))
    _init_m = dy.zeros((self.hidden_dim,))
    self.Wi = self.W_i.expr()
    self.Wf = self.W_f.expr()
    self.Wo = self.W_o.expr()
    self.Wu = self.W_u.expr()
    _init_s = LSTMState(self, -1, hidden=_init_h, memory=_init_m)
    return _init_s

def transduce(self, input, hx=None, cx=None):
    hx = hx if hx is not None else dy.zeros((self.n_hidden))
    cx = cx if cx is not None else dy.zeros((self.n_hidden))
    output = []
    cells = []
    for x in input:
        hx, cx = self.step(x, hx, cx)
        output.append(hx)
        cells.append(cx)
    return output, cells

def gru_step(self, word, h_prev):
    if h_prev is None:
        h_prev = dy.zeros(HIDDEN_SIZE)
    if word not in self.vocabs:
        x = dy.zeros(EMBEDDING_DIMENSION)
    else:
        x = dy.lookup(self.embedding, self.vocabs[word])
    r = dy.logistic(self.w_xr * x + self.w_hr * h_prev + self.b_r)
    z = dy.logistic(self.w_xz * x + self.w_hz * h_prev + self.b_z)
    c_h = dy.tanh(self.w_xh * x + self.w_hh * dy.cmult(r, h_prev) + self.b_h)
    return dy.cmult(1 - z, h_prev) + dy.cmult(z, c_h)

def evaluate(self, input_sentences, labels):
    dy.renew_cg()
    self.word_rnn.disable_dropout()
    self.sent_rnn.disable_dropout()
    embed_sents = []
    for input_sentence in input_sentences:
        input_sentence = self._preprocess_input(input_sentence, self.word_to_ix)
        #input_sentence = [self.word_to_ix['<start>']] + input_sentence + [self.word_to_ix['<end>']]
        embed_words = self._embed_sentence(input_sentence)
        word_rnn_outputs = self._run_rnn(self.word_rnn, embed_words)
        sent_embed = dy.average(word_rnn_outputs)
        embed_sents.append(sent_embed)
    rnn_outputs = self._run_rnn(self.sent_rnn, embed_sents)
    doc_output_w = dy.parameter(self.doc_output_w)
    doc_output_b = dy.parameter(self.doc_output_b)
    doc_output = dy.tanh(doc_output_w * dy.average(rnn_outputs) + doc_output_b)
    probs = []
    sum_output = dy.zeros(self.args.sent_hidden_dim)
    pred_labels = []
    correct = 0
    total = 0
    loss = dy.zeros(1)
    for i, rnn_output in enumerate(rnn_outputs):
        abspos_embed = dy.lookup(self.abspos_embeddings, self.abspos_ix[i])
        relpos_embed = dy.lookup(self.relpos_embeddings, self.relpos_ix[i])
        prob = self._get_probs(rnn_output, doc_output, sum_output, abspos_embed, relpos_embed)
        sum_output += dy.cmult(prob, rnn_output)
        pred_label = self._predict(prob)
        pred_labels.append(pred_label)
        if pred_label == labels[i]:
            correct += 1
        total += 1
        if labels[i] == 1:
            loss -= dy.log(prob)
        else:
            loss -= dy.log(dy.scalarInput(1) - prob)
    return loss.value(), pred_labels, correct, total

def embed(self, x):
    if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
        batch_size = x.batch_size() if xnmt.batcher.is_batched(x) else 1
        self.word_id_mask = [set(np.random.choice(self.vocab_size,
                                                  int(self.vocab_size * self.word_dropout),
                                                  replace=False))
                             for _ in range(batch_size)]
    # single mode
    if not xnmt.batcher.is_batched(x):
        if self.train and self.word_id_mask and x in self.word_id_mask[0]:
            ret = dy.zeros((self.emb_dim,))
        else:
            ret = self.embeddings[x]
            if self.fix_norm is not None:
                ret = dy.cdiv(ret, dy.l2_norm(ret))
                if self.fix_norm != 1:
                    ret *= self.fix_norm
    # minibatch mode
    else:
        ret = self.embeddings.batch(x)
        if self.fix_norm is not None:
            ret = dy.cdiv(ret, dy.l2_norm(ret))
            if self.fix_norm != 1:
                ret *= self.fix_norm
        if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(x.batch_size())):
            dropout_mask = dy.inputTensor(
                np.transpose([[0.0] * self.emb_dim if x[i] in self.word_id_mask[i] else [1.0] * self.emb_dim
                              for i in range(x.batch_size())]),
                batched=True)
            ret = dy.cmult(ret, dropout_mask)
    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret

def embed(self, x: Union[batchers.Batch, numbers.Integral]) -> dy.Expression:
    if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
        batch_size = x.batch_size() if batchers.is_batched(x) else 1
        self.word_id_mask = [set(np.random.choice(self.vocab_size,
                                                  int(self.vocab_size * self.word_dropout),
                                                  replace=False))
                             for _ in range(batch_size)]
    emb_e = dy.parameter(self.embeddings)
    # single mode
    if not batchers.is_batched(x):
        if self.train and self.word_id_mask and x in self.word_id_mask[0]:
            ret = dy.zeros((self.emb_dim,))
        else:
            ret = dy.pick(emb_e, index=x)
            if self.fix_norm is not None:
                ret = dy.cdiv(ret, dy.l2_norm(ret))
                if self.fix_norm != 1:
                    ret *= self.fix_norm
    # minibatch mode
    else:
        ret = dy.pick_batch(emb_e, x)
        if self.fix_norm is not None:
            ret = dy.cdiv(ret, dy.l2_norm(ret))
            if self.fix_norm != 1:
                ret *= self.fix_norm
        if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(x.batch_size())):
            dropout_mask = dy.inputTensor(
                np.transpose([[0.0] * self.emb_dim if x[i] in self.word_id_mask[i] else [1.0] * self.emb_dim
                              for i in range(x.batch_size())]),
                batched=True)
            ret = dy.cmult(ret, dropout_mask)
    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret

def _next_action(self, state, src_len, force_action=None) -> PolicyAction:
    # Sanity Check here:
    if force_action is None:
        force_action = self.Action.READ.value if state.has_been_read == 0 else force_action  # No writing at the beginning.
        force_action = self.Action.WRITE.value if state.has_been_read == src_len else force_action  # No reading at the end.
        if self.read_before_write:
            force_action = self.Action.READ.value if state.has_been_read < src_len else self.Action.WRITE.value
    # Compose inputs from 3 states
    if self.policy_network is not None:
        enc_dim = self.src_encoding[0].dim()
        encoder_state = state.encoder_state if state.encoder_state is not None else dy.zeros(*enc_dim)
        decoder_state = state.decoder_state.as_vector() if state.decoder_state is not None else dy.zeros(*enc_dim)
        policy_input = dy.nobackprop(dy.concatenate([encoder_state, decoder_state]))
        predefined_action = [force_action] if force_action is not None else None
        # Sample / Calculate a single action
        policy_action = self.policy_network.sample_action(policy_input,
                                                          predefined_actions=predefined_action,
                                                          argmax=not (self.train and self.policy_sample))
        policy_action.single_action()
    else:
        policy_action = PolicyAction(force_action)
    # TODO(philip30): Update this value when you add more actions
    if policy_action.content > 2:
        import random
        policy_action.content = random.randint(0, 1)
    return policy_action

def beam_decode(self, encodings, input_len=10, beam_size=1):
    batch_size = 1
    self.__dec.init_params(encodings, batch_size, self.__train_flag)
    context = dy.zeros((self.__enc.output_dim,))
    beams = [Beam(self.__dec.dec_state, context, [self.__trg_sos], 0.0)]
    for i in xrange(int(min(self.__max_len, input_len * 1.5))):
        new_beams = []
        p_list = []
        for b in beams:
            if b.words[-1] == self.__trg_eos:
                p_list.append(dy.ones((self.__trg_vsize,)))
                continue
            hidden, embs, b.state = self.__dec.next([b.words[-1]], b.context, self.__train_flag, b.state)
            b.context, _ = self.attend(encodings, hidden)
            score = self.__dec.score(hidden, b.context, embs, self.__train_flag)
            p_list.append(dy.softmax(score))
        p_list = dy.concatenate_to_batch(p_list).npvalue().T.reshape(-1, self.__trg_vsize)
        for p, b in zip(p_list, beams):
            p = p.flatten() / p.sum()
            kbest = np.argsort(p)
            if b.words[-1] == self.__trg_eos:
                new_beams.append(Beam(b.state, b.context, b.words, b.log_prob))
            else:
                for next_word in kbest[-beam_size:]:
                    new_beams.append(Beam(b.state, b.context,
                                          b.words + [next_word],
                                          b.log_prob + np.log(p[next_word])))
        beams = sorted(new_beams, key=lambda b: b.log_prob)[-beam_size:]
        if beams[-1].words[-1] == self.__trg_eos:
            break
    return beams[-1].words

def sentence_block_embed(self, embed, x, mask):
    batch, length = x.shape
    x_mask = mask.reshape((batch * length,))
    _, units = embed.shape()  # According to updated Dynet
    e = dy.concatenate_cols([dy.zeros(units) if x_mask[j] == 1 else dy.lookup(embed, id_)
                             for j, id_ in enumerate(x.reshape((batch * length,)))])
    e = dy.reshape(e, (units, length), batch_size=batch)
    return e

def adapt(s2s, trainer, X, Y, n_epochs, check_train_error_every):
    timer = utils.Timer()
    log = utils.Logger(True)
    n_train = len(X)
    n_tokens = sum(map(len, Y)) - len(Y)
    s2s.set_train_mode()
    s2s.reset_usr_vec()
    # Train for n_epochs
    for epoch in range(n_epochs):
        dy.renew_cg()
        loss = dy.zeros((1,))
        timer.restart()
        # Add losses for all samples
        for x, y in zip(X, Y):
            loss += s2s.calculate_user_loss([x], [y])
        # Backward + update
        loss.backward()
        trainer.update()
        # Record metrics
        if n_train > 0 and epoch % check_train_error_every == 0:
            train_loss = loss.value() / n_tokens
            train_ppl = np.exp(train_loss)
            trainer.status()
            elapsed = timer.tick()
            log.info(" Training_loss=%f, ppl=%f, time=%f s, tok/s=%.1f" %
                     (train_loss, train_ppl, elapsed, n_tokens / elapsed))

def expr_for_tree(self, xt, tree, node, is_train):
    if is_train:
        # in the training phase, perform dropout
        W_dropout = dy.dropout(self.WP, self.dropout_rate)
        WR_dropout = dy.dropout(self.WR, self.dropout_rate)
        WC_dropout = dy.dropout(self.WC, self.dropout_rate)
    else:
        W_dropout = self.WP
        WR_dropout = self.WR
        WC_dropout = self.WC
    if node is None or node.is_leaf():
        Wx = W_dropout * xt
        # h = dy.tanh(Wx + self.bc)
        h = dy.tanh(dy.affine_transform([self.bc, self.WC, xt]))
        return h
    # get child nodes
    children = tree.children(node.identifier)
    children_sum = dy.zeros((self.n_out))
    for i in range(len(children)):
        hc = self.expr_for_tree(xt=xt, tree=tree, node=children[i], is_train=is_train)
        rt = dy.logistic(self.WR * xt + self.UR * hc + self.br)
        children_sum = children_sum + dy.cmult(rt, hc)
    Wx = W_dropout * xt
    h = dy.tanh(Wx + self.bp + self.UP * children_sum)
    return h

def _policy_shape_probs(self, prob_dist):
    # TODO: this is specific to Alchemy
    num_actions = len(self.output_action_vocabulary) - 1
    num_locations = len(self.output_location_vocabulary) - 1
    num_arguments = len(self.output_argument_vocabulary) - 1
    new_probdist = dy.zeros(prob_dist.dim()[0])
    zeroes = numpy.zeros(num_locations * num_arguments)
    ones = numpy.ones(num_locations * num_arguments)
    eos_prob = prob_dist[self._all_output_vocabulary.lookup_index((EOS, NO_ARG, NO_ARG))]
    action_idx = 0
    for action in self.output_action_vocabulary:
        masks = numpy.concatenate((numpy.repeat(zeroes, action_idx),
                                   ones,
                                   numpy.repeat(zeroes, num_actions - action_idx - 1)))
        actions_masks = dy.reshape(dy.inputTensor(masks),
                                   (num_actions * num_locations * num_arguments, 1))
        if action == EOS:
            new_probdist += dy.cmult(actions_masks, prob_dist) / 2.
        elif action == "push":
            new_probdist += dy.cmult(actions_masks, prob_dist) + eos_prob / (2. * 56.)
        elif action == "pop":
            new_probdist += dy.cmult(actions_masks, prob_dist)
    if self.args.syntax_restricted:
        return dy.exp(dy.log_softmax(dy.cmult(new_probdist, prob_dist),
                                     restrict=self._valid_action_indices))
    else:
        return dy.softmax(dy.cmult(new_probdist, prob_dist))

def hinge_loss(exprs, target, margin=1.0):
    scores = exprs.value()
    best_wrong = max([(i, sc) for i, sc in enumerate(scores) if i != target],
                     key=lambda x: x[1])[0]
    if scores[target] < scores[best_wrong] + margin:
        return exprs[best_wrong] - exprs[target] + margin
    else:
        return dy.zeros(1)

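# A quick hedged sketch of calling hinge_loss on a toy score vector; the scores
# and target are made up for illustration, and dynet is assumed to be imported as dy.
import dynet as dy

dy.renew_cg()
scores = dy.inputVector([0.2, 1.5, -0.3])  # hypothetical per-class scores
loss = hinge_loss(scores, target=1, margin=1.0)
# Class 1 beats the best wrong class (0.2) by more than the margin,
# so the function returns the zero expression dy.zeros(1).
print(loss.value())
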
def generate(self, context, trg, decorate=False, maxpossible=100):
    # greedy generation!
    prev_out = dy.zeros((self.hdim))
    outputs = []
    for i in range(maxpossible):
        emb = dy.concatenate([context, prev_out])
        Ui, Uo, Uu = [dy.parameter(u) for u in self.US]
        Uf1 = dy.parameter(self.UFS[0])
        bi, bo, bu, bf = [dy.parameter(b) for b in self.BS]
        i = dy.logistic(bi + Ui * emb)
        o = dy.logistic(bo + Uo * emb)
        f = dy.logistic(bf + Uf1 * emb)
        u = dy.tanh(bu + Uu * emb)
        c = dy.cmult(i, u) + dy.cmult(f, prev_out)
        h = dy.cmult(o, dy.tanh(c))
        if decorate:
            tree._e = h  # NOTE: `tree` must come from the enclosing scope
        prev_out = c
        pre2 = dy.parameter(self.pred)
        out = dy.log_softmax(pre2 * h)
        out = np.argmax(out.npvalue())  # argmax needs the numeric value, not the Expression
        outputs.append(out)
        if out == 1:  # stop token id
            print(outputs)
            print("-----")
            print(trg)
            return outputs
    print(outputs)
    print("---")
    print(trg)
    return outputs

def __call__(self, inputs, is_train=True):
    """
    :param inputs: input word embeddings
    :param is_train: train flag, used for dropout
    :return:
    """
    seq_len = len(inputs)
    h = dy.zeros((self.n_out,))
    c = dy.zeros((self.n_out,))
    H = []
    for t in range(seq_len):
        xt = inputs[t]
        h, c = self.recurrence(xt, h, c, train_flag=is_train)
        H.append(h)
    return H

def span_parser(self, sentence, is_train, elmo_embeddings, cur_word_index, gold=None):
    if gold is not None:
        assert isinstance(gold, ParseNode)
    lstm_outputs = self._featurize_sentence(sentence, is_train=is_train,
                                            elmo_embeddings=elmo_embeddings,
                                            cur_word_index=cur_word_index)
    encodings = []
    span_to_index = {}
    for start in range(0, len(sentence)):
        for end in range(start + 1, len(sentence) + 1):
            span_to_index[(start, end)] = len(encodings)
            encodings.append(self._get_span_encoding(start, end, lstm_outputs))
    label_log_probabilities = self._encodings_to_label_log_probabilities(encodings)
    total_loss = dy.zeros(1)
    if is_train:
        for start in range(0, len(sentence)):
            for end in range(start + 1, len(sentence) + 1):
                gold_label = gold.oracle_label(start, end)
                gold_label_index = self.label_vocab.index(gold_label)
                index = span_to_index[(start, end)]
                total_loss -= label_log_probabilities[gold_label_index][index]
        return None, total_loss
    else:
        label_log_probabilities_np = label_log_probabilities.npvalue()
        tree, additional_info = optimal_parser(label_log_probabilities_np,
                                               span_to_index,
                                               sentence,
                                               self.empty_label_index,
                                               self.label_vocab,
                                               gold)
        return tree, additional_info, dy.exp(label_log_probabilities).npvalue()

def getWordEmbeddings(self, sentence, train, options, test_embeddings=defaultdict(lambda: {})):
    if self.elmo:
        # Get full text of sentence - excluding root, which is loaded differently
        # for transition and graph-based parsers.
        if options.graph_based:
            sentence_text = " ".join([entry.form for entry in sentence[1:]])
        else:
            sentence_text = " ".join([entry.form for entry in sentence[:-1]])
        elmo_sentence_representation = \
            self.elmo.get_sentence_representation(sentence_text)

    for i, root in enumerate(sentence):
        root.vecs = defaultdict(lambda: None)  # all vecs are None by default (possibly a little risky?)
        if options.word_emb_size > 0:
            if train:
                word_count = float(self.word_counts.get(root.norm, 0))
                dropFlag = random.random() > word_count / (0.25 + word_count)
                root.vecs["word"] = self.word_lookup[self.words.get(root.norm, 0) if not dropFlag else 0]
            else:
                # need to check in test_embeddings at prediction time
                if root.norm in self.words:
                    root.vecs["word"] = self.word_lookup[self.words[root.norm]]
                elif root.norm in test_embeddings["words"]:
                    root.vecs["word"] = dy.inputVector(test_embeddings["words"][root.norm])
                else:
                    root.vecs["word"] = self.word_lookup[0]
        if options.pos_emb_size > 0:
            root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
        if options.char_emb_size > 0:
            root.vecs["char"] = self.get_char_vector(root, train, test_embeddings["chars"])
        if options.tbank_emb_size > 0:
            if options.forced_tbank_emb:
                treebank_id = options.forced_tbank_emb
            elif root.proxy_tbank:
                treebank_id = root.proxy_tbank
            else:
                treebank_id = root.treebank_id
            # this is a bit of a hack for models trained on an old version of the code
            # that used treebank name rather than id as the lookup
            if not treebank_id in self.treebanks and treebank_id in utils.reverse_iso_dict and \
                    utils.reverse_iso_dict[treebank_id] in self.treebanks:
                treebank_id = utils.reverse_iso_dict[treebank_id]
            root.vecs["treebank"] = self.treebank_lookup[self.treebanks[treebank_id]]
        if self.elmo:
            if i < len(sentence) - 1:
                # Don't look up the 'root' word
                root.vecs["elmo"] = elmo_sentence_representation[i]
            else:
                # TODO
                root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim)

        root.vec = dy.concatenate(list(filter(None, [root.vecs["word"],
                                                     root.vecs["elmo"],
                                                     root.vecs["pos"],
                                                     root.vecs["char"],
                                                     root.vecs["treebank"]])))

    for bilstm in self.bilstms:
        bilstm.set_token_vecs(sentence, train)

def predict_sequence_batched(self, inputs, mask_array, wlen, predictFlag=False):
    batch_size = inputs[0].dim()[1]
    src_len = len(inputs)

    if not predictFlag:
        self.charlstm.set_dropouts(self.dropout, self.dropout)
        self.charlstm.set_dropout_masks(batch_size)

    char_fwd = self.charlstm.initial_state(batch_size)
    recur_states, cells = char_fwd.add_inputs(inputs, mask_array, predictFlag)

    hidden_states = []
    for idx in range(src_len):
        mask = dy.inputVector(mask_array[idx])
        mask_expr = dy.reshape(mask, (1,), batch_size)
        hidden_states.append(recur_states[idx] * mask_expr)
    H = dy.concatenate_cols(hidden_states)

    if predictFlag:
        a = dy.softmax(dy.transpose(self.W_atten.expr()) * H)
    else:
        # dropout attention connections (keep the same dim across the sequence)
        a = dy.softmax(dy.transpose(self.W_atten.expr()) * dy.dropout_dim(H, 1, self.dropout))

    cell_states = []
    for idx in range(batch_size):
        if wlen[idx] > 0:
            cell = dy.pick_batch_elem(cells[wlen[idx] - 1], idx)
        else:
            cell = dy.zeros(self.ldims)
        cell_states.append(cell)
    C = dy.concatenate_to_batch(cell_states)

    H_atten = H * dy.transpose(a)
    char_emb = dy.concatenate([H_atten, C])

    if predictFlag:
        proj_char_emb = dy.affine_transform([self.b_linear.expr(), self.W_linear.expr(), char_emb])
    else:
        proj_char_emb = dy.affine_transform([self.b_linear.expr(), self.W_linear.expr(),
                                             dy.dropout(char_emb, self.dropout)])
    return proj_char_emb

def __call__(self, input_expr):
    W1 = dy.parameter(self.W1)
    if self.bias:
        b1 = dy.parameter(self.b1)
    else:
        b1 = dy.zeros(self.output_dim)
    return dy.affine_transform([b1, W1, input_expr])

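# Hedged illustration of the affine pattern used above, with raw DyNet objects and
# hypothetical sizes (the surrounding class, its W1/b1 parameters and bias flag are
# assumed); it mimics the bias=False path, where dy.zeros stands in for the bias.
import dynet as dy

pc = dy.ParameterCollection()
W1 = pc.add_parameters((3, 5))          # output_dim x input_dim
dy.renew_cg()
x = dy.inputVector([1.0] * 5)
b1 = dy.zeros(3)                        # zero bias, as when self.bias is False
y = dy.affine_transform([b1, dy.parameter(W1), x])
print(y.dim())                          # ((3,), 1)
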
def push(self, input, idx):
    '''
    :param input:
    :param idx: word idx in buffer or action_id in vocab
    :return:
    '''
    if len(self.states) == 0:
        init_h, init_c = dy.zeros((self.hidden_size)), dy.zeros((self.hidden_size))
        hx, cx = self.cell.step(input, init_h, init_c)
    else:
        pre_hx, pre_cx = self.states[-1]
        hx, cx = self.cell.step(input, pre_hx, pre_cx)
    self.states.append((hx, cx))
    # self.states.append((self.linear(input), None))
    self.indices.append(idx)

def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    mask = es.mask
    sent_len = len(es)
    es_expr = es.as_transposed_tensor()
    batch_size = es_expr.dim()[1]

    es_chn = dy.reshape(es_expr, (sent_len, self.freq_dim, self.chn_dim), batch_size=batch_size)

    h_out = {}
    for direction in ["fwd", "bwd"]:
        # input convolutions
        gates_xt_bias = dy.conv2d_bias(es_chn,
                                       dy.parameter(self.params["x2all_" + direction]),
                                       dy.parameter(self.params["b_" + direction]),
                                       stride=(1, 1),
                                       is_valid=False)
        gates_xt_bias_list = [dy.pick_range(gates_xt_bias, i, i + 1) for i in range(sent_len)]

        h = []
        c = []
        for input_pos in range(sent_len):
            directional_pos = input_pos if direction == "fwd" else sent_len - input_pos - 1
            gates_t = gates_xt_bias_list[directional_pos]
            if input_pos > 0:
                # recurrent convolutions
                gates_h_t = dy.conv2d(h[-1],
                                      dy.parameter(self.params["h2all_" + direction]),
                                      stride=(1, 1),
                                      is_valid=False)
                gates_t += gates_h_t

            # standard LSTM logic
            if len(c) == 0:
                c_tm1 = dy.zeros((self.freq_dim * self.num_filters,), batch_size=batch_size)
            else:
                c_tm1 = c[-1]
            gates_t_reshaped = dy.reshape(gates_t, (4 * self.freq_dim * self.num_filters,),
                                          batch_size=batch_size)
            c_t = dy.reshape(dy.vanilla_lstm_c(c_tm1, gates_t_reshaped),
                             (self.freq_dim * self.num_filters,), batch_size=batch_size)
            h_t = dy.vanilla_lstm_h(c_t, gates_t_reshaped)
            h_t = dy.reshape(h_t, (1, self.freq_dim, self.num_filters,), batch_size=batch_size)

            if mask is None or np.isclose(np.sum(mask.np_arr[:, input_pos:input_pos + 1]), 0.0):
                c.append(c_t)
                h.append(h_t)
            else:
                c.append(mask.cmult_by_timestep_expr(c_t, input_pos, True)
                         + mask.cmult_by_timestep_expr(c[-1], input_pos, False))
                h.append(mask.cmult_by_timestep_expr(h_t, input_pos, True)
                         + mask.cmult_by_timestep_expr(h[-1], input_pos, False))

        h_out[direction] = h

    ret_expr = []
    for state_i in range(len(h_out["fwd"])):
        state_fwd = h_out["fwd"][state_i]
        state_bwd = h_out["bwd"][-1 - state_i]
        output_dim = (state_fwd.dim()[0][1] * state_fwd.dim()[0][2],)
        fwd_reshape = dy.reshape(state_fwd, output_dim, batch_size=batch_size)
        bwd_reshape = dy.reshape(state_bwd, output_dim, batch_size=batch_size)
        ret_expr.append(dy.concatenate([fwd_reshape, bwd_reshape], d=0 if self.reshape_output else 2))
    return expression_seqs.ExpressionSequence(expr_list=ret_expr, mask=mask)
    # TODO: implement get_final_states()

def train_on_partial_annotation(self, sentence, annotations, elmo_vecs, cur_word_index):
    if len(annotations) == 0:
        return dy.zeros(1)
    lstm_outputs = self._featurize_sentence(sentence, is_train=True,
                                            elmo_embeddings=elmo_vecs,
                                            cur_word_index=cur_word_index)
    encodings = []
    for annotation in annotations:
        assert 0 <= annotation.left < annotation.right <= len(sentence), \
            (0, annotation.left, annotation.right, len(sentence))
        encoding = self._get_span_encoding(annotation.left, annotation.right, lstm_outputs)
        encodings.append(encoding)
    label_log_probabilities = self._encodings_to_label_log_probabilities(encodings)
    total_loss = dy.zeros(1)
    for index, annotation in reversed(list(enumerate(annotations))):
        loss = -label_log_probabilities[annotation.oracle_label_index][index]
        total_loss = total_loss + loss
    return total_loss

def __call__(self, encoder_output, hsz, beam_width=1):
    h_i = self.get_state(encoder_output)
    context = encoder_output.output
    if beam_width > 1:
        # To vectorize, we need to expand along the batch dimension, K times
        context = [dy.concatenate_to_batch([c] * beam_width) for c in context]
        h_i = [dy.concatenate_to_batch([h] * beam_width) for h in h_i]
    _, batchsz = context[0].dim()
    init_zeros = dy.zeros((hsz,), batch_size=batchsz)
    return h_i, init_zeros, context

def __call__(self, inputs, is_train=True):
    """
    forward pass
    :param inputs: input word embeddings
    :return:
    """
    seq_len = len(inputs)
    # hm0
    hm = dy.zeros((self.n_steps + 1, self.n_out))
    # h_tilde_0
    h_history = dy.zeros((self.n_out,))
    # list of hidden states
    H = []
    for i in range(seq_len):
        xt = inputs[i]
        hm, h_history = self.recurrence(xt, hm, h_history, dropout_flag=is_train)
        ht = hm[-1]
        H.append(ht)
    return H

def __call__(self, inputs, is_train=True):
    """
    :param inputs: input word embeddings
    :return: a list of hidden states for aspect predictions
    """
    seq_len = len(inputs)
    # hm0 and cm0
    hm = dy.zeros((self.n_steps, self.n_out))
    cm = dy.zeros((self.n_steps, self.n_out))
    h_tilde = dy.zeros((self.n_out,))
    # list of hidden states
    H = []
    for i in range(seq_len):
        xt = inputs[i]
        hm, cm, h_tilde = self.recurrence(xt, hm, cm, h_tilde, dropout_flag=is_train)
        ht = hm[-1]
        H.append(ht)
    return H

def get_state(self, encoder_outputs):
    final_state = encoder_outputs.hidden
    shape, batchsz = final_state[0].dim()
    # one zero state per layer of the encoder's final state
    return [dy.zeros(shape, batch_size=batchsz) for _ in range(len(final_state))]