def encode_pt(self, X, train=False):
    dy.renew_cg()
    w_pos = dy.parameter(self.w_pos)
    b_pos = dy.parameter(self.b_pos)
    ipts = []
    length = len(X[0])
    for i in range(length):
        cids = X[0][i]
        wid = X[1][i]
        tids = X[2][i]
        vec_char = self.char_seq_model.transduce([self.UNI[cid] for cid in cids])[-1]
        vec_tags = []
        for tid in tids:
            if tid == 0:
                zero = dy.inputVector(np.zeros(self.dim_tag_emb))
                vec_tags.append(zero)
            else:
                vec_tags.append(self.POS[tid])
        vec_tag = dy.esum(vec_tags)
        if wid == 0:
            vec_word = dy.inputVector(np.zeros(self.dim_word))
        else:
            vec_word = self.WORD[wid]
        vec_at_i = dy.concatenate([vec_word, vec_char, vec_tag])
        if train is True:
            vec_at_i = dy.dropout(vec_at_i, self.dropout_rate)
        ipts.append(vec_at_i)
    hiddens = self.pos_model.transduce(ipts)
    probs = [dy.softmax(w_pos * h + b_pos) for h in hiddens]
    return probs
def forward_backward(self, observations):
    init_alphas = [0, 0]
    forward_mess = dy.inputVector(init_alphas)
    alpha = []
    for i in range(len(observations) - 1):
        alphas_t = []
        for next_tag in range(2):
            obs_broadcast = dy.concatenate([dy.pick(observations[i], next_tag)] * 2)
            next_tag_expr = forward_mess + self.transitions[next_tag] + obs_broadcast
            alphas_t.append(self.log_sum_exp(next_tag_expr))
        forward_mess = dy.concatenate(alphas_t)
        alpha.append(forward_mess)

    init_betas = [0, 0]
    backward_mess = dy.inputVector(init_betas)
    beta = []
    for i in range(len(observations) - 1):
        beta_t = []
        for next_tag in range(2):
            obs = observations[len(observations) - i - 1]
            next_tag_expr = backward_mess + self.transitions[next_tag] + obs
            beta_t.append(self.log_sum_exp(next_tag_expr))
        backward_mess = dy.concatenate(beta_t)
        beta.append(backward_mess)

    mu = [x + y for x, y in zip(alpha, beta[::-1])]
    # compute marginal probabilities
    prob = [dy.pick(dy.softmax(w), 1) for w in mu]
    return prob
def scorer(self, q_d_hists, q_idf, bm25_score, overlap_features, p):
    """
    Makes all the calculations and returns a relevance score
    """
    idf_vec = dy.inputVector(q_idf)
    bm25_score = dy.scalarInput(bm25_score)
    overlap_features = dy.inputVector(overlap_features)
    # Pass each query term representation through the MLP
    term_scores = []
    for hist in q_d_hists:
        q_d_hist = dy.reshape(dy.inputVector(hist), (1, len(hist)))
        hidd_out = dy.rectify(q_d_hist * self.W_1 + self.b_1)
        for i in range(0, self.mlp_layers):
            hidd_out = dy.rectify(hidd_out * self.W_n[i] + self.b_n[i])
        term_scores.append(hidd_out * self.W_last + self.b_last)

    # Term gating
    gating_weights = idf_vec * self.w_g

    bm25_feature = bm25_score * self.W_bm25 + self.b_bm25
    drop_out = dy.scalarInput(1)
    drop_num = (np.random.rand(1) < p) / p  # p = probability of keeping a unit active
    drop_out.set(drop_num)
    bm25_feature *= drop_out

    drmm_score = dy.transpose(dy.concatenate(term_scores)) * dy.reshape(gating_weights, (len(q_idf), 1))  # basic MLP output
    doc_score = dy.transpose(dy.concatenate([drmm_score, overlap_features])) * self.W_scores + self.b_scores  # extra-features layer
    return doc_score
def transform(self, input_expr: dy.Expression, mask: Optional[batchers.Mask] = None):
    """
    Apply batch norm.

    Args:
        input_expr: input
        mask: compute statistics only over unmasked parts of the input expression
    """
    dim_in = input_expr.dim()
    param_bn_gamma = dy.parameter(self.gamma)
    param_bn_beta = dy.parameter(self.beta)
    if self.train:
        num_unmasked = 0
        if mask is not None:
            input_expr = set_masked_to_mean(mask, input_expr, self.time_first)
            num_unmasked = (mask.np_arr.size - np.count_nonzero(mask.np_arr)) * broadcast_factor(mask, input_expr)
        bn_mean = dy.moment_dim(input_expr, self.get_stat_dimensions(), 1, True, num_unmasked)
        neg_bn_mean_reshaped = dy.reshape(-bn_mean, self.get_normalizer_dimensionality())
        # running mean update: (1 - momentum) * old + momentum * new
        self.population_running_mean += (-BN_MOMENTUM) * self.population_running_mean + BN_MOMENTUM * bn_mean.npvalue()
        bn_std = dy.std_dim(input_expr, self.get_stat_dimensions(), True, num_unmasked)
        self.population_running_std += (-BN_MOMENTUM) * self.population_running_std + BN_MOMENTUM * bn_std.npvalue()
    else:
        neg_bn_mean_reshaped = -dy.reshape(dy.inputVector(self.population_running_mean), self.get_normalizer_dimensionality())
        bn_std = dy.inputVector(self.population_running_std)

    bn_numerator = input_expr + neg_bn_mean_reshaped
    bn_xhat = dy.cdiv(bn_numerator, dy.reshape(bn_std, self.get_normalizer_dimensionality()) + BN_EPS)
    bn_y = dy.cmult(param_bn_gamma, bn_xhat) + param_bn_beta  # y = gamma * xhat + beta
    dim_out = bn_y.dim()
    self.save_processed_arg("population_running_mean", self.population_running_mean)
    self.save_processed_arg("population_running_std", self.population_running_std)
    assert dim_out == dim_in
    return bn_y
def _upsample(self, mgc, start, stop):
    mgc_index = int(start / len(self.upsample_w_t))
    ups_index = start % len(self.upsample_w_t)
    upsampled = []
    mgc_index_next = mgc_index + 1
    if mgc_index_next == len(mgc):
        mgc_index_next -= 1
    mgc_vect = dy.concatenate([dy.inputVector(mgc[mgc_index]), dy.inputVector(mgc[mgc_index_next])])
    for x in range(stop - start):
        # sigm = dy.logistic(self.upsample_w_s[ups_index].expr(update=True) * mgc_vect + self.upsample_b_s[ups_index].expr(update=True))
        tnh = dy.tanh(self.upsample_w_t[ups_index].expr(update=True) * mgc_vect +
                      self.upsample_b_t[ups_index].expr(update=True))
        # r = dy.cmult(sigm, tnh)
        upsampled.append(tnh)
        ups_index += 1
        if ups_index == len(self.upsample_w_t):
            ups_index = 0
            mgc_index += 1
            if mgc_index == len(mgc):  # last frame is sometimes not processed, but it should have similar parameters
                mgc_index -= 1
            else:
                mgc_index_next = mgc_index + 1
                if mgc_index_next == len(mgc):
                    mgc_index_next -= 1
                mgc_vect = dy.concatenate([dy.inputVector(mgc[mgc_index]), dy.inputVector(mgc[mgc_index_next])])
    return upsampled
def train(self, epoch):
    # training loop of the neural network
    history = []
    for i in range(epoch):
        start = time.time()
        total_loss = 0
        print('Epoch ' + str(i + 1) + ':')
        for batch in self.minibathces:
            dy.renew_cg()
            losses = []
            for word in batch:
                x = self.training_data[word][0]
                y = self.training_data[word][1]
                dy_x = dy.inputVector(x)
                dy_y = dy.inputVector(y)
                output = self.feedForward(dy_x)
                l = self.calculateLoss(output, dy_y)
                losses.append(l)
            loss = dy.esum(losses) / len(losses)
            total_loss += loss.value()
            loss.backward()
            self.trainer.update()
        end = time.time()
        print('Loss = {0}\nTime it takes = {1} minutes.'.format(
            total_loss / len(self.minibathces), (end - start) / 60))
        history.append(total_loss / len(self.minibathces))
    return history
def loss(self, instance):
    trans = instance.transformation
    #trans = 'lol'
    if trans not in self.known_transformations:
        newtrans = list(self.param_dict.keys())[0][0]  # SUPER ARBITRARY
        tqdm.write(
            "WARNING: unknown transformation picked for instance {}; using transformation {}"
            .format(trans, newtrans))
        trans = newtrans
    b1 = dy.parameter(self.param_dict[(trans, 'b1')])
    W1 = dy.parameter(self.param_dict[(trans, 'W1')])
    b2 = dy.parameter(self.param_dict[(trans, 'b2')])
    W2 = dy.parameter(self.param_dict[(trans, 'W2')])
    #b3 = dy.parameter(self.param_dict[(trans, 'b3')])
    #W3 = dy.parameter(self.param_dict[(trans, 'W3')])
    #b = dy.parameter(self.param_dict[(trans, 'b')])
    #W = dy.parameter(self.param_dict[(trans, 'W')])
    x = dy.inputVector(instance.xs_distr_vec)
    y = dy.inputVector(instance.ys_distr_vec)
    #prediction = dy.affine_transform([b, W, x])
    prediction = dy.affine_transform(
        [b2, W2, dy.tanh(dy.affine_transform([b1, W1, x]))])
    #prediction = dy.affine_transform(
    #    [b3, W3, dy.tanh(dy.affine_transform(
    #        [b2, W2, dy.tanh(dy.affine_transform([b1, W1, x]))]))])
    loss = dy.squared_distance(prediction, y)
    return prediction, loss
def encode(self, sent, train_mode=False):
    # encode the root
    # sent.root.vecs['feat'] = self.special[0]
    for token in sent.get_tokens():
        vecs = []
        if 'word' in self.args.features:
            if train_mode and np.random.random() < 0.01 and token['word'] in self.word_drop_list:
                word_idx = int(token['word'][0].isupper())
            else:
                word_idx = self.word_map.get(token['word'], int(token['word'][0].isupper()))
            word_vec = self.word_emb[word_idx] if word_idx else dy.inputVector(np.zeros(self.args.hid_dim))
            vecs.append(word_vec)
        if 'lemma' in self.args.features:
            lemma_idx = 0 if train_mode and np.random.random() < 0.1 and token['lemma'] in self.lemma_drop_list \
                else self.lemma_map.get(token['lemma'], 0)
            lemma_vec = self.lemma_emb[lemma_idx] if lemma_idx else dy.inputVector(np.zeros(self.args.hid_dim))
            vecs.append(lemma_vec)
        if 'upos' in self.args.features:
            upos_vec = self.upos_emb[self.upos_map.get(token['upos'], 0)]
            vecs.append(upos_vec)
        if 'xpos' in self.args.features:
            vecs.append(self.xpos_emb[self.xpos_map.get(token['xpos'], 0)])
        if 'label' in self.args.features:
            vecs.append(self.label_emb[self.label_map.get(token['label'], 0)])
        if 'char_lstm' in self.args.features:
            char_vecs = [self.char_emb[self.char_map.get(c, 0)] for c in token['clemma']]
            f_vecs = self.char_lstm_f_encoder.initial_state().transduce(char_vecs)
            b_vecs = self.char_lstm_b_encoder.initial_state().transduce(reversed(char_vecs))
            char_vec = dy.concatenate([f_vecs[-1], b_vecs[-1]])
            vecs.append(char_vec)
        # if 'morph' in self.args.features and 'lemma' in self.args.features and 'upos' in self.args.features:
        if 'morph' in self.args.features:
            morph_items = ([token['upos']] if 'upos' in self.args.features else ['<#m?>']) + token['morph']
            morph_input = [self.morph_emb[self.morph_map.get(m, 0)] for m in morph_items]
            morph_vec = self.morph_lstm_encoder.initial_state().transduce(morph_input)[-1]
            vecs.append(morph_vec)
        # token.vecs['feat'] = dy.concatenate(vecs)
        # token.vecs['feat'] = sum(vecs)
        token.vecs['feat'] = dy.dropout(sum(vecs), self.args.dropout) if train_mode else sum(vecs)
def compute_embeddings(self, word, runtime=True):
    x_list = []
    if not isinstance(word, unicode):
        uniword = unicode(word, 'utf-8')
    else:
        import copy
        uniword = copy.deepcopy(word)

    uniword = re.sub(r'\d', '0', uniword)
    for i in range(len(uniword)):
        char = uniword[i]
        if char.lower() == char and char.upper() == char:
            style_emb = dy.inputVector([1.0, 0.0, 0.0])  # does not support uppercase
        elif char.lower() == char:
            style_emb = dy.inputVector([0.0, 1.0, 0.0])  # is lowercased
        else:
            style_emb = dy.inputVector([0.0, 0.0, 1.0])  # is uppercased
        char = char.lower()
        if char in self.encodings.char2int:
            x_list.append(dy.concatenate([self.character_lookup[self.encodings.char2int[char]], style_emb]))
        else:
            x_list.append(dy.concatenate([self.character_lookup[self.encodings.char2int['<UNK>']], style_emb]))

    rnn_outputs = x_list
    rnn_states_fw = None
    rnn_states_bw = None
    for rnn_fw, rnn_bw in zip(self.rnn_fw, self.rnn_bw):
        fw = []
        bw = []
        if runtime:
            rnn_fw.set_dropouts(0, 0)
            rnn_bw.set_dropouts(0, 0)
        else:
            rnn_fw.set_dropouts(0, 0.33)
            rnn_bw.set_dropouts(0, 0.33)
        rnn_fw = rnn_fw.initial_state()
        rnn_bw = rnn_bw.initial_state()
        rnn_states_fw = []
        rnn_states_bw = []
        for x in rnn_outputs:
            rnn_fw = rnn_fw.add_input(x)
            rnn_states_fw.append(rnn_fw)
            fw.append(rnn_states_fw[-1].output())
        for x in reversed(rnn_outputs):
            rnn_bw = rnn_bw.add_input(x)
            rnn_states_bw.append(rnn_bw)
            bw.append(rnn_states_bw[-1].output())
        rnn_outputs = []
        for x1, x2 in zip(fw, reversed(bw)):
            rnn_outputs.append(dy.concatenate([x1, x2]))

    attention = self._attend(rnn_outputs, rnn_states_fw[-1], rnn_states_bw[-1])
    pre_linear = dy.concatenate([fw[-1], bw[-1], attention])
    embedding = dy.tanh(self.linearW.expr() * pre_linear + self.linearB.expr())
    return embedding, rnn_outputs
def do_one_batch(X_batch, Z_batch):
    # Flatten the batch into a 1-D vector as a workaround
    batch_size = X_batch.shape[0]
    if DO_BATCH:
        X_batch_f = X_batch.flatten('F')
        Z_batch_f = Z_batch.flatten('F')
        x = dy.reshape(dy.inputVector(X_batch_f), (nmf, nframes), batch_size=batch_size)
        z = dy.reshape(dy.inputVector(Z_batch_f), (nvgg,), batch_size=batch_size)
        scnn.add_input([X_batch[i] for i in range(X_batch.shape[0])])
        vgg.add_input([Z_batch[i] for i in range(X_batch.shape[0])])
    else:
        x = dy.matInput(X_batch.shape[0], X_batch.shape[1])
        x.set(X_batch.flatten('F'))
        z = dy.vecInput(Z_batch.shape[0])
        z.set(Z_batch.flatten('F'))
        x = dy.reshape(dy.transpose(x, [1, 0]), (1, X_batch.shape[1], X_batch.shape[0]))
        print(x.npvalue().shape)

    a_h1 = dy.conv2d_bias(x, w_i, b_i, [1, 1], is_valid=False)
    h1 = dy.rectify(a_h1)
    h1_pool = dy.kmax_pooling(h1, D[1], d=1)

    a_h2 = dy.conv2d_bias(h1_pool, w_h1, b_h1, [1, 1], is_valid=False)
    h2 = dy.rectify(a_h2)
    h2_pool = dy.kmax_pooling(h2, D[2], d=1)

    a_h3 = dy.conv2d_bias(h2_pool, w_h2, b_h2, [1, 1], is_valid=False)
    h3 = dy.rectify(a_h3)
    h3_pool = dy.kmax_pooling(h3, D[3], d=1)

    h4 = dy.kmax_pooling(h3_pool, 1, d=1)
    h4_re = dy.reshape(h4, (J[3],))
    #print(h4_re.npvalue().shape)
    g = dy.scalarInput(1.)
    zem_sp = dy.weight_norm(h4_re, g)
    #print(zem_sp.npvalue().shape)
    zem_vgg = w_embed * z + b_embed
    #print(zem_vgg.npvalue().shape)

    sa = dy.transpose(zem_sp) * zem_vgg
    s = dy.rectify(sa)

    if PRINT_EMBED:
        print('Vgg embedding vector:', zem_vgg.npvalue().shape)
        print(zem_vgg.value())
        print('Speech embedding vector:', zem_sp.npvalue().shape)
        print(zem_sp.value())
    if PRINT_SIM:
        print('Raw Similarity:', sa.npvalue())
        print(sa.value())
        print('Similarity:', s.npvalue())
        print(s.value())

    return s
def learn(self, wave, mgc, batch_size):
    # disc, wave = self.dio.ulaw_encode(wave)
    # from ipdb import set_trace
    # set_trace()
    last_proc = 0
    dy.renew_cg()
    total_loss = 0
    losses = []
    cnt = 0
    noise = np.random.normal(0, 1.0, (len(wave) + self.UPSAMPLE_COUNT))
    for mgc_index in range(len(mgc)):
        curr_proc = int((mgc_index + 1) * 100 / len(mgc))
        if curr_proc % 5 == 0 and curr_proc != last_proc:
            while last_proc < curr_proc:
                last_proc += 5
                sys.stdout.write(' ' + str(last_proc))
                sys.stdout.flush()
        if mgc_index < len(mgc) - 1:
            output, excitation, filter, vuv = self._predict_one(
                mgc[mgc_index],
                noise[self.UPSAMPLE_COUNT * mgc_index:self.UPSAMPLE_COUNT * mgc_index + 2 * self.UPSAMPLE_COUNT])
            # reconstruction error
            t_vect = wave[self.UPSAMPLE_COUNT * mgc_index:self.UPSAMPLE_COUNT * mgc_index + self.UPSAMPLE_COUNT]
            loss = dy.squared_distance(output, dy.inputVector(t_vect))
            # dynamic error
            o1 = dy.pickrange(output, 0, self.UPSAMPLE_COUNT - 1)
            o2 = dy.pickrange(output, 1, self.UPSAMPLE_COUNT)
            delta = o2 - o1
            real_delta = t_vect[1:self.UPSAMPLE_COUNT] - t_vect[0:self.UPSAMPLE_COUNT - 1]
            loss += dy.squared_distance(delta, dy.inputVector(real_delta))
            # excitation error
            # loss += dy.sum_elems(excitation)
            # o1 = dy.pickrange(excitation, 0, self.UPSAMPLE_COUNT - 1)
            # o2 = dy.pickrange(excitation, 1, self.UPSAMPLE_COUNT)
            # loss += dy.sum_elems(dy.abs(o2 - o1))
            losses.append(loss)
            cnt += self.UPSAMPLE_COUNT
            if len(losses) >= batch_size:
                loss = dy.esum(losses)
                total_loss += loss.value()
                loss.backward()
                self.trainer.update()
                losses = []
                dy.renew_cg()
    if len(losses) > 0:
        loss = dy.esum(losses)
        total_loss += loss.value()
        loss.backward()
        self.trainer.update()
        dy.renew_cg()
    return total_loss / cnt
def forward(self, input, enc_output, teacher_forcing_ratio):
    seq_len = len(input)

    bos_vector = [0.] * self.dec_vocab_size
    bos_vector[2] = 1.
    output = [dy.inputVector(bos_vector)]
    attention_weights = []

    rnn = self.rnn.initial_state([
        dy.inputVector(np.zeros(self.dec_hidden_dim))
        for i in range(2 * self.dec_num_layers)
    ])

    #print("Start forward loop:")
    context, _ = self._attention(rnn.s(), enc_output)
    #context = enc_output[-1]

    # input is a list of ints, starting with 2 "[BOS]" 2 4 5 3
    for i in range(0, seq_len - 1):  # we stop when we feed the decoder the [EOS] and take its output (hence the -1)
        # calculate the context vector at step i;
        # context is [encoder_size], attention_weights is [seq_len]  # todo
        context, step_attention_weights = self._attention(rnn.s(), enc_output)
        #context = enc_output[-1]
        step_attention_weights = []

        # save attention weights incrementally
        #attention_weights.append(step_attention_weights)

        #if np.random.uniform(0, 1) < teacher_forcing_ratio or i is 0:
        word_embedding = dy.dropout(self.embedding[input[i]], self.dec_dropout)
        """else:
            #prev_predicted_word_index = np.argmax(lin_output.value())
            #index_vector = dy.inputVector(np.arange(self.dec_vocab_size))
            argmax = dy.argmax(lin_output, gradient_mode='zero_gradient')
            prev_embedding = dy.dropout(self.embedding*argmax, self.dec_dropout)
            #prev_predicted_word_index = dy.sum_elems(dy.cmult(index_vector,dy.argmax(lin_output, gradient_mode='zero_gradient')))
            #print(prev_predicted_word_index.value())
            #word_embedding = dy.dropout(self.embedding[prev_predicted_word_index], self.dec_dropout)
        """

        lstm_input = dy.concatenate([word_embedding, context])
        rnn = rnn.add_input(lstm_input)
        #print("rnn.s has {} vectors of length {}".format(len(rnn.s()), len(rnn.s()[0].value())))
        dec_output = rnn.output()

        # Map the decoder output to the decoder vocab size space.
        lin_output = self.output_linear_W.expr(update=True) * dec_output + self.output_linear_b.expr(update=True)
        output.append(lin_output)
        #print("Step {} predicted index = {}".format(i, np.argmax(lin_output.value())))

    return output, attention_weights
def __init__(self, cs=None, hs=None, full_vec=None, hidden_dim=None):
    if None not in [full_vec, hidden_dim]:
        length = int(len(full_vec) / 2)
        cvec = full_vec[:length]
        hvec = full_vec[length:]
        self.cs = [dy.inputVector(cvec[i * hidden_dim:(i + 1) * hidden_dim])
                   for i in range(int(length / hidden_dim))]
        self.hs = [dy.inputVector(hvec[i * hidden_dim:(i + 1) * hidden_dim])
                   for i in range(int(length / hidden_dim))]
    elif None not in [cs, hs]:
        self.cs = cs  # list of c expressions
        self.hs = hs  # list of h expressions
    else:
        raise MissingInput()
def attend(self, node):
    '''attention mechanism to return a weighted sum of bilstm vectors of all words in node
    '''
    if node.snt_id == -1:  # if node is a pre-defined meta node
        return self.bi_lstm[node.start_word_index_in_doc]
    #print(node.start_word_index_in_doc, node.end_word_index_in_doc)
    vectors = self.bi_lstm[node.start_word_index_in_doc:node.end_word_index_in_doc + 1]

    # build attention on a larger context:
    # +/- n words around the current timex/event, n = 2
    if node.start_word_index_in_doc <= 0:
        vectors.insert(0, dy.inputVector([0 for i in range(self.size_lstm * 2)]))
    else:
        vectors.insert(0, self.bi_lstm[node.start_word_index_in_doc - 1])
    if node.start_word_index_in_doc <= 1:
        vectors.insert(0, dy.inputVector([0 for i in range(self.size_lstm * 2)]))
    else:
        vectors.insert(0, self.bi_lstm[node.start_word_index_in_doc - 2])
    if node.end_word_index_in_doc >= len(self.bi_lstm) - 1:
        vectors.append(dy.inputVector([0 for i in range(self.size_lstm * 2)]))
    else:
        vectors.append(self.bi_lstm[node.end_word_index_in_doc + 1])
    if node.end_word_index_in_doc >= len(self.bi_lstm) - 2:
        vectors.append(dy.inputVector([0 for i in range(self.size_lstm * 2)]))
        #print(node.start_word_index_in_doc, node.end_word_index_in_doc)
        #print(len(self.bi_lstm))
        #print(self.size_lstm)
    else:
        vectors.append(self.bi_lstm[node.end_word_index_in_doc + 2])

    input_mat = dy.concatenate_cols(vectors)
    attn_w = dy.parameter(self.attention_w)
    unnormalized = dy.transpose(dy.tanh(attn_w * input_mat))
    att_weights = dy.softmax(unnormalized)
    weighted_sum = input_mat * att_weights
    return weighted_sum
def BuildLMGraph(self, sent, sent_args=None):
    dynet.renew_cg()
    init_state = self.rnn.initial_state()

    R = dynet.parameter(self.R)
    bias = dynet.parameter(self.bias)
    errs = []  # will hold expressions
    state = init_state

    for (cw, nw) in zip(sent, sent[1:]):
        cw = self.vocab[cw]
        nw = self.vocab[nw]
        if cw.s in self.pron_dict.pdict:
            fpv = self.pron_dict.pdict[cw.s]
            fpv = dynet.inputVector(fpv)
        else:
            spelling = [self.s2s.src_vocab[letter] for letter in cw.s.upper()]
            embedded_spelling = self.s2s.embed_seq(spelling)
            pron_vector = self.s2s.encode_seq(embedded_spelling)[-1]
            fpv = dynet.nobackprop(pron_vector)
        x_t = fpv
        state = state.add_input(x_t)
        y_t = state.output()
        r_t = bias + (R * y_t)
        err = dynet.pickneglogsoftmax(r_t, int(nw.i))
        errs.append(err)
    nerr = dynet.esum(errs)
    return nerr
def viterbi(self, observations):
    backpointers = []
    init_pis = [0, 0]
    forward_mess = dy.inputVector(init_pis)
    transitions = [self.transitions[idx] for idx in range(2)]
    for i in range(len(observations) - 1):
        bp_t = []
        pi_t = []
        for next_tag in range(2):
            next_tag_expr = forward_mess + transitions[next_tag]
            next_tag_arr = next_tag_expr.npvalue()
            best_tag_id = np.argmax(next_tag_arr)
            bp_t.append(best_tag_id)
            pi_t.append(dy.pick(next_tag_expr, best_tag_id))
        forward_mess = dy.concatenate(pi_t) + observations[i]
        backpointers.append(bp_t)
    # find the highest scoring final state and the corresponding score
    best_tag_id = np.argmax(forward_mess.npvalue())
    path_score = dy.pick(forward_mess, best_tag_id)
    # backtracking
    best_path = [best_tag_id]
    for bp_t in reversed(backpointers):
        best_tag_id = bp_t[best_tag_id]
        best_path.append(best_tag_id)
    best_path.pop()
    best_path.reverse()
    return best_path, path_score
def zero_input(dim):
    """
    Representation for missing elements
    :param dim: dimension of vector to return
    :return: zero vector (as in e.g. Kiperwasser and Goldberg 2016; an alternative could be to learn this value)
    """
    return dy.inputVector(np.zeros(dim, dtype=float))
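# A minimal usage sketch (hypothetical caller; `lookup` and `word_id` are
# illustrative names, not from the source): the zero vector stands in for a
# missing element so downstream concatenations keep a fixed dimensionality.
word_dim = 100
vec = lookup[word_id] if word_id is not None else zero_input(word_dim)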
def forward(self, observations):
    def log_sum_exp(scores):
        npval = scores.npvalue()
        argmax_score = np.argmax(npval)
        max_score_expr = dy.pick(scores, argmax_score)
        max_score_expr_broadcast = dy.concatenate([max_score_expr] * self.tagset_size)
        return max_score_expr + dy.log(
            dy.sum_cols(dy.transpose(dy.exp(scores - max_score_expr_broadcast))))

    init_alphas = [-1e10] * self.tagset_size
    init_alphas[t2i[START_TAG]] = 0
    for_expr = dy.inputVector(init_alphas)
    for obs in observations:
        alphas_t = []
        for next_tag in range(self.tagset_size):
            obs_broadcast = dy.concatenate([dy.pick(obs, next_tag)] * self.tagset_size)
            next_tag_expr = for_expr + self.transitions[next_tag] + obs_broadcast
            alphas_t.append(log_sum_exp(next_tag_expr))
        for_expr = dy.concatenate(alphas_t)
    terminal_expr = for_expr + self.transitions[t2i["<STOP>"]]
    alpha = log_sum_exp(terminal_expr)
    return alpha
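# The `log_sum_exp` helper above relies on the standard max-shift identity
# log(sum_i exp(s_i)) = max(s) + log(sum_i exp(s_i - max(s))), which avoids
# floating-point overflow. A quick NumPy sanity check (illustrative, not part
# of the original code):
import numpy as np

s = np.array([1000.0, 999.0])
naive = np.log(np.exp(s).sum())                       # overflows to inf
stable = s.max() + np.log(np.exp(s - s.max()).sum())  # ~1000.3133, finite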
def augment(scores, oracle_index):
    assert isinstance(scores, dy.Expression)
    shape = scores.dim()[0]
    assert len(shape) == 1
    increment = np.ones(shape)
    increment[oracle_index] = 0
    return scores + dy.inputVector(increment)
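# `augment` is the usual loss-augmented decoding trick: every non-oracle score
# gets a margin of 1, so training penalizes the model unless the oracle wins by
# at least that margin. A hedged usage sketch (`label_scores` and `gold` are
# assumed names, not from the source):
augmented = augment(label_scores, gold)
pred = int(np.argmax(augmented.npvalue()))
if pred != gold:
    margin_loss = dy.pick(augmented, pred) - dy.pick(augmented, gold)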
def predict(self, sentence):
    context_representations_for_ner_loss, context_representations_for_md_loss = \
        self.get_context_representations(sentence, training=False)

    last_layer_context_representations, _, _ = \
        self.get_last_layer_context_representations(sentence,
                                                    context_representations_for_ner_loss,
                                                    context_representations_for_md_loss)
    if self.parameters['active_models'] in [0, 2, 3]:
        tag_scores = self.calculate_tag_scores(last_layer_context_representations)

        # _, decoded_tags = self.crf_module.viterbi_loss(tag_scores,
        #                                                sentence['tag_ids'])
        observations = [
            dynet.concatenate([obs, dynet.inputVector([-1e10, -1e10])], d=0)
            for obs in tag_scores
        ]
        decoded_tags, _ = self.crf_module.viterbi_decoding(observations)
    else:
        decoded_tags = []

    # if self.parameters['integration_mode'] in [1, 2] or self.parameters['active_models'] == 1:
    if self.parameters['active_models'] in [1, 2, 3]:
        morph_analysis_representations, morph_analysis_scores = \
            self.get_morph_analysis_representations_and_scores(sentence,
                                                               context_representations_for_md_loss)
        selected_morph_analysis_representations = \
            self.disambiguate_morph_analyzes(morph_analysis_scores)
    else:
        selected_morph_analysis_representations = []

    return selected_morph_analysis_representations, decoded_tags
def viterbi_decoding(self, observations):
    backpointers = []
    init_vvars = [-1e10] * (self.n_tags + 2)
    init_vvars[self.b_id] = 0  # <Start> has all the probability
    for_expr = dynet.inputVector(init_vvars)
    trans_exprs = [self.transitions[idx] for idx in range(self.n_tags + 2)]
    for obs in observations:
        bptrs_t = []
        vvars_t = []
        for next_tag in range(self.n_tags + 2):
            next_tag_expr = for_expr + trans_exprs[next_tag]
            next_tag_arr = next_tag_expr.npvalue()
            best_tag_id = np.argmax(next_tag_arr)
            bptrs_t.append(best_tag_id)
            vvars_t.append(dynet.pick(next_tag_expr, best_tag_id))
        for_expr = dynet.concatenate(vvars_t) + obs
        backpointers.append(bptrs_t)
    # Perform final transition to terminal
    terminal_expr = for_expr + trans_exprs[self.e_id]
    terminal_arr = terminal_expr.npvalue()
    best_tag_id = np.argmax(terminal_arr)
    path_score = dynet.pick(terminal_expr, best_tag_id)
    # Reverse over the backpointers to get the best path
    best_path = [best_tag_id]  # Start with the tag that was best for terminal
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)
    start = best_path.pop()  # Remove the start symbol
    best_path.reverse()
    assert start == self.b_id
    # Return best path and best path's score
    return best_path, path_score
def forward(self, observations):
    def log_sum_exp(scores):
        npval = scores.npvalue()
        argmax_score = np.argmax(npval)
        max_score_expr = dy.pick(scores, argmax_score)
        max_score_expr_broadcast = dy.concatenate([max_score_expr] * self.dim_output)
        return max_score_expr + dy.log(
            dy.sum_elems(dy.transpose(dy.exp(scores - max_score_expr_broadcast))))

    init_alphas = [-1e10] * self.dim_output
    init_alphas[self.sp_s] = 0
    for_expr = dy.inputVector(init_alphas)
    for obs in observations:
        alphas_t = []
        for next_tag in range(self.dim_output):
            obs_broadcast = dy.concatenate([dy.pick(obs, next_tag)] * self.dim_output)
            next_tag_expr = for_expr + self.trans[next_tag] + obs_broadcast
            alphas_t.append(log_sum_exp(next_tag_expr))
        for_expr = dy.concatenate(alphas_t)
    terminal_expr = for_expr + self.trans[self.sp_e]
    alpha = log_sum_exp(terminal_expr)
    return alpha
def _forward(self, emissions):
    """Viterbi forward to calculate all path scores.

    :param emissions: List[dy.Expression]

    Returns:
        dy.Expression ((1,), B)
    """
    init_alphas = [-1e4] * self.n_tags
    init_alphas[self.start_idx] = 0

    alphas = dy.inputVector(init_alphas)
    transitions = self.transitions
    # len(emissions) == T
    for emission in emissions:
        add_emission = dy.colwise_add(transitions, emission)
        scores = dy.colwise_add(dy.transpose(add_emission), alphas)
        # dy.logsumexp takes a list of dy.Expression and computes logsumexp
        # elementwise across the lists, so for example the logsumexp is
        # calculated for [0] in each list. This means we want the scores for a
        # given transition into a tag to be in the columns
        alphas = dy.logsumexp([x for x in scores])
    last_alpha = alphas + dy.pick(transitions, self.end_idx)
    alpha = dy.logsumexp([x for x in last_alpha])
    return alpha
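# For cross-checking the vectorized indexing above, here is the same forward
# recursion in plain NumPy (a sketch assuming transitions[i, j] scores the move
# from tag j to tag i, which matches the transposes above; names illustrative):
import numpy as np
from scipy.special import logsumexp

def crf_forward_np(emissions, transitions, start_idx, end_idx):
    # emissions: (T, n_tags); transitions: (n_tags, n_tags), row = destination tag
    n_tags = transitions.shape[0]
    alphas = np.full(n_tags, -1e4)
    alphas[start_idx] = 0.0
    for e in emissions:
        # scores[i, j] = alphas[j] + transitions[i, j] + e[i]
        scores = alphas[None, :] + transitions + e[:, None]
        alphas = logsumexp(scores, axis=1)
    return logsumexp(alphas + transitions[end_idx])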
def viterbi_decoding(self, observations):
    backpointers = []
    init_vvars = [-1e10] * self.dim_output
    init_vvars[self.sp_s] = 0
    for_expr = dy.inputVector(init_vvars)
    trans_exprs = [self.trans[idx] for idx in range(self.dim_output)]
    for obs in observations:
        bptrs_t = []
        vvars_t = []
        for next_tag in range(self.dim_output):
            next_tag_expr = for_expr + trans_exprs[next_tag]
            next_tag_arr = next_tag_expr.npvalue()
            best_tag_id = np.argmax(next_tag_arr)
            bptrs_t.append(best_tag_id)
            vvars_t.append(dy.pick(next_tag_expr, best_tag_id))
        for_expr = dy.concatenate(vvars_t) + obs
        backpointers.append(bptrs_t)
    terminal_expr = for_expr + trans_exprs[self.sp_e]
    terminal_arr = terminal_expr.npvalue()
    best_tag_id = np.argmax(terminal_arr)
    path_score = dy.pick(terminal_expr, best_tag_id)
    best_path = [best_tag_id]
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)
    start = best_path.pop()
    best_path.reverse()
    assert start == self.sp_s
    return best_path, path_score
def forward(self, observations):
    def log_sum_exp(scores):
        npval = scores.npvalue()
        argmax_score = np.argmax(npval)
        max_score_expr = dynet.pick(scores, argmax_score)
        max_score_expr_broadcast = dynet.concatenate([max_score_expr] * (self.n_tags + 2))
        return max_score_expr + dynet.log(
            dynet.sum_cols(dynet.transpose(dynet.exp(scores - max_score_expr_broadcast))))

    init_alphas = [-1e10] * (self.n_tags + 2)
    init_alphas[self.b_id] = 0
    for_expr = dynet.inputVector(init_alphas)
    for idx, obs in enumerate(observations):
        # print "obs: ", obs.value()
        alphas_t = []
        for next_tag in range(self.n_tags + 2):
            obs_broadcast = dynet.concatenate([dynet.pick(obs, next_tag)] * (self.n_tags + 2))
            # print "for_expr: ", for_expr.value()
            # print "transitions next_tag: ", self.transitions[next_tag].value()
            # print "obs_broadcast: ", obs_broadcast.value()
            next_tag_expr = for_expr + self.transitions[next_tag] + obs_broadcast
            alphas_t.append(log_sum_exp(next_tag_expr))
        for_expr = dynet.concatenate(alphas_t)
    terminal_expr = for_expr + self.transitions[self.e_id]
    alpha = log_sum_exp(terminal_expr)
    return alpha
def create_poems(one_hot_vecs, bigram_model, indexed_vocab, line_number):
    model = dy.Model()
    dy.renew_cg()
    vector_size = len(one_hot_vecs)
    input_size = vector_size
    hidden_size = int(vector_size / 100)
    output_size = vector_size
    learning_rate = 0.1
    pW = model.add_parameters((hidden_size, output_size))
    pb = model.add_parameters((hidden_size))
    pU = model.add_parameters((input_size, hidden_size))
    pd = model.add_parameters((output_size))
    trainer = dy.SimpleSGDTrainer(model, learning_rate)
    a = model.populate("my_train.model")
    # print(a)
    for line in range(int(line_number)):
        rand = random.randrange(len(one_hot_vecs))
        predicted_word = ""
        poem = ""
        while predicted_word != "EOL":
            x = dy.inputVector(one_hot_vecs[rand])
            y = pU * dy.tanh(pW * x + pb) + pd
            rand = np.argmax(y.value())
            predicted_word = getWordFromIndexedVocab(indexed_vocab, np.argmax(y.value()))
            if predicted_word == "EOL":
                print()
                poem = poem + "\n"
            else:
                print(predicted_word, end=' ')
                poem = poem + predicted_word + " "
        print("POEM PERPLEXITY", BigramModel.calculate_bigram_perplexity(bigram_model, poem))
def getWordEmbeddings(self, sentence, train, options, test_embeddings=defaultdict(lambda: {})):

    if self.elmo:
        # Get full text of sentence - excluding root, which is loaded differently
        # for transition and graph-based parsers.
        if options.graph_based:
            sentence_text = " ".join([entry.form for entry in sentence[1:]])
        else:
            sentence_text = " ".join([entry.form for entry in sentence[:-1]])

        elmo_sentence_representation = \
            self.elmo.get_sentence_representation(sentence_text)

    for i, root in enumerate(sentence):
        root.vecs = defaultdict(lambda: None)  # all vecs are None by default (possibly a little risky?)
        if options.word_emb_size > 0:
            if train:
                word_count = float(self.word_counts.get(root.norm, 0))
                dropFlag = random.random() > word_count / (0.25 + word_count)
                root.vecs["word"] = self.word_lookup[self.words.get(root.norm, 0) if not dropFlag else 0]
            else:  # need to check in test_embeddings at prediction time
                if root.norm in self.words:
                    root.vecs["word"] = self.word_lookup[self.words[root.norm]]
                elif root.norm in test_embeddings["words"]:
                    root.vecs["word"] = dy.inputVector(test_embeddings["words"][root.norm])
                else:
                    root.vecs["word"] = self.word_lookup[0]
        if options.pos_emb_size > 0:
            root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
        if options.char_emb_size > 0:
            root.vecs["char"] = self.get_char_vector(root, train, test_embeddings["chars"])
        if options.tbank_emb_size > 0:
            if options.forced_tbank_emb:
                treebank_id = options.forced_tbank_emb
            elif root.proxy_tbank:
                treebank_id = root.proxy_tbank
            else:
                treebank_id = root.treebank_id
            # this is a bit of a hack for models trained on an old version of the code
            # that used treebank name rather than id as the lookup
            if treebank_id not in self.treebanks and treebank_id in utils.reverse_iso_dict and \
                    utils.reverse_iso_dict[treebank_id] in self.treebanks:
                treebank_id = utils.reverse_iso_dict[treebank_id]
            root.vecs["treebank"] = self.treebank_lookup[self.treebanks[treebank_id]]
        if self.elmo:
            if i < len(sentence) - 1:
                # Don't look up the 'root' word
                root.vecs["elmo"] = elmo_sentence_representation[i]
            else:
                # TODO
                root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim)

        root.vec = dy.concatenate(list(filter(None, [root.vecs["word"],
                                                     root.vecs["elmo"],
                                                     root.vecs["pos"],
                                                     root.vecs["char"],
                                                     root.vecs["treebank"]])))

    for bilstm in self.bilstms:
        bilstm.set_token_vecs(sentence, train)
def pz(self, eq):
    """
    Gumbel softmax on distribution over z.
    """
    W = dy.parameter(self.W)
    prob = dy.softmax(W * eq)
    gumbel = dy.random_gumbel(self.num_clusters)
    # y = []
    # denom = []
    # for z in range(self.num_clusters):
    #     pi_i = prob[z]
    #     g_i = gumbel[z]
    #     val = dy.exp((dy.log(pi_i) + g_i) / self.temp)
    #     denom.append(val)
    # denom = dy.esum(denom)
    # for z in range(self.num_clusters):
    #     pi_i = prob[z]
    #     g_i = gumbel[z]
    #     numerator = dy.exp((dy.log(pi_i) + g_i) / self.temp)
    #     y.append(dy.cdiv(numerator, denom))
    # Note: the commented-out block follows the standard formulation,
    # softmax((log pi + g) / temp); the active line below softmaxes
    # (pi + g) / temp, i.e. without the log on the probabilities.
    logits = dy.softmax(dy.cdiv(dy.esum([prob, gumbel]), dy.inputVector([self.temp])))
    # logits = dy.concatenate(y)
    # print(np.max(logits.npvalue()))
    return logits
def _attend(self, input_list, decoder_state, last_pos=None):
    w1 = self.att_w1.expr(update=True)
    w2 = self.att_w2.expr(update=True)
    v = self.att_v.expr(update=True)
    attention_weights = []

    w2dt = w2 * dy.concatenate([decoder_state.s()[-1]])
    for input_vector in input_list:
        attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
        attention_weights.append(attention_weight)
    attention_weights = dy.softmax(dy.concatenate(attention_weights))

    # force incremental attention if this is runtime
    if last_pos is not None:
        current_pos = np.argmax(attention_weights.value())
        if current_pos < last_pos or current_pos >= last_pos + 3:
            current_pos = last_pos + 1
            if current_pos >= len(input_list):
                current_pos = len(input_list) - 1
            output_vectors = input_list[current_pos]
            simulated_att = np.zeros((len(input_list)))
            simulated_att[current_pos] = 1.0
            new_att_vec = dy.inputVector(simulated_att)
            return output_vectors, new_att_vec

    output_vectors = dy.esum([
        vector * attention_weight
        for vector, attention_weight in zip(input_list, attention_weights)
    ])
    return output_vectors, attention_weights
def learn(self, characters, target_mgc, guided_att=True):
    num_mgc = target_mgc.shape[0]
    # print num_mgc
    dy.renew_cg()
    output_mgc, output_stop, output_attention = self._predict(characters, target_mgc)
    losses = []
    index = 0
    for mgc, real_mgc in zip(output_mgc, target_mgc):
        t_mgc = dy.inputVector(real_mgc)
        # losses.append(self._compute_binary_divergence(mgc, t_mgc))
        losses.append(dy.l1_distance(mgc, t_mgc))

        if index % 3 == 0:
            # attention loss (integer division so the indices stay ints under Python 3)
            if guided_att:
                att = output_attention[index // 3]
                losses.append(self._compute_guided_attention(att, index // 3, len(characters) + 2, num_mgc // 3))
            # EOS loss
            stop = output_stop[index // 3]
            if index >= num_mgc - 6:
                losses.append(dy.l1_distance(stop, dy.scalarInput(-0.8)))
            else:
                losses.append(dy.l1_distance(stop, dy.scalarInput(0.8)))
        index += 1
    loss = dy.esum(losses)
    loss_val = loss.value() / num_mgc
    loss.backward()
    self.trainer.update()
    return loss_val
def viterbi(emissions, transition, start_idx, end_idx, norm=False):
    n_tags = emissions[0].dim()[0][0]
    backpointers = []

    inits = [-1e4] * n_tags
    inits[start_idx] = 0
    alphas = dy.inputVector(inits)
    alphas = dy.log_softmax(alphas) if norm else alphas

    for emission in emissions:
        next_vars = dy.colwise_add(dy.transpose(transition), alphas)
        best_tags = np.argmax(next_vars.npvalue(), 0)
        v_t = dy.max_dim(next_vars, 0)
        alphas = v_t + emission
        backpointers.append(best_tags)

    terminal_expr = alphas + dy.pick(transition, end_idx)
    best_tag = np.argmax(terminal_expr.npvalue())
    path_score = dy.pick(terminal_expr, best_tag)

    best_path = [best_tag]
    for bp_t in reversed(backpointers):
        best_tag = bp_t[best_tag]
        best_path.append(best_tag)
    _ = best_path.pop()
    best_path.reverse()
    return best_path, path_score
def generate_poem():
    start = '<s>'
    poem = ''
    wordflag = 0
    for i in range(25):
        dy.renew_cg()
        W = dy.parameter(pW)
        b = dy.parameter(pb)
        U = dy.parameter(pU)
        d = dy.parameter(pd)
        x_val = dy.inputVector(list(one_hot_encoded[word_index[start]]))
        h_val = dy.tanh(W * x_val + b)
        y_val = U * h_val + d
        probs = dy.softmax(y_val)
        poem += start
        poem += ' '
        wordflag += 1
        if wordflag == 5:
            poem += '\n'
            wordflag = 0
        start = weightedChoice(probs.value(), unigrams)
        prob_list.append(probs[word_index[start]].value())
    return poem
def ergm_score(self):
    """
    :return: ERGM score (a dynet Expression) computed from the ERGM weights and features only.
             Does not populate any field.
    """
    W = dy.parameter(self.ergm_weights)
    f = dy.transpose(dy.inputVector([self.feature_vals[k] for k in self.feature_set]))
    return f * W
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    # get the output of the first LSTM
    src_output = init_state_src.add_inputs(
        [dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()

    # now decode
    all_losses = []

    # Decoder
    # need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the current state into the decoder
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = dy.pickneglogsoftmax_batch(s, next_words)

        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)

    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)

    # The mean vector from the encoder.
    mu = mlp(src_output, W_mean, V_mean, b_mean)
    # This is the diagonal vector of the log co-variance matrix from the encoder
    # (regarding this as the log variance is easier for future implementation)
    log_var = mlp(src_output, W_var, V_var, b_var)

    # Compute KL[N(u(x), sigma(x)) || N(0, I)]
    # = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        # feed the current state into the decoder
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    return kl_loss, softmax_loss
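# `reparameterize` is referenced above but not shown in this snippet. A minimal
# sketch of the usual reparameterization trick, z = mu + sigma * eps with
# eps ~ N(0, I) (an assumption about the helper, not the source's code):
def reparameterize(mu, log_var):
    dim = mu.dim()[0][0]
    eps = dy.random_normal(dim)                       # fresh noise on every call
    return mu + dy.cmult(dy.exp(log_var * 0.5), eps)  # sigma = exp(log_var / 2)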
def init_features(self, embeddings, train=False):
    """
    Set the value of self.input_reps (and self.empty_rep) given embeddings for the whole input sequence
    :param embeddings: list of [(key, list of vector embeddings per time step)] per feature
    :param train: are we training now?
    """
    if self.params:
        keys, embeddings = zip(*embeddings)
        inputs = [self.mlp.evaluate(zip(keys, es), train=train) for es in zip(*embeddings)]  # join each time step
        self.config.print("Transducing %d inputs with dropout %s" % (
            len(inputs), self.dropout if train else "disabled"), level=4)
        self.input_reps = self.transduce(inputs, train)
        expected = min(len(inputs), self.max_length or np.iinfo(int).max)
        assert len(self.input_reps) == expected, \
            "transduce() returned incorrect number of elements: %d != %d" % (len(self.input_reps), expected)
        self.empty_rep = dy.inputVector(np.zeros(self.lstm_layer_dim, dtype=float))
def generate_inputs(self, features, axis):
    indices = []  # list, not set, in order to maintain consistent order
    for key, values in sorted(features.items()):
        param = self.input_params[key]
        lookup = self.params.get(key)
        if param.numeric:
            yield key, dy.inputVector(values)
        elif param.indexed:  # collect indices to be looked up
            indices += values  # DenseFeatureExtractor collapsed features so there are no repetitions between them
        elif lookup is None:  # ignored
            continue
        else:  # lookup feature
            yield from ((key, self.get_empty_values(key) if x == MISSING_VALUE else lookup[x]) for x in values)
        self.config.print(lambda: "%s: %s" % (key, values), level=4)
    if indices:
        for birnn in self.get_birnns(axis):
            yield from birnn.evaluate(indices)
def calc_sent_loss(sent):
    # Create a computation graph
    dy.renew_cg()

    # Get embeddings for the sentence
    emb = [W_w_p[x] for x in sent]

    # Step through the sentence and calculate binary prediction losses
    all_losses = []
    for i, my_emb in enumerate(emb):
        scores = dy.logistic(W_c * my_emb)
        pos_words = ([sent[x] if x >= 0 else S for x in range(i - N, i)] +
                     [sent[x] if x < len(sent) else S for x in range(i + 1, i + N + 1)])
        word_repr = [[float(y) for y in np.binary_repr(x).zfill(nbits)] for x in pos_words]
        word_repr = [dy.inputVector(x) for x in word_repr]
        all_losses.extend([dy.binary_log_loss(scores, x) for x in word_repr])
    return dy.esum(all_losses)
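# Each context word id becomes an nbits-wide bit vector that serves as the
# binary prediction target above. A small illustration of that encoding
# (values are illustrative, not from the source):
import numpy as np

nbits = 5
wid = 6
bits = [float(b) for b in np.binary_repr(wid).zfill(nbits)]
# np.binary_repr(6) == '110'; zfill(5) -> '00110' -> [0.0, 0.0, 1.0, 1.0, 0.0]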
def BuildLMGraph(self, sents):
    dy.renew_cg()
    # initialize the RNN
    init_state = self.builder.initial_state()
    # parameters -> expressions
    R = dy.parameter(self.R)
    bias = dy.parameter(self.bias)

    S = vocab.w2i["<s>"]
    # get the cids and masks for each step
    tot_chars = 0
    cids = []
    masks = []

    for i in range(len(sents[0])):
        cids.append([(vocab.w2i[sent[i]] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_chars += sum(mask)

    # start the rnn with "<s>"
    init_ids = cids[0]
    s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

    losses = []

    # feed char vectors into the RNN and predict the next char
    for cid, mask in zip(cids[1:], masks[1:]):
        score = dy.affine_transform([bias, R, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, cid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        cemb = dy.lookup_batch(self.lookup, cid)
        s = s.add_input(cemb)

    return dy.sum_batches(dy.esum(losses)), tot_chars
def calc_lm_loss(sents):
    dy.renew_cg()
    # initialize the RNN
    f_init = RNN.initial_state()

    # get the wids and masks for each step
    tot_words = 0
    wids = []
    masks = []
    for i in range(len(sents[0])):
        wids.append([(sent[i] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_words += sum(mask)

    # start the rnn by inputting "<s>"
    init_ids = [S] * len(sents)
    s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, init_ids))

    # feed word vectors into the RNN and predict the next word
    losses = []
    for wid, mask in zip(wids, masks):
        # calculate the softmax and loss
        score = dy.affine_transform([b_exp, W_exp, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, wid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        wemb = dy.lookup_batch(WORDS_LOOKUP, wid)
        s = s.add_input(wemb)

    return dy.sum_batches(dy.esum(losses)), tot_words
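# The masking idiom shared by the batched loss functions here reshapes a 0/1
# per-sentence mask into a ((1,), batch) expression so it multiplies the
# batched loss elementwise. A small self-contained check (values illustrative):
import dynet as dy
import numpy as np

dy.renew_cg()
loss = dy.inputTensor(np.array([[0.7, 0.5, 0.9]]), batched=True)  # ((1,), 3)
mask = dy.reshape(dy.inputVector([1, 1, 0]), (1,), 3)             # third sentence ended
masked = loss * mask
print(dy.sum_batches(masked).value())  # 1.2 -- the padded position contributes 0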
def get_empty_values(self, key):
    value = self.empty_values.get(key)
    if value is None:
        self.empty_values[key] = value = dy.inputVector(np.zeros(self.input_params[key].dim, dtype=float))
    return value
alpha = 0.05  # smoothing of training loss for reporting
start = time.time()
dev_time = 0
report = args.minibatch_size * 30
dev_report = args.minibatch_size * 600
eloss = None  # exponentially smoothed training loss (must exist before the first update)

for epoch in range(50):
    random.shuffle(training)
    print("Epoch {} starting".format(epoch + 1))
    i = 0
    while i < len(training):
        dy.renew_cg()
        mbsize = min(args.minibatch_size, len(training) - i)
        minibatch = training[i:i + mbsize]
        losses = []
        for lbl, img in minibatch:
            x = dy.inputVector(img)
            logits = classify(x, dropout=True)
            loss = dy.pickneglogsoftmax(logits, lbl)
            losses.append(loss)
        mbloss = dy.esum(losses) / mbsize
        mbloss.backward()
        sgd.update()

        # eloss is an exponentially smoothed loss.
        if eloss is None:
            eloss = mbloss.scalar_value()
        else:
            eloss = mbloss.scalar_value() * alpha + eloss * (1.0 - alpha)

        # Do dev evaluation here:
        if (i > 0) and (i % dev_report == 0):
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    # get the outputs of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()])
                   for x, y in LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])]
    src_output = src_outputs[-1]

    # get the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    # now decode
    all_losses = []

    # Decoder
    # need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the current state into the decoder
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = dy.pickneglogsoftmax_batch(s, next_words)
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words