def test_gradient_sanity(self):
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    y = dy.inputTensor(self.v2)
    l = dy.dot_product(x, y)
    l.forward()
    self.assertRaises(RuntimeError, gradient_callable, x)
def learn(self, batch_size):
    exps = self.memory.sample(batch_size)
    obss, actions, rewards, obs_nexts, dones = self._process(exps)

    # Update critic
    dy.renew_cg()
    target_actions = self.actor_target(obs_nexts, batched=True)
    target_values = self.critic_target(
        dy.concatenate([dy.inputTensor(obs_nexts, batched=True), target_actions]), batched=True)
    target_values = rewards + 0.99 * target_values.npvalue() * (1 - dones)

    dy.renew_cg()
    values = self.critic(np.concatenate([obss, actions]), batched=True)
    loss = dy.mean_batches((values - dy.inputTensor(target_values, batched=True)) ** 2)
    loss_value_critic = loss.npvalue()
    loss.backward()
    self.trainer_critic.update()

    # Update actor
    dy.renew_cg()
    actions = self.actor(obss, batched=True)
    obs_and_actions = dy.concatenate([dy.inputTensor(obss, batched=True), actions])
    loss = -dy.mean_batches(self.critic(obs_and_actions, batched=True))
    loss_value_actor = loss.npvalue()
    loss.backward()
    self.trainer_actor.update()

    self.noise_stddev = (self.noise_stddev - self.noise_stddev_decrease) \
        if self.noise_stddev > self.noise_stddev_lower else self.noise_stddev_lower

    self.actor_target.update(self.actor, soft=True)
    self.critic_target.update(self.critic, soft=True)
    return loss_value_actor + loss_value_critic
def bilinear(x, W, y, input_size, seq_len, batch_size, num_outputs=1, bias_x=False, bias_y=False):
    # x, y: (input_size x seq_len) x batch_size
    if bias_x:
        x = dy.concatenate([x, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    if bias_y:
        y = dy.concatenate([y, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    nx, ny = input_size + bias_x, input_size + bias_y
    # W: (num_outputs x ny) x nx
    lin = W * x
    if num_outputs > 1:
        lin = dy.reshape(lin, (ny, num_outputs * seq_len), batch_size=batch_size)
    blin = dy.transpose(y) * lin
    if num_outputs > 1:
        blin = dy.reshape(blin, (seq_len, num_outputs, seq_len), batch_size=batch_size)
    # seq_len_y x seq_len_x if num_outputs == 1,
    # seq_len_y x num_outputs x seq_len_x otherwise
    return blin
def test_gradient(self):
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    y = dy.inputTensor(self.v2)
    l = dy.dot_product(x, y)
    l.forward()
    l.backward(full=True)
    self.assertTrue(np.allclose(x.gradient(), self.v2),
                    msg="{}\n{}\n{}".format(l.value(), x.gradient(), self.v2))
def predict(self, input):
    # Renew the CG
    #dy.renew_cg()

    # Apply forward LSTM
    init_state_fwd = self.fwd_lstm_builder.initial_state()
    states = init_state_fwd.add_inputs(dy.inputTensor(input))
    fwd_vectors = [state.output() for state in states]

    # Apply reverse LSTM
    init_state_bwd = self.bwd_lstm_builder.initial_state()
    input_rev = input[::-1]
    states = init_state_bwd.add_inputs(dy.inputTensor(input_rev))
    bwd_vectors = [state.output() for state in states]
    bwd_vectors = bwd_vectors[::-1]

    # Concatenate the vectors
    lstm_vectors = [dy.concatenate([fwd_out, bwd_out])
                    for fwd_out, bwd_out in zip(fwd_vectors, bwd_vectors)]
    if debug:
        print "The number of bidirectional vectors: ", len(lstm_vectors), " which means I think this is the length of the source sentence"
    bidirectional_vectors = dy.concatenate_cols(lstm_vectors)
    if debug:
        print "The number of bidirectional vectors: ", len(bidirectional_vectors.value()), " which means I think this is the length of the source sentence"

    # Decoder
    w_out = dy.parameter(self.w_decoder)
    b_out = dy.parameter(self.b_decoder)
    if debug:
        print "First input to decoder: ", len(dy.concatenate([dy.vecInput(self.num_hidden * 2), self.M[0]]).value())
    state_decoder = self.decoder_lstm_builder.initial_state().add_input(
        dy.concatenate([dy.vecInput(self.num_hidden * 2), self.M[0]]))
    last_embeddings = self.M[0]
    if debug:
        print "Length of last embeddings: ", len(last_embeddings.value())
    output_frames = []
    w1 = dy.parameter(self.attention_w1)
    w1dt = w1 * bidirectional_vectors
    while True:
        attended_encoding = self.attend(w1dt, bidirectional_vectors, state_decoder)
        attention = [k[0].value() for k in attended_encoding]
        #if debug:
        #    print "Attention output is: ", len(attended_encoding.value())
        state_decoder = state_decoder.add_input(
            dy.concatenate([dy.inputTensor(attention), last_embeddings]))
        # Predict the frames now
        frame_predicted = dy.rectify(w_out * state_decoder.output() + b_out)
        last_embeddings = frame_predicted
        if debug:
            print "Length of updated last embeddings is : ", len(last_embeddings.value())
            print '\n'
        output_frames.append(frame_predicted.value())
        if last_embeddings == self.M[0] or len(output_frames) > 2 * len(input):
            break
    return output_frames
def __getitem__(self, key):
    if self.expr_list or self.expr_tensor:
        return super().__getitem__(key)
    else:
        if batchers.is_batched(self.lazy_data):
            return dy.inputTensor(
                [self.lazy_data[batch].get_array()[:, key] for batch in range(self.lazy_data.batch_size())],
                batched=True)
        else:
            return dy.inputTensor(self.lazy_data.get_array()[:, key], batched=False)
def _biaffine(self, x, W, y):
    x = dy.concatenate([x, dy.inputTensor(np.ones((1,), dtype=np.float32))])
    y = dy.concatenate([y, dy.inputTensor(np.ones((1,), dtype=np.float32))])
    nx, ny = self.input_dim + 1, self.input_dim + 1
    lin = dy.reshape(W * x, (ny, self.hidden_dim))
    blin = dy.transpose(dy.transpose(y) * lin)
    return blin
def __call__(self, query, options, gold, lengths, query_no):
    if len(options) == 1:
        return None, 0

    final = []
    if args.word_vectors:
        qvecs = [dy.lookup(self.pEmbedding, w) for w in query]
        qvec_max = dy.emax(qvecs)
        qvec_mean = dy.average(qvecs)
    for otext, features in options:
        if not args.no_features:
            inputs = dy.inputTensor(features)
        if args.word_vectors:
            ovecs = [dy.lookup(self.pEmbedding, w) for w in otext]
            ovec_max = dy.emax(ovecs)
            ovec_mean = dy.average(ovecs)
            if args.no_features:
                inputs = dy.concatenate([qvec_max, qvec_mean, ovec_max, ovec_mean])
            else:
                inputs = dy.concatenate([inputs, qvec_max, qvec_mean, ovec_max, ovec_mean])
        if args.drop > 0:
            inputs = dy.dropout(inputs, args.drop)
        h = inputs
        for pH, pB in zip(self.hidden, self.bias):
            h = dy.affine_transform([pB, pH, h])
            if args.nonlin == "linear":
                pass
            elif args.nonlin == "tanh":
                h = dy.tanh(h)
            elif args.nonlin == "cube":
                h = dy.cube(h)
            elif args.nonlin == "logistic":
                h = dy.logistic(h)
            elif args.nonlin == "relu":
                h = dy.rectify(h)
            elif args.nonlin == "elu":
                h = dy.elu(h)
            elif args.nonlin == "selu":
                h = dy.selu(h)
            elif args.nonlin == "softsign":
                h = dy.softsign(h)
            elif args.nonlin == "swish":
                h = dy.cmult(h, dy.logistic(h))
        final.append(dy.sum_dim(h, [0]))

    final = dy.concatenate(final)
    nll = -dy.log_softmax(final)
    dense_gold = []
    for i in range(len(options)):
        dense_gold.append(1.0 / len(gold) if i in gold else 0.0)
    answer = dy.inputTensor(dense_gold)
    loss = dy.transpose(answer) * nll
    predicted_link = np.argmax(final.npvalue())
    return loss, predicted_link
def test_sanity(self):
    for i in range(3):
        dy.renew_cg()
        nll = self.sm.neg_log_softmax(dy.inputTensor(np.arange(3)), 4, update=True)
        nll_const = self.sm.neg_log_softmax(dy.inputTensor(np.arange(3)), 5, update=False)
        nll = self.sm.neg_log_softmax(dy.inputTensor(np.arange(3)), 6, update=True)
        nll_const = self.sm.neg_log_softmax(dy.inputTensor(np.arange(3)), 7, update=False)
        nll.value()
        nll_const.value()
def __getitem__(self, key):
    if self.expr_list or self.expr_tensor:
        return super(LazyNumpyExpressionSequence, self).__getitem__(key)
    else:
        if xnmt.batcher.is_batched(self.lazy_data):
            return dy.inputTensor(
                [self.lazy_data[batch].get_array()[:, key] for batch in range(self.lazy_data.batch_size())],
                batched=True)
        else:
            return dy.inputTensor(self.lazy_data.get_array()[:, key], batched=False)
def generate(self, src, forced_trg_ids):
    assert not forced_trg_ids
    assert batchers.is_batched(src) and src.batch_size() == 1, "batched generation not fully implemented"
    src = src[0]
    # Generating outputs
    outputs = []
    event_trigger.start_sent(src)
    embeddings = self.src_embedder.embed_sent(src)
    encodings = self.encoder.transduce(embeddings)
    if self.mode in ["avg_mlp", "final_mlp"]:
        if self.generate_per_step:
            assert self.mode == "avg_mlp", "final_mlp not supported with generate_per_step=True"
            scores = [dy.logistic(self.output_layer.transform(enc_i)) for enc_i in encodings]
        else:
            if self.mode == "avg_mlp":
                encoding_fixed_size = dy.sum_dim(encodings.as_tensor(), [1]) * (1.0 / encodings.dim()[0][1])
            elif self.mode == "final_mlp":
                encoding_fixed_size = self.encoder.get_final_states()[-1].main_expr()
            scores = dy.logistic(self.output_layer.transform(encoding_fixed_size))
    elif self.mode == "lin_sum_sig":
        enc_lin = []
        for step_i, enc_i in enumerate(encodings):
            step_linear = self.output_layer.transform(enc_i)
            if encodings.mask and np.sum(encodings.mask.np_arr[:, step_i]) > 0:
                step_linear = dy.cmult(step_linear,
                                       dy.inputTensor(1.0 - encodings.mask.np_arr[:, step_i], batched=True))
            enc_lin.append(step_linear)
        if self.generate_per_step:
            scores = [dy.logistic(enc_i) for enc_i in enc_lin]
        else:
            if encodings.mask:
                encoding_fixed_size = dy.cdiv(
                    dy.esum(enc_lin),
                    dy.inputTensor(np.sum(1.0 - encodings.mask.np_arr, axis=1), batched=True))
            else:
                encoding_fixed_size = dy.esum(enc_lin) / encodings.dim()[0][1]
            scores = dy.logistic(encoding_fixed_size)
    else:
        raise ValueError(f"unknown mode '{self.mode}'")
    if self.generate_per_step:
        output_actions = [np.argmax(score_i.npvalue()) for score_i in scores]
        score = np.sum([np.max(score_i.npvalue()) for score_i in scores])
        outputs.append(sent.SimpleSentence(words=output_actions,
                                           idx=src.idx,
                                           vocab=getattr(self.trg_reader, "vocab", None),
                                           score=score,
                                           output_procs=self.trg_reader.output_procs))
    else:
        scores_arr = scores.npvalue()
        output_actions = list(np.nonzero(scores_arr > 0.5)[0])
        score = np.sum(scores_arr[scores_arr > 0.5])
        outputs.append(sent.SimpleSentence(words=output_actions,
                                           idx=src.idx,
                                           vocab=getattr(self.trg_reader, "vocab", None),
                                           score=score,
                                           output_procs=self.trg_reader.output_procs))
    return outputs
def apply_linear_embed(self, sent1, sent2):
    """
    :param sent1: np matrix.
    :param sent2: np matrix.
    :return: each sentence after projection to hid_dim.
    """
    p_W = dy.parameter(self.linear_embed)
    sent1 = dy.inputTensor(sent1) * p_W
    sent2 = dy.inputTensor(sent2) * p_W
    return sent1, sent2
def add_to_tensor_expr(self, tensor_expr: dy.Expression,
                       multiplicator: Optional[numbers.Real] = None) -> dy.Expression:
    # TODO: might cache these expressions to save memory
    if np.count_nonzero(self.np_arr) == 0:
        return tensor_expr
    else:
        if multiplicator is not None:
            mask_expr = dy.inputTensor(np.expand_dims(self.np_arr.transpose(), axis=1) * multiplicator,
                                       batched=True)
        else:
            mask_expr = dy.inputTensor(np.expand_dims(self.np_arr.transpose(), axis=1), batched=True)
        return tensor_expr + mask_expr
def calc_loss(self,
              model: 'model_base.ConditionedModel',
              src: Union[sent.Sentence, 'batchers.Batch'],
              trg: Union[sent.Sentence, 'batchers.Batch']) -> losses.FactoredLossExpr:
    batch_size = trg.batch_size()
    uniques = [set() for _ in range(batch_size)]
    deltas = []
    probs = []
    sign = -1 if self.inv_eval else 1
    search_outputs = model.generate_search_output(src, self.search_strategy)
    for search_output in search_outputs:
        assert len(search_output.word_ids) == 1
        assert search_output.word_ids[0].shape == (len(search_output.state),)
        logprob = []
        for word, state in zip(search_output.word_ids[0], search_output.state):
            lpdist = model.decoder.scorer.calc_log_probs(state.as_vector())
            lp = dy.pick(lpdist, word)
            logprob.append(lp)
        sample = search_output.word_ids
        logprob = dy.esum(logprob) * self.alpha
        # Calculate the evaluation score
        eval_score = np.zeros(batch_size, dtype=float)
        mask = np.zeros(batch_size, dtype=float)
        for j in range(batch_size):
            ref_j = self.remove_eos(trg[j].words)
            hyp_j = self.remove_eos(sample[j].tolist())
            if self.unique_sample:
                hash_val = hash(tuple(hyp_j))
                if len(hyp_j) == 0 or hash_val in uniques[j]:
                    mask[j] = -1e20  # represents negative infinity
                    continue
                else:
                    uniques[j].add(hash_val)
            # Calc evaluation score
            eval_score[j] = self.evaluation_metric.evaluate_one_sent(ref_j, hyp_j) * sign
        # Appending the delta and logprob of this sample
        prob = logprob + dy.inputTensor(mask, batched=True)
        deltas.append(dy.inputTensor(eval_score, batched=True))
        probs.append(prob)
    sample_prob = dy.softmax(dy.concatenate(probs))
    deltas = dy.concatenate(deltas)
    risk = dy.sum_elems(dy.cmult(sample_prob, deltas))
    ### Debug
    #print(sample_prob.npvalue().transpose()[0])
    #print(deltas.npvalue().transpose()[0])
    #print("----------------------")
    ### End debug
    return losses.FactoredLossExpr({"risk": risk})
def on_calc_additional_loss(self, translator_loss):
    if not self.learn_segmentation or self.segment_decisions is None:
        return None
    reward = -translator_loss["mle"]
    if not self.log_reward:
        reward = dy.exp(reward)
    reward = dy.nobackprop(reward)

    # Make sure that reward is not a scalar, but rather has one value per batch item
    assert reward.dim()[1] == len(self.src_sent)
    # Mask
    enc_mask = self.enc_mask.get_active_one_mask().transpose() if self.enc_mask is not None else None
    # Compose the loss
    ret = LossBuilder()
    ## Length prior
    alpha = self.length_prior_alpha.value() if self.length_prior_alpha is not None else 0
    if alpha > 0:
        reward += self.segment_length_prior * alpha
    # Reward z-score normalization
    if self.z_normalization:
        reward = dy.cdiv(reward - dy.mean_batches(reward), dy.std_batches(reward) + EPS)
    ## Baseline Loss
    if self.use_baseline:
        baseline_loss = []
        for i, baseline in enumerate(self.bs):
            loss = dy.squared_distance(reward, baseline)
            if enc_mask is not None:
                loss = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), loss)
            baseline_loss.append(loss)
        ret.add_loss("Baseline", dy.esum(baseline_loss))
    if self.print_sample:
        print(dy.exp(self.segment_logsoftmaxes[i]).npvalue().transpose()[0])
    ## Reinforce Loss
    lmbd = self.lmbd.value()
    if lmbd > 0.0:
        reinforce_loss = []
        # Calculating the loss of the baseline and reinforce
        for i in range(len(self.segment_decisions)):
            ll = dy.pick_batch(self.segment_logsoftmaxes[i], self.segment_decisions[i])
            if self.use_baseline:
                r_i = reward - dy.nobackprop(self.bs[i])
            else:
                r_i = reward
            if enc_mask is not None:
                ll = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), ll)
            reinforce_loss.append(r_i * -ll)
        loss = dy.esum(reinforce_loss) * lmbd
        ret.add_loss("Reinforce", loss)
    if self.confidence_penalty:
        ls_loss = self.confidence_penalty(self.segment_logsoftmaxes, enc_mask)
        ret.add_loss("Confidence Penalty", ls_loss)
    # Total Loss
    return ret
def forward(self, s1, s2, label=None):
    eL = dy.parameter(self.embeddingLinear)
    s1 = dy.inputTensor(s1) * eL
    s2 = dy.inputTensor(s2) * eL

    # F step
    Lf1 = dy.parameter(self.mlpF1)
    Fs1 = dy.rectify(dy.dropout(s1, 0.2) * Lf1)
    Fs2 = dy.rectify(dy.dropout(s2, 0.2) * Lf1)
    Lf2 = dy.parameter(self.mlpF2)
    Fs1 = dy.rectify(dy.dropout(Fs1, 0.2) * Lf2)
    Fs2 = dy.rectify(dy.dropout(Fs2, 0.2) * Lf2)

    # Attention scoring
    score1 = Fs1 * dy.transpose(Fs2)
    prob1 = dy.softmax(score1)
    score2 = dy.transpose(score1)
    prob2 = dy.softmax(score2)

    # Align pairs using attention
    s1Pairs = dy.concatenate_cols([s1, prob1 * s2])
    s2Pairs = dy.concatenate_cols([s2, prob2 * s1])

    # G step
    Lg1 = dy.parameter(self.mlpG1)
    Gs1 = dy.rectify(dy.dropout(s1Pairs, 0.2) * Lg1)
    Gs2 = dy.rectify(dy.dropout(s2Pairs, 0.2) * Lg1)
    Lg2 = dy.parameter(self.mlpG2)
    Gs1 = dy.rectify(dy.dropout(Gs1, 0.2) * Lg2)
    Gs2 = dy.rectify(dy.dropout(Gs2, 0.2) * Lg2)

    # Sum
    Ss1 = dy.sum_dim(Gs1, [0])
    Ss2 = dy.sum_dim(Gs2, [0])
    concatS12 = dy.transpose(dy.concatenate([Ss1, Ss2]))

    # H step
    Lh1 = dy.parameter(self.mlpH1)
    Hs = dy.rectify(dy.dropout(concatS12, 0.2) * Lh1)
    Lh2 = dy.parameter(self.mlpH2)
    Hs = dy.rectify(dy.dropout(Hs, 0.2) * Lh2)

    # Final layer
    final_layer = dy.parameter(self.final_layer)
    final = dy.transpose(Hs * final_layer)

    # Label can be 0...
    if label is not None:
        return dy.pickneglogsoftmax(final, label)
    else:
        out = dy.softmax(final)
        return np.argmax(out.npvalue())
def __getitem__(self, key):
    if self.expr_list or self.expr_tensor:
        return super(LazyNumpyExpressionSequence, self).__getitem__(key)
    else:
        if Batcher.is_batched(self.lazy_data):
            return dy.inputTensor(
                [self.lazy_data[batch][key] for batch in range(len(self.lazy_data))],
                batched=True)
        else:
            return dy.inputTensor(self.lazy_data[key], batched=False)
def sequence_mask(lengths, max_len=-1):
    """Build a sequence mask for dynet.

    This is a bit weird: most of the time we lay dynet tensors out as (H, T), so it
    would seem like we would want the mask to be ((1, T), B), but the only place where
    we do masking right now is in attention, and there it makes sense to have it
    shaped as ((T, 1), B).
    """
    mask = seq_mask(lengths, max_len)
    mask = np.expand_dims(np.transpose(mask), 1)
    inv_mask = (mask == 0).astype(np.uint8)
    return dy.inputTensor(mask, batched=True), dy.inputTensor(inv_mask, batched=True)
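# A minimal, hedged sketch (toy shapes, not from the original repo) of how masks
# shaped ((T, 1), B) like the ones built above are typically applied to attention
# scores: valid positions are kept via cmult, padded positions are pushed toward
# -inf via the inverse mask before a softmax.
import numpy as np
import dynet as dy

dy.renew_cg()
T, B = 5, 2
lengths = [3, 5]
np_mask = np.zeros((T, 1, B), dtype=np.float32)
for b, l in enumerate(lengths):
    np_mask[:l, 0, b] = 1.0                            # ones at valid timesteps
mask = dy.inputTensor(np_mask, batched=True)           # ((T, 1), B)
inv_mask = dy.inputTensor(1.0 - np_mask, batched=True)

scores = dy.inputTensor(np.random.rand(T, 1, B), batched=True)
masked_scores = dy.cmult(scores, mask) + inv_mask * -1e9
attn_weights = dy.softmax(masked_scores)               # softmax over the T axis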
def _emb_mask_generator(seq_len, batch_size):
    ret = []
    for i in xrange(seq_len):
        word_mask = np.random.binomial(1, 1. - dropout_dim, batch_size).astype(np.float32)
        tag_mask = np.random.binomial(1, 1. - dropout_dim, batch_size).astype(np.float32)
        scale = 3. / (2. * word_mask + tag_mask + 1e-12)
        word_mask *= scale
        tag_mask *= scale
        word_mask = dy.inputTensor(word_mask, batched=True)
        tag_mask = dy.inputTensor(tag_mask, batched=True)
        ret.append((word_mask, tag_mask))
    return ret
def test_layer_norm(self):
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    g = dy.inputTensor(self.v2)
    b = dy.inputTensor(self.v3)
    y = dy.layer_norm(x, g, b)
    l = dy.sum_elems(y)
    l_value = l.scalar_value()
    l.backward()
    y_np_value = self.v2 / self.v1.std() * (self.v1 - self.v1.mean()) + self.v3
    self.assertTrue(np.allclose(y.npvalue(), y_np_value))
def test_layer_norm(self):
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    g = dy.inputTensor(self.v2)
    b = dy.inputTensor(self.v3)
    y = dy.layer_norm(x, g, b)
    loss = dy.sum_elems(y)
    loss.backward()
    centered_v1 = self.v1 - self.v1.mean()
    y_np_value = self.v2 / self.v1.std() * centered_v1 + self.v3
    self.assertTrue(np.allclose(y.npvalue(), y_np_value))
def test_cross_entropy_loss_basic():
    """
    Some simple tests of cross_entropy_loss to get you started.
    Warning: these are not exhaustive.
    """
    y = np.array([[0, 1], [1, 0], [1, 0]])
    yhat = np.array([[.5, .5], [.5, .5], [.5, .5]])
    test1 = cross_entropy_loss(dy.inputTensor(y), dy.inputTensor(yhat))
    expected = -3 * np.log(.5)
    test_all_close("Cross-entropy test 1", np.array(test1.value()), expected)
    print "Basic (non-exhaustive) cross-entropy tests pass"
def dynet_bilinear(x, W, y, input_size, seq_len, batch_size, num_outputs=1, bias_x=False, bias_y=False):
    """
    Do xWy.

    :param x: (input_size x seq_len) x batch_size
    :param W: (num_outputs x ny) x nx, where nx = input_size + bias_x and ny = input_size + bias_y
    :param y: (input_size x seq_len) x batch_size
    :param input_size:
    :param seq_len:
    :param batch_size:
    :param num_outputs:
    :param bias_x: whether to append a constant 1 row to x
    :param bias_y: whether to append a constant 1 row to y
    :return: [seq_len_y x seq_len_x if num_outputs == 1
              else seq_len_y x num_outputs x seq_len_x] x batch_size
    """
    import dynet as dy
    if isinstance(x, np.ndarray):
        x = dy.inputTensor(x, batched=True)
    if isinstance(y, np.ndarray):
        y = dy.inputTensor(y, batched=True)
    if isinstance(W, np.ndarray):
        W = dy.inputTensor(W)
    if bias_x:
        x = dy.concatenate([x, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    if bias_y:
        y = dy.concatenate([y, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    nx, ny = input_size + bias_x, input_size + bias_y
    # W: (num_outputs x ny) x nx
    lin = W * x
    if num_outputs > 1:
        lin = dy.reshape(lin, (ny, num_outputs * seq_len), batch_size=batch_size)
    blin = dy.transpose(y) * lin
    if num_outputs > 1:
        blin = dy.reshape(blin, (seq_len, num_outputs, seq_len), batch_size=batch_size)
    return blin
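# A minimal usage sketch for dynet_bilinear (toy shapes assumed, not from the
# original repo): pairwise token scores x^T W y for a batch of two length-5 sequences.
import numpy as np
import dynet as dy

dy.renew_cg()
input_size, seq_len, batch_size = 4, 5, 2
x = np.random.rand(input_size, seq_len, batch_size).astype(np.float32)
y = np.random.rand(input_size, seq_len, batch_size).astype(np.float32)
W = np.random.rand(input_size, input_size).astype(np.float32)
scores = dynet_bilinear(x, W, y, input_size, seq_len, batch_size)
print(scores.dim())  # ((seq_len, seq_len), batch_size)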
def decoding(self, src_encodings):
    ''' Viterbi decoding for a single sequence. '''
    W_src2tag_readout = self.W_src2tag_readout.expr(update=True)
    b_src2tag_readout = self.b_src2tag_readout.expr(update=True)
    W_score_tag = self.W_scores_readout2tag.expr(update=True)
    b_score_tag = self.b_scores_readout2tag.expr(update=True)

    tag_embs = [
        dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding]))
        for src_encoding in src_encodings
    ]
    tag_scores = [
        dy.affine_transform([b_score_tag, W_score_tag, tag_emb])
        for tag_emb in tag_embs
    ]

    back_trace_tags = []
    np_init_alpha = np.ones(self.tag_size) * -2000.0
    np_init_alpha[self.start_id] = 0.0
    max_tm1 = dy.inputTensor(np_init_alpha)
    transpose_transition_score = self.transition_matrix  # .expr(update=True)  # (to, from)

    for i, tag_score in enumerate(tag_scores):
        max_tm1 = dy.concatenate_cols([max_tm1] * self.tag_size)
        max_t = max_tm1 + transpose_transition_score
        if i != 0:
            eval_score = max_t.npvalue()[:-2, :]
        else:
            eval_score = max_t.npvalue()
        best_tag = np.argmax(eval_score, axis=0)
        back_trace_tags.append(best_tag)
        max_tm1 = dy.inputTensor(eval_score[best_tag, range(self.tag_size)]) + tag_score

    terminal_max_T = max_tm1 + self.transition_matrix[self.end_id]
    eval_terminal = terminal_max_T.npvalue()[:-2]
    best_tag = np.argmax(eval_terminal, axis=0)
    best_path_score = eval_terminal[best_tag]

    best_path = [best_tag]
    for btpoint in reversed(back_trace_tags):
        best_tag = btpoint[best_tag]
        best_path.append(best_tag)
    start = best_path.pop()
    assert start == self.start_id
    best_path.reverse()
    return best_path_score, best_path
def backward(self, word_vectors, label):
    dy.renew_cg()
    x = dy.inputTensor(word_vectors)
    y = dy.inputTensor(label)
    logit = self.build_graph(x)
    # q is the weight applied to positive samples; for the formula see
    # https://www.tensorflow.org/api_docs/python/tf/nn/weighted_cross_entropy_with_logits
    q = 15
    l = 1 + (q - 1) * y
    loss = (1 - y) * logit + l * (dy.log(1 + dy.exp(-dy.abs(logit))) + dy.rectify(-logit))
    res = loss.value()
    loss.backward()
    return res
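# A small, hedged check (not from the original repo) that the numerically stable
# expression used above matches the textbook weighted cross-entropy
# q*y*(-log sigmoid(x)) + (1-y)*(-log(1-sigmoid(x))) for a scalar logit x,
# where q is the positive-class weight.
import numpy as np
import dynet as dy

dy.renew_cg()
q, y_val, x_val = 15.0, 1.0, 0.3
x = dy.scalarInput(x_val)
y = dy.scalarInput(y_val)
l = 1 + (q - 1) * y
stable = (1 - y) * x + l * (dy.log(1 + dy.exp(-dy.abs(x))) + dy.rectify(-x))
sig = 1.0 / (1.0 + np.exp(-x_val))
direct = -q * y_val * np.log(sig) - (1 - y_val) * np.log(1 - sig)
assert np.isclose(stable.value(), direct)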
def weights_to_argmax(self):
    shape, batch = self.weights.dim()
    if shape == (1,):
        return
    m_is = numpy.argmax(self.weights.npvalue(), 0)
    if batch == 1:
        self.weights = dynet.inputTensor(
            [-99999 if i != m_is else 99999 for i in range(shape[0])],
            device=self.device)
    else:
        self.weights = dynet.inputTensor(
            [[-99999 if i != m_i else 99999 for m_i in m_is] for i in range(shape[0])],
            batched=True, device=self.device)
    if self.next_layer is not None:
        self.next_layer.weights_to_argmax()
def test_inputTensor_batched_list(self):
    for i in range(4):
        dy.renew_cg()
        input_tensor = self.input_vals.reshape(self.shapes[i])
        xb = dy.inputTensor([np.asarray(x).transpose() for x in input_tensor.transpose()])
        self.assertEqual(xb.dim()[0], (self.shapes[i][:-1] if i > 0 else (1,)),
                         msg="Dimension mismatch")
        self.assertEqual(xb.dim()[1], self.shapes[i][-1],
                         msg="Dimension mismatch")
        self.assertTrue(np.allclose(xb.npvalue(), input_tensor),
                        msg="Expression value different from initial value")
        self.assertEqual(dy.sum_batches(dy.squared_norm(xb)).scalar_value(),
                         self.squared_norm, msg="Value mismatch")
def predict_label(self, frames):
    # Renew the computation graph
    dy.renew_cg()

    # Initialize LSTM
    init_state_src = self.lstm_builder.initial_state()

    # Instantiate the params
    W_mean = dy.parameter(self.W_mean_p)
    V_mean = dy.parameter(self.V_mean_p)
    b_mean = dy.parameter(self.b_mean_p)
    W_var = dy.parameter(self.W_var_p)
    V_var = dy.parameter(self.V_var_p)
    b_var = dy.parameter(self.b_var_p)

    input_frames = dy.inputTensor(frames)
    src_output = init_state_src.add_inputs([frame for frame in input_frames])[-1].output()

    # Get the mean and diagonal log covariance from the encoder
    mu = self.mlp(src_output, W_mean, V_mean, b_mean)
    log_var = self.mlp(src_output, W_var, V_var, b_var)

    # Reparameterize
    z = self.reparameterize(mu, log_var)

    W_sm = dy.parameter(self.W_sm_p)
    b_sm = dy.parameter(self.b_sm_p)
    pred = dy.affine_transform([b_sm, W_sm, z])
    return dy.softmax(pred)
def _policy_shape_probs(self, prob_dist):
    # TODO: this is specific to Alchemy
    num_actions = len(self.output_action_vocabulary) - 1
    num_locations = len(self.output_location_vocabulary) - 1
    num_arguments = len(self.output_argument_vocabulary) - 1
    new_probdist = dy.zeros(prob_dist.dim()[0])
    zeroes = numpy.zeros(num_locations * num_arguments)
    ones = numpy.ones(num_locations * num_arguments)
    eos_prob = prob_dist[self._all_output_vocabulary.lookup_index((EOS, NO_ARG, NO_ARG))]
    action_idx = 0
    for action in self.output_action_vocabulary:
        masks = numpy.concatenate(
            (numpy.repeat(zeroes, action_idx),
             ones,
             numpy.repeat(zeroes, num_actions - action_idx - 1)))
        actions_masks = dy.reshape(dy.inputTensor(masks),
                                   (num_actions * num_locations * num_arguments, 1))
        if action == EOS:
            new_probdist += dy.cmult(actions_masks, prob_dist) / 2.
        elif action == "push":
            new_probdist += dy.cmult(actions_masks, prob_dist) + eos_prob / (2. * 56.)
        elif action == "pop":
            new_probdist += dy.cmult(actions_masks, prob_dist)

    if self.args.syntax_restricted:
        return dy.exp(dy.log_softmax(dy.cmult(new_probdist, prob_dist),
                                     restrict=self._valid_action_indices))
    else:
        return dy.softmax(dy.cmult(new_probdist, prob_dist))
def predict_label(self, embedding):
    # Renew the computation graph
    dy.renew_cg()

    # Instantiate the params
    W_mean = dy.parameter(self.W_mean_p)
    V_mean = dy.parameter(self.V_mean_p)
    b_mean = dy.parameter(self.b_mean_p)
    W_var = dy.parameter(self.W_var_p)
    V_var = dy.parameter(self.V_var_p)
    b_var = dy.parameter(self.b_var_p)

    input_embedding = dy.inputTensor(embedding)

    # Get the DNN encoding
    src_output = self.dnn.predict(input_embedding)

    # Get the mean and diagonal log covariance from the encoder
    mu = self.mlp(src_output, W_mean, V_mean, b_mean)
    log_var = self.mlp(src_output, W_var, V_var, b_var)

    # Reparameterize
    z = self.reparameterize(mu, log_var)

    W_sm = dy.parameter(self.W_sm_p)
    b_sm = dy.parameter(self.b_sm_p)
    pred = dy.affine_transform([b_sm, W_sm, z])
    return dy.softmax(pred)
def train(epoch):
    model.training = True
    train_loss = 0
    train_loader = generate_batch_loader(train_data, batch_size=batch_size)
    for batch_idx, data in enumerate(train_loader):
        # Dynamic construction of graph
        dy.renew_cg()
        x = dy.inputTensor(data.reshape(-1, 784).T)
        recon_x, mu, logvar = model.forward(x)
        loss = loss_function(recon_x, x, mu, logvar)
        # Forward
        loss_value = loss.value()
        train_loss += loss_value
        # Backward
        loss.backward()
        optimizer.update()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_data),
                100. * batch_idx / (len(train_data) / batch_size),
                loss_value / len(data)))
    print('====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, train_loss / len(train_data)))
def test(epoch):
    model.training = False
    test_loss = 0
    test_loader = generate_batch_loader(test_data, batch_size=batch_size)
    for i, data in enumerate(test_loader):
        # Dynamic construction of graph
        dy.renew_cg()
        x = dy.inputTensor(data.reshape(-1, 784).T)
        recon_x, mu, logvar = model.forward(x)
        loss = loss_function(recon_x, x, mu, logvar)
        # Forward
        loss_value = loss.value()
        test_loss += loss_value
        if i == 0:
            n = min(data.shape[0], 8)
            comparison = np.concatenate([
                data[:n],
                recon_x.npvalue().T.reshape(batch_size, 1, 28, 28)[:n]
            ])
            save_image(comparison, 'results/reconstruction_' + str(epoch) + '.png', nrow=n)
    test_loss /= len(test_data)
    print('====> Test set loss: {:.4f}'.format(test_loss))
def predict(self, batch_dict):
    dy.renew_cg()
    inputs = self.make_input(batch_dict)
    lengths = inputs['lengths']
    unaries = self.compute_unaries(inputs)
    if self.do_crf is True:
        best_path, path_score = self.crf.decode(unaries)
    elif self.constraint is not None:
        best_path, path_score = viterbi(
            unaries,
            dy.log_softmax(dy.inputTensor(self.constraint[1] * -1e4)),
            Offsets.GO, Offsets.EOS,
            norm=True
        )
    else:
        best_path = [np.argmax(x.npvalue(), axis=0) for x in unaries]
    # TODO: RN using autobatching, so none of this is really useful
    # If we want to support batching in this function we have to either loop over the batch
    # or we can just simplify all this code here
    best_path = np.stack(best_path).reshape(-1, 1)  # (T, B)
    best_path = best_path.transpose(1, 0)
    results = []
    for b in range(best_path.shape[0]):
        sentence = best_path[b, :lengths[b]]
        results.append(sentence)
    return results
def subsequent_mask(T):
    """Build a mask to hide the future from self attention.

    Output: ((T, T, 1), 1) to broadcast over both the heads and the batch.

    Returns:
        (dy.Expression, dy.Expression)
        - The first mask has ones in valid positions and zeros at invalid. This is
          used to zero out the future using a `dy.cmult`.
        - The second mask has 1 at invalid positions and zeros at valid. This can
          be used to fill invalid positions with negative numbers via addition.
    """
    mask = np.triu(np.ones((T, T))).astype(np.uint8)
    mask = np.expand_dims(mask, -1)
    inv_mask = (mask == 0).astype(np.uint8)
    return dy.inputTensor(mask), dy.inputTensor(inv_mask)
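# A minimal, hedged sketch (not from the original repo) of how the two masks
# returned by subsequent_mask are typically combined with raw self-attention
# scores: valid positions kept via cmult, future positions pushed toward -inf.
import numpy as np
import dynet as dy

dy.renew_cg()
T = 4
scores = dy.inputTensor(np.random.rand(T, T, 1))      # toy scores shaped ((T, T, 1), 1)
mask, inv_mask = subsequent_mask(T)
masked_scores = dy.cmult(scores, mask) + inv_mask * -1e9
print(masked_scores.dim())                             # ((T, T, 1), 1)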
def dyagonalize(col):
    """A convoluted way to make a dynet vector into a dynet matrix with that vector on the diagonal.
    God, I hope there's a better way.

    :param col: column vector in dynet format
    """
    col_dim = col.dim()[0][0]
    nump_eye = np.eye(col_dim)
    return dy.cmult(col, dy.inputTensor(nump_eye))
def learn(self, batch_size):
    if self.prioritized:
        if not self.memory.is_full():
            return -np.inf
        indices, exps, weights = self.memory.sample(batch_size, self.beta)
    else:
        exps = self.memory.sample(batch_size)
    obss, actions, rewards, obs_nexts, dones = self._process(exps)

    dy.renew_cg()
    target_network = self.target_network if self.use_double_dqn else self.network
    if self.dueling:
        target_values, v = target_network(obs_nexts, batched=True)
        target_values = target_values.npvalue() + v.npvalue()
    else:
        target_values = target_network(obs_nexts, batched=True)
        target_values = target_values.npvalue()
    target_values = np.max(target_values, axis=0)
    target_values = rewards + self.reward_decay * (target_values * (1 - dones))

    dy.renew_cg()
    if self.dueling:
        all_values_expr, v = self.network(obss, batched=True)
    else:
        all_values_expr = self.network(obss, batched=True)
    picked_values = dy.pick_batch(all_values_expr, actions)
    diff = (picked_values + v if self.dueling else picked_values) - dy.inputTensor(target_values, batched=True)
    if self.prioritized:
        self.memory.update(indices, np.transpose(np.abs(diff.npvalue())))
    losses = dy.pow(diff, dy.constant(1, 2))
    if self.prioritized:
        losses = dy.cmult(losses, dy.inputTensor(weights, batched=True))
    loss = dy.sum_batches(losses)
    loss_value = loss.npvalue()
    loss.backward()
    self.trainer.update()

    self.epsilon = max(self.epsilon - self.epsilon_decrease, self.epsilon_lower)
    if self.prioritized:
        self.beta = min(self.beta + self.beta_increase, 1.)
    self.learn_step += 1
    if self.use_double_dqn and self.learn_step % self.n_replace_target == 0:
        self.target_network.update(self.network)
    return loss_value
def test_inputTensor_not_batched(self):
    for i in range(4):
        dy.renew_cg()
        input_tensor = self.input_vals.reshape(self.shapes[i])
        x = dy.inputTensor(input_tensor)
        self.assertEqual(x.dim()[0], self.shapes[i], msg="Dimension mismatch")
        self.assertEqual(x.dim()[1], 1, msg="Dimension mismatch")
        self.assertTrue(np.allclose(x.npvalue(), input_tensor),
                        msg="Expression value different from initial value")
        self.assertEqual(dy.squared_norm(x).scalar_value(), self.squared_norm,
                         msg="Value mismatch")
def __call__(self, obs, batched=False):
    out = obs if isinstance(obs, dy.Expression) else dy.inputTensor(obs, batched=batched)
    for i in range(self.n_layers):
        b, W = dy.parameter(self.bs[i]), dy.parameter(self.Ws[i])
        out = dy.affine_transform([b, W, out])
        if self.layer_norm and i != self.n_layers - 1:
            out = dy.layer_norm(out, self.ln_gs[i], self.ln_bs[i])
        if self.specified_activation:
            if self.activation[i] is not None:
                out = self.activation[i](out)
        else:
            out = self.activation(out)
    return out
def test_grad(self):
    # Add parameter
    p = self.m.parameters_from_numpy(np.arange(5))
    # Create cg
    dy.renew_cg()
    # Input tensor
    x = dy.inputTensor(np.arange(5).reshape((1, 5)))
    # Compute dot product
    res = x * p
    # Run forward and backward pass
    res.forward()
    res.backward()
    # The gradient of p should equal the value of x
    self.assertTrue(np.allclose(p.grad_as_array(), x.npvalue()),
                    msg="Gradient is wrong")
def test_slicing(self):
    dy.renew_cg()
    data = np.random.random((10, 10, 10))
    self.assertTrue(np.allclose(dy.inputTensor(data)[:1, :2, :3].npvalue(), data[:1, :2, :3]))
    self.assertTrue(np.allclose(dy.inputTensor(data, batched=True)[:1, :2, :3].npvalue(), data[:1, :2, :3]))
    self.assertTrue(np.allclose(dy.inputTensor(data)[:, :, :3].npvalue(), data[:, :, :3]))
    self.assertTrue(np.allclose(dy.inputTensor(data)[3:, :, :].npvalue(), data[3:, :, :]))
    self.assertTrue(np.allclose(dy.inputTensor(data)[:, :, ::1].npvalue(), data[:, :, ::1]))
    self.assertTrue(np.allclose(dy.inputTensor(data)[:, :, ::3].npvalue(), data[:, :, ::3]))
    self.assertTrue(np.allclose(dy.inputTensor(data)[3:5, 1:3, 1:].npvalue(), data[3:5, 1:3, 1:]))
def rnn_forward_with_state(rnn, input_, lengths=None, state=None, batched=True, backward=False):
    """Return the output of the final layers and the final state of the RNN.

    :param rnn: dy.RNNBuilder
    :param input_: List[dy.Expression]
    :param lengths: List[int]
    :param state: List[np.ndarray] The previous state (used in TBPTT)
    :param batched: bool Is the state batched?
    :param backward: bool Is this a backward rnn in a bRNN?

    Returns:
        List[dy.Expression] (Seq_len): The outputs
        List[dy.Expression] (2 * layers if lstm): The state
    """
    if state is not None:
        state = [dy.inputTensor(s, batched) for s in state]
    lstm_state = rnn.initial_state(state)
    if backward:
        states = lstm_state.add_inputs(reversed(input_))
        outputs = list(reversed([s.h()[-1] for s in states]))
        # When going backwards (we pad right) the final state of the rnn
        # is always the last one.
        final_state = states[-1].s()
        return outputs, final_state
    states = lstm_state.add_inputs(input_)
    outputs = [s.h()[-1] for s in states]
    if lengths is None:
        if backward:
            outputs = list(reversed(outputs))
        return outputs, states[-1].s()
    final_states = [states[l - 1].s() for l in lengths]
    final_state_by_batch = []
    for i, state in enumerate(final_states):
        batch_state = [dy.pick_batch_elem(s, i) for s in state]
        final_state_by_batch.append(batch_state)
    final_state = []
    for i in range(len(final_state_by_batch[0])):
        col = dy.concatenate_to_batch([final_state_by_batch[j][i] for j in range(len(final_state_by_batch))])
        final_state.append(col)
    if backward:
        outputs = list(reversed(outputs))
    return outputs, final_state
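# A minimal, hedged usage sketch (dimensions assumed, not from the original repo):
# run a 1-layer LSTM over one chunk, carry its final state into the next chunk
# as numpy arrays, the way truncated BPTT would.
import numpy as np
import dynet as dy

pc = dy.ParameterCollection()
lstm = dy.VanillaLSTMBuilder(1, 4, 8, pc)   # layers, input dim, hidden dim

dy.renew_cg()
chunk1 = [dy.inputTensor(np.random.rand(4)) for _ in range(5)]
outputs, state = rnn_forward_with_state(lstm, chunk1, batched=False)
np_state = [s.npvalue() for s in state]     # detach the state from this graph

dy.renew_cg()
chunk2 = [dy.inputTensor(np.random.rand(4)) for _ in range(5)]
outputs2, state2 = rnn_forward_with_state(lstm, chunk2, state=np_state, batched=False)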
def __call__(self, inputs, dropout=False):
    x = dy.inputTensor(inputs)
    conv1 = dy.parameter(self.pConv1)
    b1 = dy.parameter(self.pB1)
    x = dy.conv2d_bias(x, conv1, b1, [1, 1], is_valid=False)
    x = dy.rectify(dy.maxpooling2d(x, [2, 2], [2, 2]))

    conv2 = dy.parameter(self.pConv2)
    b2 = dy.parameter(self.pB2)
    x = dy.conv2d_bias(x, conv2, b2, [1, 1], is_valid=False)
    x = dy.rectify(dy.maxpooling2d(x, [2, 2], [2, 2]))

    x = dy.reshape(x, (7 * 7 * 64, 1))
    w1 = dy.parameter(self.pW1)
    b3 = dy.parameter(self.pB3)
    h = dy.rectify(w1 * x + b3)
    if dropout:
        h = dy.dropout(h, DROPOUT_RATE)
    w2 = dy.parameter(self.pW2)
    output = w2 * h
    # output = dy.softmax(w2 * h)
    return output
def test_value_sanity(self):
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    dy.renew_cg()
    self.assertRaises(RuntimeError, npvalue_callable, x)
def test_value(self):
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    self.assertTrue(np.allclose(x.npvalue(), self.v1))
import time

tictocs = []
for epoch in range(1, args.epochs + 1):
    tic = time.time()
    train(epoch)
    test(epoch)
    sample = dy.inputTensor(np.random.randn(20, 64))
    sample = model.decode(sample)
    save_image(sample.npvalue().T.reshape(64, 1, 28, 28),
               'results/sample_' + str(epoch) + '.png')
    toc = time.time()
    tictocs.append(toc - tic)

print('############\n\n')
print('Total Time Cost:', np.sum(tictocs))
print('Epoch Time Cost', np.average(tictocs), '+-', np.std(tictocs) / np.sqrt(len(tictocs)))
print('\n\n############')
# It can be improved by following the speed tricks covered in class:
# 1) Don't repeat operations.
# 2) Minimize the number of operations.
# 3) Minimize the number of CPU-GPU memory copies, make them earlier.

# Create the model
model = dy.ParameterCollection()
trainer = dy.SimpleSGDTrainer(model)
W = model.add_parameters((100, 100))

# Create the "training data"
x_vecs = []
y_vecs = []
for i in range(10):
    x_vecs.append(np.random.rand(100))
    y_vecs.append(np.random.rand(100))

# Do the processing
for my_iter in range(1000):
    dy.renew_cg()
    total = 0
    for x in x_vecs:
        for y in y_vecs:
            x_exp = dy.inputTensor(x)
            y_exp = dy.inputTensor(y)
            total = total + dy.dot_product(W * x_exp, y_exp)
    total.forward()
    total.backward()
    trainer.update()
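# A hedged sketch (not the course's reference answer) of the same computation with
# the tricks applied: by bilinearity, sum_{i,j} y_j . (W x_i) = (sum_j y_j) . (W sum_i x_i),
# so the double loop collapses to a single matrix-vector product, and the summed
# inputs are computed once in numpy and copied to DyNet once per graph.
x_sum = np.sum(x_vecs, axis=0)
y_sum = np.sum(y_vecs, axis=0)
for my_iter in range(1000):
    dy.renew_cg()
    x_exp = dy.inputTensor(x_sum)
    y_exp = dy.inputTensor(y_sum)
    total = dy.dot_product(W * x_exp, y_exp)
    total.forward()
    total.backward()
    trainer.update()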
def transitions(self):
    if self.mask is not None:
        return dy.cmult(self.transitions_p, dy.inputTensor(self.mask)) + dy.inputTensor(self.inv_mask)
    return self.transitions_p