def training_step(self, src, trg):
  """Perform a forward pass and loss composition for the given minibatch.

  Collects the model's standard loss (which may arrive either as a single
  expression or already wrapped in a LossBuilder), adds any additional loss
  the model reports, and returns the computed total.

  :param src: source minibatch
  :param trg: target minibatch
  :returns: the computed loss value from ``LossBuilder.compute()``
  """
  loss_builder = LossBuilder()
  standard_loss = self.model.calc_loss(src, trg, self.loss_calculator)
  # FIX: use isinstance instead of comparing __class__ directly, per idiom.
  if isinstance(standard_loss, LossBuilder):
    loss = None
    for loss_name, loss_expr in standard_loss.loss_nodes:
      loss_builder.add_loss(loss_name, loss_expr)
      # FIX: test `loss is None` explicitly — truthiness of a dynet
      # expression is not a reliable "unset" check.
      loss = loss_expr if loss is None else loss + loss_expr
    # Collapse the builder's components into a single summed expression.
    standard_loss = loss
  else:
    loss_builder.add_loss("loss", standard_loss)
  # The (negated, gradient-blocked) standard loss is handed to the model as a
  # reward-style signal; the model may return None when it has nothing to add.
  additional_loss = self.model.calc_additional_loss(
      dy.nobackprop(-standard_loss))
  # FIX: identity comparison with None (`is not None`), not `!= None`.
  if additional_loss is not None:
    loss_builder.add_loss("additional_loss", additional_loss)
  loss_value = loss_builder.compute()
  self.logger.update_epoch_loss(src, trg, loss_builder)
  self.logger.report_train_process()
  return loss_value
def calc_loss(self, src, trg, loss_calculator):
  """Compute the ensemble loss by averaging each named loss over all models.

  Every member model produces its own LossBuilder; losses sharing a name are
  collected and averaged into a single combined LossBuilder.

  :param src: source minibatch
  :param trg: target minibatch
  :param loss_calculator: loss callable forwarded to each member model
  :returns: LossBuilder with one averaged expression per loss name
  """
  grouped = collections.defaultdict(list)
  for member in self.models:
    member_losses = member.calc_loss(src, trg, loss_calculator).loss_values
    for name, expr in member_losses.items():
      grouped[name].append(expr)
  combined = LossBuilder()
  # TODO: dy.average(losslist) _or_ dy.esum(losslist) / len(self.models) ?
  # -- might not be the same if not all models return all losses
  for name, exprs in grouped.items():
    combined.add_loss(name, dy.average(exprs))
  return combined
def calc_loss(self, src, trg, loss_calculator):
  """Compute the training loss for one (possibly batched) sentence pair.

  Runs embed -> encode -> attend -> decode, delegates the primary ("mle")
  loss to ``loss_calculator``, and optionally adds global-fertility and
  attention-entropy regularization terms computed from the attention matrix.

  :param src: source sentence or batch
  :param trg: target sentence or batch (``trg.mask`` used for attn masking)
  :param loss_calculator: callable producing the primary loss expression
  :returns: LossBuilder with "mle" and optional "fertility" / "H(attn)" terms
  """
  self.start_sent(src)
  embeddings = self.src_embedder.embed_sent(src)
  encodings = self.encoder(embeddings)
  self.attender.init_sent(encodings)
  # Initialize the hidden state from the encoder; feed <s> as first input
  # (batched when src is batched).
  ss = mark_as_batch([Vocab.SS] * len(src)) if is_batched(src) else Vocab.SS
  dec_state = self.decoder.initial_state(self.encoder.get_final_states(),
                                         self.trg_embedder.embed(ss))
  # Compose losses
  model_loss = LossBuilder()
  model_loss.add_loss("mle", loss_calculator(self, dec_state, src, trg))
  if self.calc_global_fertility or self.calc_attention_entropy:
    # philip30: I assume that attention_vecs is already masked src wisely.
    # Now applying the mask to the target
    masked_attn = self.attender.attention_vecs
    if trg.mask is not None:
      # One attention vector per target position; zero out positions that
      # are padding in the target batch.
      trg_mask = trg.mask.get_active_one_mask().transpose()
      masked_attn = [
          dy.cmult(attn, dy.inputTensor(mask, batched=True))
          for attn, mask in zip(masked_attn, trg_mask)
      ]
    if self.calc_global_fertility:
      model_loss.add_loss("fertility", self.global_fertility(masked_attn))
    if self.calc_attention_entropy:
      model_loss.add_loss("H(attn)", self.attention_entropy(masked_attn))
  return model_loss
def __call__(self, translator, dec_state, src, trg):
  """REINFORCE-style loss: sample a translation, score it, weight log-probs.

  Samples up to ``self.sample_length`` tokens from the decoder's softmax,
  evaluates each sampled hypothesis against the reference with
  ``self.evaluation_metric``, and builds a Reinforce (and optionally
  Baseline) loss from the per-step log-probabilities.

  :param translator: the translator model providing attender/decoder/embedder
  :param dec_state: initial decoder state
  :param src: source batch (unused directly here)
  :param trg: reference target batch
  :returns: LossBuilder with "Reinforce" and optionally "Baseline" losses
  """
  # TODO: apply trg.mask ?
  samples = []
  logsofts = []
  self.bs = []  # per-step baseline predictions (only if use_baseline)
  done = [False for _ in range(len(trg))]
  for _ in range(self.sample_length):
    dec_state.context = translator.attender.calc_context(dec_state.rnn_state.output())
    if self.use_baseline:
      # Baseline input is gradient-blocked so baseline training does not
      # backprop into the decoder.
      h_t = dy.tanh(translator.decoder.context_projector(dy.concatenate([dec_state.rnn_state.output(), dec_state.context])))
      self.bs.append(self.baseline(dy.nobackprop(h_t)))
    logsoft = dy.log_softmax(translator.decoder.get_scores(dec_state))
    # Draw one token per batch item from the categorical distribution.
    sample = logsoft.tensor_value().categorical_sample_log_prob().as_numpy()[0]
    # Keep track of previously sampled EOS: once a sentence is done, force ES.
    sample = [sample_i if not done_i else Vocab.ES for sample_i, done_i in zip(sample, done)]
    # Appending and feeding in the decoder
    logsoft = dy.pick_batch(logsoft, sample)
    logsofts.append(logsoft)
    samples.append(sample)
    dec_state = translator.decoder.add_input(dec_state, translator.trg_embedder.embed(xnmt.batcher.mark_as_batch(sample)))
    # Check if we are done.
    done = list(six.moves.map(lambda x: x == Vocab.ES, sample))
    if all(done):
      break
  # Transpose: list-of-steps -> list-of-sentences.
  samples = np.stack(samples, axis=1).tolist()
  self.eval_score = []
  for trg_i, sample_i in zip(trg, samples):
    # Removing EOS
    try:
      idx = sample_i.index(Vocab.ES)
      sample_i = sample_i[:idx]
    except ValueError:
      pass
    try:
      # NOTE(review): this truncates trg_i.words in place — mutates the
      # caller's reference batch; confirm that is intended.
      idx = trg_i.words.index(Vocab.ES)
      trg_i.words = trg_i.words[:idx]
    except ValueError:
      pass
    # Calculate the evaluation score (0 for an empty sample)
    score = 0 if not len(sample_i) else self.evaluation_metric.evaluate_fast(trg_i.words, sample_i)
    self.eval_score.append(score)
  self.true_score = dy.inputTensor(self.eval_score, batched=True)
  loss = LossBuilder()
  if self.use_baseline:
    # Weight each step's log-prob by (baseline - reward); reward is
    # gradient-free since it comes from inputTensor.
    for i, (score, _) in enumerate(zip(self.bs, logsofts)):
      logsofts[i] = dy.cmult(logsofts[i], score - self.true_score)
    loss.add_loss("Reinforce", dy.sum_elems(dy.esum(logsofts)))
  else:
    loss.add_loss("Reinforce", dy.sum_elems(dy.cmult(-self.true_score, dy.esum(logsofts))))
  if self.use_baseline:
    # Baseline regression: squared distance between predicted baseline and
    # the realized score, summed over steps.
    baseline_loss = []
    for bs in self.bs:
      baseline_loss.append(dy.squared_distance(self.true_score, bs))
    loss.add_loss("Baseline", dy.sum_elems(dy.esum(baseline_loss)))
  return loss
def eval(self):
  """Evaluate the model's loss over the held-out reference corpus.

  Lazily reads the parallel corpus on first call, then accumulates
  per-batch loss statistics normalized by reference word count.

  :returns: tuple of (LossScore over the primary loss, reference word count)
  :raises RuntimeError: if the primary loss key is missing from the stats
  """
  # FIX: identity comparison with None (`is None`), not `== None`.
  if self.src_data is None:
    self.src_data, self.ref_data, self.src_batches, self.ref_batches = \
      xnmt.input_reader.read_parallel_corpus(
          self.model.src_reader, self.model.trg_reader,
          self.src_file, self.ref_file, batcher=self.batcher,
          max_src_len=self.max_src_len, max_trg_len=self.max_trg_len)
  loss_val = LossScalarBuilder()
  ref_words_cnt = 0
  for src, trg in zip(self.src_batches, self.ref_batches):
    dy.renew_cg(immediate_compute=settings.IMMEDIATE_COMPUTE,
                check_validity=settings.CHECK_VALIDITY)
    loss_builder = LossBuilder()
    standard_loss = self.model.calc_loss(src, trg, self.loss_calculator)
    additional_loss = self.model.calc_additional_loss(standard_loss)
    loss_builder.add_loss("standard_loss", standard_loss)
    # NOTE(review): additional_loss may be None (see on_calc_additional_loss
    # implementations) — presumably add_loss tolerates None; confirm.
    loss_builder.add_loss("additional_loss", additional_loss)
    ref_words_cnt += self.model.trg_reader.count_words(trg)
    loss_val += loss_builder.get_loss_stats()
  # Per-word normalization of every accumulated loss component.
  loss_stats = {k: v / ref_words_cnt for k, v in loss_val.items()}
  try:
    return LossScore(loss_stats[self.model.get_primary_loss()],
                     loss_stats=loss_stats,
                     desc=self.desc), ref_words_cnt
  except KeyError:
    raise RuntimeError("Did you wrap your loss calculation with LossBuilder({'primary_loss': loss_value}) ?")
def one_epoch(self, update_weights=True):
  """Train (or dry-run) the model for one epoch over shuffled minibatches.

  :param update_weights: Whether to perform backward pass & update weights
                         (useful for debugging)
  """
  self.logger.new_epoch()
  if self.args["reload_command"] is not None:
    self._augment_data_next_epoch()
  self.model.set_train(update_weights)
  # Visit minibatches in a fresh random order each epoch.
  order = list(range(len(self.train_src)))
  np.random.shuffle(order)
  for batch_num in order:
    src = self.train_src[batch_num]
    trg = self.train_trg[batch_num]
    # Loss calculation
    dy.renew_cg()
    loss_builder = LossBuilder()
    standard_loss = self.model.calc_loss(src, trg)
    # FIX: use isinstance instead of comparing __class__ directly.
    if isinstance(standard_loss, LossBuilder):
      loss = None
      for loss_name, loss_expr in standard_loss.loss_nodes:
        loss_builder.add_loss(loss_name, loss_expr)
        # FIX: explicit None check — dynet expression truthiness is not a
        # reliable "unset" test.
        loss = loss_expr if loss is None else loss + loss_expr
      standard_loss = loss
    else:
      loss_builder.add_loss("loss", standard_loss)
    # Negated, gradient-blocked loss serves as a reward-style signal; the
    # model may return None.
    additional_loss = self.model.calc_additional_loss(
        dy.nobackprop(-standard_loss))
    # FIX: identity comparison with None.
    if additional_loss is not None:
      loss_builder.add_loss("additional_loss", additional_loss)
    # Log the loss sum
    loss_value = loss_builder.compute()
    self.logger.update_epoch_loss(src, trg, loss_builder)
    if update_weights:
      loss_value.backward()
      self.trainer.update()
    # Devel reporting
    self.logger.report_train_process()
    if self.logger.should_report_dev():
      self.dev_evaluation()
  self.model.new_epoch()
def on_calc_additional_loss(self, translator_loss):
  """Build the segmentation policy losses (Baseline / Reinforce / penalty).

  The reward is the (optionally exponentiated) negative MLE loss,
  gradient-blocked so the policy gradient cannot flow back through it.

  :param translator_loss: LossBuilder-like mapping containing an "mle" entry
  :returns: LossBuilder with the composed losses, or None if segmentation
            learning is disabled or no decisions were recorded
  """
  if not self.learn_segmentation or self.segment_decisions is None:
    return None
  reward = -translator_loss["mle"]
  if not self.log_reward:
    reward = dy.exp(reward)
  reward = dy.nobackprop(reward)
  # Make sure that reward is not scalar, but rather based on the each batch item
  assert reward.dim()[1] == len(self.src_sent)
  # Per-position source mask (active positions = 1), or None if unmasked.
  enc_mask = self.enc_mask.get_active_one_mask().transpose() if self.enc_mask is not None else None
  # Compose the loss
  ret = LossBuilder()
  ## Length prior
  alpha = self.length_prior_alpha.value() if self.length_prior_alpha is not None else 0
  if alpha > 0:
    reward += self.segment_length_prior * alpha
  # reward z-score normalization (EPS guards against zero std)
  if self.z_normalization:
    reward = dy.cdiv(reward - dy.mean_batches(reward), dy.std_batches(reward) + EPS)
  ## Baseline Loss: regress per-step baseline predictions onto the reward.
  if self.use_baseline:
    baseline_loss = []
    for i, baseline in enumerate(self.bs):
      loss = dy.squared_distance(reward, baseline)
      if enc_mask is not None:
        loss = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), loss)
      baseline_loss.append(loss)
    ret.add_loss("Baseline", dy.esum(baseline_loss))
    # NOTE(review): uses loop variable `i` after the loop — prints only the
    # *last* step's softmax; confirm this is the intended sample to show.
    if self.print_sample:
      print(dy.exp(self.segment_logsoftmaxes[i]).npvalue().transpose()[0])
  ## Reinforce Loss: -(advantage * log-likelihood) of each decision, masked.
  lmbd = self.lmbd.value()
  if lmbd > 0.0:
    reinforce_loss = []
    # Calculating the loss of the baseline and reinforce
    for i in range(len(self.segment_decisions)):
      ll = dy.pick_batch(self.segment_logsoftmaxes[i], self.segment_decisions[i])
      if self.use_baseline:
        # Advantage = reward - baseline; baseline is gradient-blocked here.
        r_i = reward - dy.nobackprop(self.bs[i])
      else:
        r_i = reward
      if enc_mask is not None:
        ll = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), ll)
      reinforce_loss.append(r_i * -ll)
    loss = dy.esum(reinforce_loss) * lmbd
    ret.add_loss("Reinforce", loss)
  if self.confidence_penalty:
    ls_loss = self.confidence_penalty(self.segment_logsoftmaxes, enc_mask)
    ret.add_loss("Confidence Penalty", ls_loss)
  # Total Loss
  return ret
def calc_loss(self, src, trg, loss_cal=None, infer_prediction=False):
  """Transformer-style loss: block-embed both sides, run encoder/decoder.

  Builds padded word matrices with <s> prepended, constructs self-/cross-
  attention masks (with a history mask for the decoder), and either returns
  the logits of the last position (inference) or the MLE loss (training).

  :param src: source sentence or batch (wrapped into a batch if not already)
  :param trg: target sentence or batch
  :param loss_cal: unused here; kept for interface compatibility
  :param infer_prediction: if True, return logits for the last target column
  :returns: logits expression, or LossBuilder({"mle": loss})
  """
  self.start_sent(src)
  if not xnmt.batcher.is_batched(src):
    src = xnmt.batcher.mark_as_batch([src])
  if not xnmt.batcher.is_batched(trg):
    trg = xnmt.batcher.mark_as_batch([trg])
  # (batch, src_len) int matrix, <s> prepended to every sentence.
  src_words = np.array([[Vocab.SS] + x.words for x in src])
  batch_size, src_len = src_words.shape
  if isinstance(src.mask, type(None)):
    src_mask = np.zeros((batch_size, src_len), dtype=np.int)
  else:
    # Prepend a column of zeros for the added <s> position.
    src_mask = np.concatenate([
        np.zeros((batch_size, 1), dtype=np.int),
        src.mask.np_arr.astype(np.int)
    ], axis=1)
  src_embeddings = self.sentence_block_embed(
      self.src_embedder.embeddings, src_words, src_mask)
  src_embeddings = self.make_input_embedding(src_embeddings, src_len)
  # Teacher forcing input: <s> + trg[:-1].
  trg_words = np.array(
      list(map(lambda x: [Vocab.SS] + x.words[:-1], trg)))
  batch_size, trg_len = trg_words.shape
  if isinstance(trg.mask, type(None)):
    trg_mask = np.zeros((batch_size, trg_len), dtype=np.int)
  else:
    trg_mask = trg.mask.np_arr.astype(np.int)
  trg_embeddings = self.sentence_block_embed(
      self.trg_embedder.embeddings, trg_words, trg_mask)
  trg_embeddings = self.make_input_embedding(trg_embeddings, trg_len)
  # Attention masks: src-src, trg-src (cross), trg-trg (with causal history).
  xx_mask = self.make_attention_mask(src_mask, src_mask)
  xy_mask = self.make_attention_mask(trg_mask, src_mask)
  yy_mask = self.make_attention_mask(trg_mask, trg_mask)
  yy_mask *= self.make_history_mask(trg_mask)
  z_blocks = self.encoder(src_embeddings, xx_mask)
  h_block = self.decoder(trg_embeddings, z_blocks, xy_mask, yy_mask)
  if infer_prediction:
    y_len = h_block.dim()[0][1]
    last_col = dy.pick(h_block, dim=1, index=y_len - 1)
    logits = self.decoder.output(last_col)
    return logits
  # Flattened reference tokens; masked (padding) positions are zeroed out.
  ref_list = list(
      itertools.chain.from_iterable(map(lambda x: x.words, trg)))
  concat_t_block = (1 - trg_mask.ravel()).reshape(-1) * np.array(ref_list)
  loss = self.decoder.output_and_loss(h_block, concat_t_block)
  return LossBuilder({"mle": loss})
def compute_dev_loss(self):
  """Accumulate the model loss over the whole dev set.

  :returns: tuple of (target word count, per-word LossScore)
  """
  builder = LossBuilder()
  word_total = 0
  for src_batch, trg_batch in zip(self.dev_src, self.dev_trg):
    dy.renew_cg()
    builder.add_loss("loss", self.model.calc_loss(src_batch, trg_batch))
    word_total += self.logger.count_trg_words(trg_batch)
    builder.compute()
  return word_total, LossScore(builder.sum() / word_total)
def on_calc_additional_loss(self, reward):
  """Compose segmentation policy losses from a reward signal.

  Optionally adds a length prior, z-normalizes the reward across the batch,
  then builds Baseline (squared error) and Reinforce losses.

  :param reward: batched reward expression
  :returns: LossBuilder with the composed losses, or None if disabled
  """
  if not self.learn_segmentation:
    return None
  losses = LossBuilder()
  if self.length_prior_alpha > 0:
    reward += self.segment_length_prior * self.length_prior_alpha
  # z-score normalization across the batch
  reward = dy.cdiv(reward - dy.mean_batches(reward), dy.std_batches(reward))
  # Baseline Loss: regress each step's baseline onto the reward.
  if self.use_baseline:
    sq_errors = [dy.squared_distance(reward, b) for b in self.bs]
    losses.add_loss("Baseline", dy.esum(sq_errors))
  # Reinforce Loss, scaled by the (warmup-dependent) lambda.
  lmbd = self.lmbd.get_value(self.warmup_counter)
  if lmbd > 0.0:
    terms = []
    for step, decision in enumerate(self.segment_decisions):
      ll = dy.pick_batch(self.segment_logsoftmaxes[step], decision)
      advantage = reward - self.bs[step] if self.use_baseline else reward
      terms.append(dy.logistic(advantage) * ll)
    losses.add_loss("Reinforce", -dy.esum(terms) * lmbd)
  # Total Loss
  return losses
def training_step(self, src, trg):
  """Run one forward pass for the minibatch and log its loss statistics.

  :param src: source minibatch
  :param trg: target minibatch
  :returns: computed loss value from ``LossBuilder.compute()``
  """
  builder = LossBuilder()
  primary = self.model.calc_loss(src, trg, self.loss_calculator)
  extra = self.model.calc_additional_loss(primary)
  builder.add_loss("standard_loss", primary)
  builder.add_loss("additional_loss", extra)
  total = builder.compute()
  self.logger.update_epoch_loss(src, trg, builder.get_loss_stats())
  self.logger.report_train_process()
  return total
def training_step(self, src, trg):
  """Run one forward pass for the minibatch and return the loss builder.

  :param src: source minibatch
  :param trg: target minibatch
  :returns: LossBuilder holding "standard_loss" and "additional_loss"
  """
  builder = LossBuilder()
  primary = self.model.calc_loss(src, trg, self.loss_calculator)
  extra = self.model.calc_additional_loss(primary)
  builder.add_loss("standard_loss", primary)
  builder.add_loss("additional_loss", extra)
  return builder
def __call__(self, translator, initial_state, src, trg):
  """Minimum-risk training loss over a set of search-generated hypotheses.

  Each hypothesis gets an evaluation-metric score (delta) and a scaled
  log-probability; duplicate/empty hypotheses can be masked out. The risk is
  the expectation of the deltas under the renormalized hypothesis
  distribution.

  :param translator: model providing the search strategy
  :param initial_state: initial decoder state
  :param src: source batch (not used directly)
  :param trg: reference target batch (also used to force decoding)
  :returns: LossBuilder({"risk": risk})
  """
  batch_size = len(trg)
  # Per-batch-item set of hypothesis hashes, for de-duplication.
  uniques = [set() for _ in range(batch_size)]
  deltas = []
  probs = []
  search_outputs = translator.search_strategy.generate_output(
      translator, initial_state, forced_trg_ids=trg)
  for search_output in search_outputs:
    logprob = search_output.logsoftmaxes
    sample = search_output.word_ids
    attentions = search_output.attentions
    # Sequence log-prob, temperature-scaled by alpha.
    logprob = dy.esum(logprob) * self.alpha
    # Calculate the evaluation score
    eval_score = np.zeros(batch_size, dtype=float)
    mask = np.zeros(batch_size, dtype=float)
    for j in range(batch_size):
      ref_j = self.remove_eos(trg[j].words)
      hyp_j = self.remove_eos(sample[j].tolist())
      if self.unique_sample:
        hash_val = hash(tuple(hyp_j))
        if len(hyp_j) == 0 or hash_val in uniques[j]:
          # Empty or duplicate hypothesis: push its prob to ~0 via -inf mask.
          mask[j] = -INFINITY
          continue
        else:
          # Count this sample in
          uniques[j].add(hash_val)
      # Calc evaluation score (negated when optimizing an error metric)
      eval_score[j] = self.evaluation_metric.evaluate(ref_j, hyp_j) * \
          (-1 if self.inv_eval else 1)
    # Appending the delta and logprob of this sample
    prob = logprob + dy.inputTensor(mask, batched=True)
    deltas.append(dy.inputTensor(eval_score, batched=True))
    probs.append(prob)
  # Renormalize over hypotheses; expected delta = risk.
  sample_prob = dy.softmax(dy.concatenate(probs))
  deltas = dy.concatenate(deltas)
  risk = dy.sum_elems(dy.cmult(sample_prob, deltas))
  ### Debug
  #print(sample_prob.npvalue().transpose()[0])
  #print(deltas.npvalue().transpose()[0])
  #print("----------------------")
  ### End debug
  return LossBuilder({"risk": risk})
def __call__(self, translator, initial_state, src, trg):
  """REINFORCE loss computed from the model's own search output.

  Scores the first search hypothesis against the reference and weights the
  per-step log-probabilities by that score (minus a learned baseline when
  enabled).

  :param translator: model providing the search strategy
  :param initial_state: initial decoder state
  :param src: source batch (not used directly)
  :param trg: reference target batch
  :returns: LossBuilder with "reinforce" (and optionally "reinf_baseline")
  """
  # TODO(philip30): currently only using the best hypothesis / first sample for reinforce loss
  # A small further implementation is needed if we want to do reinforce with multiple samples.
  search_output = translator.search_strategy.generate_output(
      translator, initial_state)[0]
  # Calculate evaluation scores
  self.eval_score = []
  for trg_i, sample_i in zip(trg, search_output.word_ids):
    # Removing EOS
    sample_i = self.remove_eos(sample_i.tolist())
    ref_i = self.remove_eos(trg_i.words)
    # Evaluating (empty hypotheses score 0; sign flipped for error metrics)
    if len(sample_i) == 0:
      score = 0
    else:
      score = self.evaluation_metric.evaluate(ref_i, sample_i) * \
          (-1 if self.inv_eval else 1)
    self.eval_score.append(score)
  self.true_score = dy.inputTensor(self.eval_score, batched=True)
  # Composing losses
  loss = LossBuilder()
  if self.use_baseline:
    baseline_loss = []
    losses = []
    for state, logsoft, mask in zip(search_output.state,
                                    search_output.logsoftmaxes,
                                    search_output.mask):
      bs_score = self.baseline(state)
      baseline_loss.append(
          dy.squared_distance(self.true_score, bs_score))
      # Advantage-weighted log-prob, masked for padded positions.
      loss_i = dy.cmult(logsoft, self.true_score - bs_score)
      losses.append(
          dy.cmult(loss_i, dy.inputTensor(mask, batched=True)))
    loss.add_loss("reinforce", dy.sum_elems(dy.esum(losses)))
    loss.add_loss("reinf_baseline", dy.sum_elems(dy.esum(baseline_loss)))
  else:
    # FIX: `logsofts` was an undefined name here (NameError at runtime);
    # the per-step log-probs live in search_output.logsoftmaxes.
    loss.add_loss(
        "reinforce",
        dy.sum_elems(dy.cmult(self.true_score,
                              dy.esum(search_output.logsoftmaxes))))
  return loss
def calc_loss(self, src, trg, loss_calculator):
  """Compute the loss for a tree/transition-based source representation.

  Each src item is a (tokens, transitions) pair; tokens are embedded and
  fed to the encoder together with the transitions. The loss calculator is
  expected to return a (loss, word-error-rate) pair.

  :param src: batch of (tokens, transitions) pairs
  :param trg: target batch (``trg.mask`` used for attention masking)
  :param loss_calculator: callable returning (loss expression, wer)
  :returns: LossBuilder with "mle" and optional regularization terms
  """
  self.start_sent(src)
  tokens = [x[0] for x in src]
  transitions = [x[1] for x in src]
  print("Current Batch: " + str(len(tokens)) + " pairs.\n")
  # FIX: removed unused local `is_batched` (it shadowed the helper name
  # xnmt.batcher.is_batched without ever being read) and a commented-out
  # pdb breakpoint.
  tokens = xnmt.batcher.mark_as_batch(tokens)
  embeddings = self.src_embedder.embed_sent(tokens)
  encodings = self.encoder(embeddings, transitions)
  self.attender.init_sent(encodings)
  # Initialize the hidden state from the encoder
  # NOTE(review): accesses the encoder's private `_final_states`; other
  # models in this file call get_final_states() — confirm whether the
  # public accessor exists on this encoder.
  ss = mark_as_batch(
      [Vocab.SS] * len(tokens)) if xnmt.batcher.is_batched(src) else Vocab.SS
  dec_state = self.decoder.initial_state(self.encoder._final_states,
                                         self.trg_embedder.embed(ss))
  # Compose losses
  model_loss = LossBuilder()
  loss, wer = loss_calculator(self, dec_state, src, trg)
  model_loss.add_loss("mle", loss)
  print("wer_b:" + str(wer))
  if self.calc_global_fertility or self.calc_attention_entropy:
    # philip30: I assume that attention_vecs is already masked src wisely.
    # Now applying the mask to the target
    masked_attn = self.attender.attention_vecs
    if trg.mask is not None:
      trg_mask = trg.mask.get_active_one_mask().transpose()
      masked_attn = [
          dy.cmult(attn, dy.inputTensor(mask, batched=True))
          for attn, mask in zip(masked_attn, trg_mask)
      ]
    if self.calc_global_fertility:
      model_loss.add_loss("fertility", self.global_fertility(masked_attn))
    if self.calc_attention_entropy:
      model_loss.add_loss("H(attn)", self.attention_entropy(masked_attn))
  return model_loss