def eval_samples(raw_samples, tokenizer):
    """Evaluates generated samples."""
    gt_refs = []
    samples = []

    groups = group_samples(raw_samples, tokenizer)
    groups = list(groups.values())
    avg_group_size = np.mean([len(g[-1]) for g in groups])
    logging.info('Average samples per example: %.2f', avg_group_size)
    avg_group_size = int(math.ceil(avg_group_size))
    for i, (gt, s) in enumerate(groups):
        gt_refs.append(gt)
        idx = i % len(groups)
        samples.append(groups[idx][-1])

    gt_bleu, gt_n_grams = utils.compute_bleu(samples, gt_refs)
    logging.info('Processed %d samples in total.',
                 sum([len(s) for s in samples]))
    flat_samples = []
    for s in samples:
        flat_samples.extend(s)
    logging.info('Average sample len: %.2f',
                 np.mean([len(s) for s in flat_samples]))
    logging.info('Average ground-truth len: %.2f',
                 np.mean([len(gt) for gt in gt_refs]))
    logging.info('Ground-truth BLEU: %6.2f, n-gram precision: (%s)',
                 gt_bleu * 100,
                 ', '.join(['%6.2f%%' % (s * 100) for s in gt_n_grams]))
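eval_samples indexes each group as (ground_truth, [sample, ...]), so g[-1] is the list of generated samples for one example. A minimal sketch of a grouping helper that produces that shape; the raw_samples record layout and the tokenizer.tokenize API are assumptions, not the original group_samples:

def group_samples_sketch(raw_samples, tokenizer):
    # Hypothetical record layout: (example_id, ground-truth text, sample text).
    groups = {}
    for example_id, gt_text, sample_text in raw_samples:
        # First record for a key sets the ground truth; later records only
        # append to the sample list.
        entry = groups.setdefault(example_id, (tokenizer.tokenize(gt_text), []))
        entry[-1].append(tokenizer.tokenize(sample_text))
    return groups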
def validate(model, dev_data, vocab_src, vocab_tgt, epoch, config,
             direction=None):
    model.eval()
    device = torch.device("cpu") if config["device"] == "cpu" \
        else torch.device("cuda:0")
    with torch.no_grad():
        model_hypotheses = []
        references = []

        val_dl = DataLoader(dev_data, batch_size=config["batch_size_eval"],
                            shuffle=False, num_workers=4)
        val_dl = BucketingParallelDataLoader(val_dl)
        for sentences_x, sentences_y in val_dl:
            # Translate x -> y by default; for the reverse direction the
            # y side is fed through the source vocabulary instead.
            if direction is None or direction == "xy":
                x_in, _, x_mask, x_len = create_batch(sentences_x, vocab_src,
                                                      device)
            else:
                x_in, _, x_mask, x_len = create_batch(sentences_y, vocab_src,
                                                      device)
            x_mask = x_mask.unsqueeze(1)

            enc_output, enc_hidden = model.encode(x_in, x_len)
            dec_hidden = model.init_decoder(enc_output, enc_hidden)

            raw_hypothesis = beam_search(model.decoder, model.emb_tgt,
                                         model.generate_tm, enc_output,
                                         dec_hidden, x_mask, vocab_tgt.size(),
                                         vocab_tgt[SOS_TOKEN],
                                         vocab_tgt[EOS_TOKEN],
                                         vocab_tgt[PAD_TOKEN], config)

            hypothesis = batch_to_sentences(raw_hypothesis, vocab_tgt)
            model_hypotheses += hypothesis.tolist()

            if direction is None or direction == "xy":
                references += sentences_y.tolist()
            else:
                references += sentences_x.tolist()

        save_hypotheses(model_hypotheses, epoch, config)
        model_hypotheses, references = clean_sentences(model_hypotheses,
                                                       references, config)
        bleu = compute_bleu(model_hypotheses, references, epoch, config,
                            direction)
        return bleu
    return parser


if __name__ == '__main__':
    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()

    # check parameters
    assert os.path.isfile(params.ref)
    assert os.path.isfile(params.hyp)

    # Reference groups are separated by blank lines; each group holds the
    # alternative references for one translation.
    refs = []
    with open(params.ref) as ref:
        tmp = []
        for line in ref.readlines():
            if line != "\n":
                tmp.append(line.strip().split())
            else:
                refs.append(tmp)
                tmp = []
        if tmp:
            # Flush the last group when the file lacks a trailing blank line.
            refs.append(tmp)

    with open(params.hyp) as hyp:
        hyps = [line.strip().split() for line in hyp.readlines()]

    r = compute_bleu(reference_corpus=refs, translation_corpus=hyps,
                     max_order=params.max_order, smooth=params.smooth)
    print(r)
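The compute_bleu used here takes a reference_corpus (list of reference lists), a translation_corpus, a max_order, and a smooth flag. For readers without the original module, a self-contained sketch of corpus BLEU in that style (uniform n-gram weights, brevity penalty against the shortest reference); this is an illustration, not the project's implementation:

import collections
import math

def compute_bleu_sketch(reference_corpus, translation_corpus, max_order=4,
                        smooth=False):
    """Corpus BLEU with uniform n-gram weights and a brevity penalty."""
    def ngrams(tokens, max_order):
        counts = collections.Counter()
        for n in range(1, max_order + 1):
            for i in range(len(tokens) - n + 1):
                counts[tuple(tokens[i:i + n])] += 1
        return counts

    matches = [0] * max_order
    possible = [0] * max_order
    ref_len, trans_len = 0, 0
    for refs, trans in zip(reference_corpus, translation_corpus):
        ref_len += min(len(r) for r in refs)  # shortest-reference length
        trans_len += len(trans)
        # Clip candidate n-gram counts against the max count over references.
        merged_ref = collections.Counter()
        for r in refs:
            merged_ref |= ngrams(r, max_order)
        overlap = ngrams(trans, max_order) & merged_ref
        for ng, c in overlap.items():
            matches[len(ng) - 1] += c
        for n in range(max_order):
            possible[n] += max(len(trans) - n, 0)

    precisions = []
    for n in range(max_order):
        if smooth:
            precisions.append((matches[n] + 1.0) / (possible[n] + 1.0))
        else:
            precisions.append(matches[n] / possible[n] if possible[n] else 0.0)
    if min(precisions) > 0:
        geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_order)
    else:
        geo_mean = 0.0
    if trans_len == 0:
        return 0.0
    bp = 1.0 if trans_len > ref_len else math.exp(1 - ref_len / trans_len)
    return geo_mean * bp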
def validate(self, val_loader, epoch=0):
    self.model.eval()
    val_loss = 0.0
    total_acc = 0.0
    total_recall = 0.0
    total_precision = 0.0
    total_f1 = 0.0
    total_cm = 0
    total_d_acc = 0.0
    bleu = 0.0
    total_l1 = 0
    total_l2 = 0
    total_l3 = 0
    k_vals = [1, 2, 3, 4, 5]
    total_topk = {k: 0.0 for k in k_vals}
    per_disease_topk = defaultdict(lambda: {str(k): 0.0 for k in k_vals})
    per_disease_bleu = defaultdict(list)

    with torch.no_grad():
        for i, (_, images, labels, f_labels, text) in enumerate(val_loader):
            batch_size = images.size(0)
            images = images.to(self.device)
            labels = labels.to(self.device)
            f_labels = f_labels.to(self.device)
            text = text.to(self.device)

            diseases, fine_diseases, text_pred = self.model(images, text)
            loss1 = self.criterion(diseases, labels)
            loss2 = self.criterion(fine_diseases, f_labels)
            # Teacher-forced text loss: position k predicts token k + 1.
            text_loss = 0.0
            for k in range(text_pred.size(1)):
                text_loss += self.criterion(text_pred[:, k].squeeze(),
                                            text[:, k + 1].squeeze())
            val_loss += torch.stack((loss1, loss2, text_loss))[self.tasks].sum()

            preds = F.log_softmax(fine_diseases, dim=-1)
            pred = preds.argmax(dim=-1)
            d_pred = F.log_softmax(diseases, dim=-1).argmax(dim=-1)

            # Evaluation of P, R, F1, CM, BLEU
            total_acc += (pred.eq(f_labels).sum().item() / batch_size)
            total_d_acc += (d_pred.eq(labels).sum().item() / batch_size)
            acc, recall, precision, f1 = accuracy_recall_precision_f1(
                d_pred, labels)
            cm = calculate_confusion_matrix(d_pred, labels)
            try:
                total_cm += (cm / batch_size)
            except Exception:
                print("Error occurred for this CM")
                print(cm / batch_size)

            # Top-k evaluation, overall and per coarse disease label.
            for k in k_vals:
                total_topk[k] += compute_topk(preds, f_labels, k)
                for d in [0, 1, 2, 3]:
                    mask = labels.eq(d)
                    if mask.sum() > 0:
                        per_disease_topk[d][str(k)] += compute_topk(
                            preds[mask], f_labels[mask], k)

            total_recall += np.mean(recall)
            total_precision += np.mean(precision)
            total_f1 += np.mean(f1)

            preds = torch.argmax(F.log_softmax(text_pred, dim=-1), dim=-1)
            text1 = text[:, 1:].squeeze().tolist()
            preds1 = preds.tolist()
            t_bleu, sent_gt, sent_pred = compute_bleu(
                self.lang, text1, preds1, labels, per_disease_bleu)

            # Book-keeping
            bleu += t_bleu
            total_l1 += loss1.item()
            total_l2 += loss2.item()
            total_l3 += text_loss.item()

    # Average the running totals over the number of batches.
    bleu = bleu / len(val_loader)
    val_loss = val_loss / len(val_loader)
    total_l1 /= len(val_loader)
    total_l2 /= len(val_loader)
    total_l3 /= len(val_loader)
    total_acc = total_acc / len(val_loader)
    total_d_acc = total_d_acc / len(val_loader)
    total_f1 = total_f1 / len(val_loader)
    total_precision = total_precision / len(val_loader)
    total_recall = total_recall / len(val_loader)
    total_cm = total_cm / len(val_loader)

    self.scheduler.step(val_loss)
    if val_loss <= self.min_val_loss:
        torch.save(self.model.state_dict(), self.save_path)
        self.min_val_loss = val_loss

    for d in per_disease_bleu:
        per_disease_bleu[d] = np.mean(per_disease_bleu[d])
    total_topk = {str(k): total_topk[k] / len(val_loader) for k in k_vals}
    for d in [0, 1, 2, 3]:
        for k in k_vals:
            per_disease_topk[d][str(k)] = \
                per_disease_topk[d][str(k)] / len(val_loader)

    return (val_loss, total_d_acc, total_acc, bleu, total_f1, total_recall,
            total_precision, sent_gt, sent_pred, total_topk, per_disease_topk,
            per_disease_bleu, total_cm)
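compute_topk is not shown in this snippet. A plausible sketch, assuming it returns the fraction of examples whose true label appears among the k highest-scoring classes (names and signature are assumptions):

import torch

def compute_topk_sketch(log_probs, targets, k):
    # Top-k accuracy: a hit when the target class index is among the k
    # highest-scoring classes for that example.
    topk = log_probs.topk(k, dim=-1).indices            # (batch, k)
    hits = topk.eq(targets.unsqueeze(-1)).any(dim=-1)   # (batch,)
    return hits.float().mean().item()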
def main():
    global args, max_length
    args = parser.parse_args()
    if args.eval:
        if not os.path.exists(args.output_dir):
            print("Output directory does not exist")
            exit(1)
        try:
            model = EncoderDecoder().load(args.output_dir)
            print("Model loaded successfully")
        except Exception:
            print("The trained model could not be loaded...")
            exit(1)
        test_pairs = readFile(args.test_file)
        outputs = model.evaluatePairs(test_pairs, rand=False, char=args.char)
        writeToFile(outputs, os.path.join(args.output_dir, "output.pkl"))
        reference = []
        hypothesis = []
        for (hyp, ref) in outputs:
            if args.char or args.char_bleu:
                reference.append([list(ref)])
                hypothesis.append(list(hyp))
            else:
                reference.append([ref.split(" ")])
                hypothesis.append(hyp.split(" "))
        bleu_score = compute_bleu(reference, hypothesis)
        print("Bleu Score: " + str(bleu_score))
        print(model.evaluateAndShowAttention(
            "L'anglais n'est pas facile pour nous.", char=args.char))
        print(model.evaluateAndShowAttention(
            "J'ai dit que l'anglais est facile.", char=args.char))
        print(model.evaluateAndShowAttention(
            "Je n'ai pas dit que l'anglais est une langue facile.",
            char=args.char))
        print(model.evaluateAndShowAttention(
            "Je fais un blocage sur l'anglais.", char=args.char))
    else:
        input_lang, output_lang, pairs = prepareData(args.train_file)
        print(random.choice(pairs))
        if args.char:
            model = EncoderDecoder(args.hidden_size, input_lang.n_chars,
                                   output_lang.n_chars, args.drop, args.tfr,
                                   args.max_length, args.lr, args.simple,
                                   args.bidirectional, args.dot, False, 1)
        else:
            model = EncoderDecoder(args.hidden_size, input_lang.n_words,
                                   output_lang.n_words, args.drop, args.tfr,
                                   args.max_length, args.lr, args.simple,
                                   args.bidirectional, args.dot, args.multi,
                                   args.num_layers)
        model.trainIters(pairs, input_lang, output_lang, args.n_iters,
                         print_every=args.print_every,
                         plot_every=args.plot_every, char=args.char)
        model.save(args.output_dir)
        model.evaluatePairs(pairs, char=args.char)
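The argument parser itself is defined elsewhere. Assuming the usual argparse mapping from attribute names to flags, the two modes of main() might be driven as below; the script name and flag spellings are inferred from the attributes used above, so treat them as assumptions:

# Training mode (word-level model), hypothetical invocation:
#   python seq2seq.py --train_file data/eng-fra.txt --output_dir runs/base \
#       --hidden_size 256 --n_iters 75000 --print_every 1000 --plot_every 100
#
# Evaluation mode (loads the saved model and reports corpus BLEU):
#   python seq2seq.py --eval --test_file data/test.txt --output_dir runs/base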
def validate(model, dev_data, vocab_src, vocab_tgt, epoch, config,
             direction=None):
    model.eval()
    device = torch.device("cpu") if config["device"] == "cpu" \
        else torch.device("cuda:0")
    with torch.no_grad():
        model_hypotheses = []
        references = []

        val_dl = DataLoader(dev_data, batch_size=config["batch_size_eval"],
                            shuffle=False, num_workers=2)
        val_dl = BucketingParallelDataLoader(val_dl)
        val_kl = 0
        for sentences_x, sentences_y in val_dl:
            if direction is None or direction == "xy":
                x_in, _, x_mask, x_len = create_batch(sentences_x, vocab_src,
                                                      device)
            else:
                x_in, _, x_mask, x_len = create_batch(sentences_y, vocab_src,
                                                      device)
            x_mask = x_mask.unsqueeze(1)

            # Use the posterior mean as the latent code at validation time.
            qz = model.inference(x_in, x_mask, x_len)
            z = qz.mean

            # KL(q(z|x) || p(z)) against the model prior, summed over
            # latent dimensions, then accumulated over the batch.
            pz = torch.distributions.Normal(
                loc=model.prior_loc,
                scale=model.prior_scale).expand(qz.mean.size())
            kl_loss = torch.distributions.kl.kl_divergence(qz, pz)
            kl_loss = kl_loss.sum(dim=1)
            val_kl += kl_loss.sum(dim=0)

            enc_output, enc_hidden = model.encode(x_in, x_len, z)
            dec_hidden = model.init_decoder(enc_output, enc_hidden, z)

            raw_hypothesis = beam_search(model.decoder, model.emb_tgt,
                                         model.generate_tm, enc_output,
                                         dec_hidden, x_mask, vocab_tgt.size(),
                                         vocab_tgt[SOS_TOKEN],
                                         vocab_tgt[EOS_TOKEN],
                                         vocab_tgt[PAD_TOKEN], config, z)

            hypothesis = batch_to_sentences(raw_hypothesis, vocab_tgt)
            model_hypotheses += hypothesis.tolist()

            if direction is None or direction == "xy":
                references += sentences_y.tolist()
            else:
                references += sentences_x.tolist()

        val_kl /= len(dev_data)

        save_hypotheses(model_hypotheses, epoch, config, direction)
        model_hypotheses, references = clean_sentences(model_hypotheses,
                                                       references, config)
        bleu = compute_bleu(model_hypotheses, references, epoch, config,
                            direction, kl=val_kl)
        return bleu
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        '--data_dir', default=None, type=str, required=True,
        help='The input data dir. Should contain the .tsv files (or other data files) for the task.')
    parser.add_argument(
        '--action_generator_model_type', default=None, type=str, required=True,
        choices=list(MODEL_CLASSES.keys()),
        help='Model type to use for initial action prediction, selected in the list: ' +
        ', '.join(MODEL_CLASSES.keys()))
    parser.add_argument(
        '--consequence_generator_model_type', default=None, type=str, required=True,
        choices=list(MODEL_CLASSES.keys()),
        help='Model type to use for consequence prediction, selected in the list: ' +
        ', '.join(MODEL_CLASSES.keys()))
    parser.add_argument(
        '--action_refiner_model_type', default=None, type=str, required=True,
        choices=list(MODEL_CLASSES.keys()),
        help='Model type to use for refined action prediction, selected in the list: ' +
        ', '.join(MODEL_CLASSES.keys()))
    parser.add_argument(
        '--action_classifier_model_type', default=None, type=str, required=True,
        choices=list(MODEL_CLASSES.keys()),
        help='Model type to use for action classification, selected in the list: ' +
        ', '.join(MODEL_CLASSES.keys()))
    parser.add_argument(
        '--consequence_classifier_model_type', default=None, type=str, required=True,
        choices=list(MODEL_CLASSES.keys()),
        help='Model type to use for consequence classification, selected in the list: ' +
        ', '.join(MODEL_CLASSES.keys()))
    parser.add_argument(
        '--action_generator_checkpoint', default=None, type=str, required=True,
        help='Path to pre-trained model used for initial action generation')
    parser.add_argument(
        '--consequence_generator_checkpoint', default=None, type=str, required=True,
        help='Path to pre-trained model used for consequence generation')
    parser.add_argument(
        '--action_refiner_checkpoint', default=None, type=str, required=True,
        help='Path to pre-trained model used for action refinement')
    parser.add_argument(
        '--action_classifier_checkpoint', default=None, type=str, required=True,
        help='Path to pre-trained model used for action classification')
    parser.add_argument(
        '--consequence_classifier_checkpoint', default=None, type=str, required=True,
        help='Path to pre-trained model used for consequence classification')
    parser.add_argument(
        '--split_name', default=None, type=str, required=True, choices=SPLITS,
        help='The name of the data split used to train / evaluate the model: ' +
        ', '.join(SPLITS))
    parser.add_argument(
        '--output_dir', default=None, type=str, required=True,
        help='The root output directory where the model predictions and checkpoints will be written.')

    ## Generation parameters
    parser.add_argument(
        '--max_gen_length', default=60, type=int,
        help='The maximum length of the sequence to be generated.')
    parser.add_argument(
        '--temperature', default=1.0, type=float,
        help='The value used to modulate the next token probabilities.')
    parser.add_argument(
        '--k', default=0, type=int,
        help='The number of highest probability vocabulary tokens to keep for top-k-filtering.')
    parser.add_argument(
        '--p', default=0, type=float,
        help='If set to float < 1, only the most probable tokens with probabilities that add up to '
        'top_p or higher are kept for generation.')
    parser.add_argument('--num_beams', default=0, type=int, required=False,
                        help='Number of beams for beam search')
    parser.add_argument(
        '--do_sample', action='store_true',
        help='Whether to generate predictions via sampling; if off, decoding is done greedily.')
    parser.add_argument(
        '--sc101_action_embeddings_path', default=None, type=str,
        help='Path to the file containing the Social-Chemistry-101 action embeddings.')
    parser.add_argument(
        '--num_actions', default=0, type=int, required=False,
        help='Number of actions to be generated for a single story prefix prior to ranking')
    parser.add_argument(
        '--predict_consequences', action='store_true',
        help='Whether to use consequences when ranking predicted action alternatives.')

    ## Other parameters
    parser.add_argument(
        '--config_name', default='', type=str,
        help='Pretrained config name or path if not the same as model_name')
    parser.add_argument(
        '--tokenizer_name', default='', type=str,
        help='Pretrained tokenizer name or path if not the same as model_name')
    parser.add_argument(
        '--cache_dir', default='', type=str,
        help='The cache directory where you want to store the pre-trained models downloaded from s3')
    parser.add_argument(
        '--max_seq_length', default=128, type=int,
        help='The maximum total input sequence length after tokenization. Sequences longer '
        'than this will be truncated, sequences shorter will be padded.')
    parser.add_argument(
        '--do_lower_case', action='store_true',
        help='Set this flag if you are using an uncased model.')
    parser.add_argument('--data_cache_dir', default=None, type=str,
                        help='The root directory for caching features.')
    parser.add_argument('--per_gpu_eval_batch_size', default=8, type=int,
                        help='Batch size per GPU/CPU for evaluation.')
    parser.add_argument(
        '--eval_all_checkpoints', action='store_true',
        help='Evaluate all checkpoints starting with the same prefix as model_name and ending '
        'with the step number')
    parser.add_argument('--no_cuda', action='store_true',
                        help='Avoid using CUDA when available')
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help='Overwrite the content of the output directory')
    parser.add_argument(
        '--overwrite_cache', action='store_true',
        help='Overwrite the cached training and evaluation sets')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed for initialization')
    parser.add_argument(
        '--fp16', action='store_true',
        help='Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit')
    parser.add_argument(
        '--fp16_opt_level', type=str, default='O1',
        help='For fp16: Apex AMP optimization level selected in [\'O0\', \'O1\', \'O2\', and \'O3\']. '
        'See details at https://nvidia.github.io/apex/amp.html')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='For distributed training: local_rank')
    parser.add_argument('--server_ip', type=str, default='',
                        help='For distant debugging.')
    parser.add_argument('--server_port', type=str, default='',
                        help='For distant debugging.')
    args = parser.parse_args()

    # Check if directories need to be created
    args.original_data_dir = args.data_dir
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Setup distant debugging, if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        logging.info('Waiting for debugger attach ...')
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        'Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s',
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Generate with refinement
    logger.info('Generating actions through iterative refinement:')
    initial_actions, refined_actions = action_refinement_with_ranking(
        args, 'test')
    logger.info('Self-BLEU between initial and refined action predictions:')
    logging.info(compute_bleu(initial_actions, refined_actions))

    logger.info('***** Experiment finished *****')
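The final compute_bleu call receives two lists of generated strings and reports how much refinement changed the actions. A minimal sketch of that comparison using NLTK's corpus_bleu as a stand-in (a substitute for the project's own compute_bleu, not its implementation):

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def self_bleu_sketch(initial_actions, refined_actions):
    # Treat each initial action as the single reference for its refined
    # counterpart and score the whole corpus at once; high BLEU means
    # refinement changed little.
    references = [[a.split()] for a in initial_actions]
    hypotheses = [a.split() for a in refined_actions]
    return corpus_bleu(references, hypotheses,
                       smoothing_function=SmoothingFunction().method1)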
def eval_bleu(
    train_loader: d.BatchedIterator,
    valid_loader: d.BatchedIterator,
    model: nn.Module,
    en_vocab: Vocabulary,
    fr_vocab: Vocabulary,
    device: str,
    multi_gpu: bool,
    eval_fast: bool,
    output_file: str,
) -> None:
    model = model.to(device)
    if output_file is not None:
        output_file = open(output_file, 'w')
    if multi_gpu and device == 'cuda':
        print('Using multi gpu training')
        model = torch.nn.DataParallel(model, device_ids=[0, 1]).cuda()
    bleus = []
    count = 0
    with tqdm(train_loader, total=len(train_loader)) as pbar:
        for i, data in enumerate(pbar):
            if i == 0:
                continue
            src, src_lengths = data.src
            trg, trg_lengths = data.trg
            if eval_fast:
                predicted = model.generate_max(src, src_lengths, 100, device)
            else:
                predicted = model.slow_generate(src, src_lengths, 100, device)

            pred_arr = utils.torchtext_convert_to_str(
                predicted.cpu().numpy(), fr_vocab)[0]
            out_arr = utils.torchtext_convert_to_str(
                trg.cpu().numpy(), fr_vocab)[0]
            pred_slim_arr = utils.get_raw_sentence(pred_arr)
            out_slim_arr = utils.get_raw_sentence(out_arr)
            curr_bleu = utils.compute_bleu(pred_slim_arr, out_slim_arr)
            bleus.append(curr_bleu)

            if output_file is not None:
                src_arr = utils.torchtext_convert_to_str(
                    src.cpu().numpy(), en_vocab)[0]
                src_slim_arr = utils.get_raw_sentence(src_arr)
                output = ' '.join(pred_slim_arr)
                actual_out = ' '.join(out_slim_arr)
                src = ' '.join(src_slim_arr)
                entry_str = '''
{DELIM}
BLEU = {BLEU}
src = {src}
target = {target}
predicted = {pred}
'''
                entry_str = entry_str.format(
                    DELIM=utils.create_entry_delim(),
                    BLEU=curr_bleu * 100,
                    src=src,
                    target=actual_out,
                    pred=output,
                )
                output_file.write(entry_str)
            count += 1
            pbar.set_postfix(
                curr_bleu=curr_bleu * 100,
                avg_bleu=(sum(bleus) / len(bleus) * 100),
            )
            pbar.refresh()
    if output_file is not None:
        output_file.write(utils.create_entry_delim() + "\n")
        output_file.write(
            'Average BLEU: {}\n'.format(sum(bleus) / len(bleus) * 100))
        output_file.close()
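Note that eval_bleu reports the mean of per-sentence BLEU scores, which generally differs from corpus-level BLEU (corpus BLEU pools n-gram counts before taking the geometric mean, so it is not the average of sentence scores). A small sketch contrasting the two with NLTK as a stand-in for utils.compute_bleu:

from nltk.translate.bleu_score import (corpus_bleu, sentence_bleu,
                                       SmoothingFunction)

def averaged_vs_corpus_bleu(pairs):
    # pairs: list of (predicted_tokens, target_tokens), as produced per
    # batch in eval_bleu above.
    smooth = SmoothingFunction().method1
    avg = sum(sentence_bleu([tgt], pred, smoothing_function=smooth)
              for pred, tgt in pairs) / len(pairs)
    corpus = corpus_bleu([[tgt] for _, tgt in pairs],
                         [pred for pred, _ in pairs],
                         smoothing_function=smooth)
    return avg, corpus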