def __init__(self, keras_model, dataset_provider, test_dataset_provider):
    self._model = keras_model
    # Captions are generated over the test split; the caption preprocessor
    # is taken from the (training) dataset provider.
    self._dataset_provider = test_dataset_provider
    self._preprocessor = dataset_provider.caption_preprocessor
    # BERT() can be appended to this list as well.
    self._metrics = [BLEU(4), METEOR(), CIDEr(), ROUGE(), SPICE()]
    self._max_caption_length = 20
    self._beam_size = 1
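# A minimal sketch of the interface the metric objects above are assumed to
# expose (the same calling convention used by calculate_metrics() and the
# validate() functions below): calculate(hypotheses, references) takes two
# dicts keyed by an image id, each value being a list of {'caption': <str>}
# entries. The captions here are made-up placeholders.
def _example_metric_call():
    hypotheses = {0: [{'caption': 'a woman in a red dress walks outside'}]}
    references = {0: [{'caption': 'a woman wearing a red dress walks down the street'},
                      {'caption': 'a lady in a red gown strolls outdoors'}]}
    scores = BLEU(4).calculate(hypotheses, references)
    return scores['bleu_4']  # key observed in validate() below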
def calculate_metrics(id_to_prediction_path, id_to_references_path, model_name):
    # Predictions are stored as YAML, references as a pickled dict.
    # id_to_prediction = pickle.load(open(id_to_prediction_path, 'rb'))
    id_to_prediction = yaml.safe_load(open(id_to_prediction_path, 'r'))
    id_to_references = pickle.load(open(id_to_references_path, 'rb'))

    metrics_list = [BLEU(4), METEOR(), CIDEr(), ROUGE(), SPICE(), BERT()]
    metrics_value = {}
    for metric in metrics_list:
        metric_name_to_value = metric.calculate(id_to_prediction, id_to_references)
        metrics_value.update(metric_name_to_value)

    save_path = './results/Flickr30KfusedFashion1101/evaluation-results/'
    name = os.path.join(
        save_path, 'test-fused-metrics-{}-{}-{}-cloth_unbalance.yaml'.format(
            3, 20, model_name))
    write_yaml_file(metrics_value, name)
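# A minimal driver sketch for calculate_metrics(). The file paths and model
# name below are hypothetical placeholders; the prediction YAML and reference
# pickle are assumed to already hold dicts in the id -> [{'caption': ...}]
# format the metric classes consume.
def _example_calculate_metrics():
    calculate_metrics(
        id_to_prediction_path='./results/Flickr30KfusedFashion1101/predictions.yaml',
        id_to_references_path='./results/Flickr30KfusedFashion1101/references.pkl',
        model_name='fused-baseline')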
def validate(val_loader, model, word_map, args, start_epoch,
             beam_search_type='dbs', beam_size=3):
    model.eval()
    rev_word_map = {v: k for k, v in word_map.items()}

    with torch.no_grad():
        references = {}  # references (true captions) for calculating BLEU-4 score
        hypotheses = {}  # hypotheses (predictions)
        # Each image may have multiple predictions, so separate dicts keyed by
        # filename collect all captions and references for one image.
        prediction_save = {}
        gt_save = {}
        image_id = 0

        for i, (imgs, allcaps, caplens, img_filenames) in enumerate(val_loader):
            imgs = imgs.cuda()
            if beam_search_type == 'dbs':
                sentences = model.diverse_beam_search(imgs, beam_size, word_map)
            elif beam_search_type == 'beam_search':
                sentences, _ = model.beam_search(imgs, word_map, beam_size=beam_size)
            elif beam_search_type == 'greedy':
                sentences, _ = model.greedy_search(imgs, word_map)
            else:
                raise NotImplementedError(
                    "please specify the decoding method as one of "
                    "'dbs', 'beam_search' or 'greedy'")

            # assert len(sentences) == batch_size
            img_filename = img_filenames[0]
            if img_filename not in prediction_save:
                prediction_save[img_filename] = []
                gt_save[img_filename] = []

            for idx, sentence in enumerate(sentences):
                if image_id not in hypotheses:
                    hypotheses[image_id] = []
                    references[image_id] = []
                hypotheses[image_id].append({'caption': sentence})
                prediction_save[img_filename].append(sentence)

                for ref_item in allcaps[0]:
                    # Strip special tokens before decoding the reference caption.
                    enc_ref = [
                        w.item() for w in ref_item if w.item() not in {
                            word_map['<start>'], word_map['<end>'],
                            word_map['<pad>'], word_map['<unk>']
                        }
                    ]
                    ref = ' '.join(rev_word_map[token] for token in enc_ref)
                    if ref not in gt_save[img_filename]:
                        gt_save[img_filename].append(ref)
                    references[image_id].append({'caption': ref})
                image_id += 1

        results_dict = {}
        print("Calculating Evaluation Metric Scores......\n")
        avg_bleu_dict = BLEU().calculate(hypotheses, references)
        bleu4 = avg_bleu_dict['bleu_4']
        avg_cider_dict = CIDEr().calculate(hypotheses, references)
        cider = avg_cider_dict['cider']
        avg_bert_dict = BERT().calculate(hypotheses, references)
        bert = avg_bert_dict['bert']
        avg_spice_dict = SPICE().calculate(hypotheses, references)
        avg_rouge_dict = ROUGE().calculate(hypotheses, references)
        avg_meteor_dict = METEOR().calculate(hypotheses, references)
        print(
            f'Evaluation results, BLEU-4: {bleu4}, CIDEr: {cider}, '
            f'SPICE: {avg_spice_dict["spice"]}, ROUGE: {avg_rouge_dict["rouge"]}')

        results_dict.update(avg_bert_dict)
        results_dict.update(avg_bleu_dict)
        results_dict.update(avg_cider_dict)
        results_dict.update(avg_rouge_dict)
        results_dict.update(avg_spice_dict)
        results_dict.update(avg_meteor_dict)

        # write the predictions and ground truth to files
        prediction_filename = (
            f'predictions_{args.dataset}_split_{args.test_split}_'
            f'{beam_search_type}_{beam_size}_epoch{start_epoch}.yaml')
        gt_filename = (
            f'reference_{args.dataset}_split_{args.test_split}_'
            f'{beam_search_type}_{beam_size}.yaml')
        with open(os.path.join(args.save_path, args.encoder, args.dataset,
                               prediction_filename), 'w') as f:
            yaml.safe_dump(prediction_save, f)
        with open(os.path.join(args.save_path, args.encoder, args.dataset,
                               gt_filename), 'w') as f:
            yaml.safe_dump(gt_save, f)

        # write the evaluation metrics to a file
        metrics_filename = (
            f'metrics_{args.dataset}_split_{args.test_split}_'
            f'{beam_search_type}_{beam_size}_epoch{start_epoch}.yaml')
        with open(os.path.join(args.save_path, args.encoder, args.dataset,
                               metrics_filename), 'w') as f:
            yaml.safe_dump(results_dict, f)
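# A hedged sketch of invoking validate() from an evaluation script. Building
# the val_loader, model and word_map is project specific; SimpleNamespace is
# used here only to illustrate which fields validate() reads from args
# (save_path, encoder, dataset, test_split), and all values are placeholders.
def _example_run_validation(val_loader, model, word_map, start_epoch):
    from types import SimpleNamespace
    args = SimpleNamespace(
        save_path='./results',  # hypothetical output root
        encoder='resnet101',    # hypothetical encoder name
        dataset='flickr30k',    # hypothetical dataset name
        test_split='test')      # hypothetical split name
    # validate() writes YAML files under this directory, so make sure it exists.
    os.makedirs(os.path.join(args.save_path, args.encoder, args.dataset),
                exist_ok=True)
    validate(val_loader, model, word_map, args, start_epoch,
             beam_search_type='beam_search', beam_size=3)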
def validate(val_loader, model, word_map, beam_size, epoch,
             beam_search_type='greedy'):
    model.eval()
    rev_word_map = {v: k for k, v in word_map.items()}

    with torch.no_grad():
        references = {}  # references (true captions) for calculating BLEU-4 score
        hypotheses = {}  # hypotheses (predictions)
        # Each image may have multiple predictions, so separate dicts keyed by
        # filename collect all captions and references for one image.
        prediction_save = {}
        gt_save = {}
        image_id = 0

        for i, (imgs, allcaps, caplens, img_filenames) in enumerate(val_loader):
            imgs = imgs.cuda()
            if beam_search_type == 'dbs':
                sentences = model.diverse_beam_search(imgs, beam_size, word_map)
            elif beam_search_type == 'beam_search':
                sentences, _ = model.beam_search(imgs, word_map, beam_size=beam_size)
            elif beam_search_type == 'greedy':
                sentences, _ = model.greedy_search(imgs, word_map)
            else:
                raise NotImplementedError(
                    "please specify the decoding method as one of "
                    "'dbs', 'beam_search' or 'greedy'")

            # assert len(sentences) == batch_size
            img_filename = img_filenames[0]
            if img_filename not in prediction_save:
                prediction_save[img_filename] = []
                gt_save[img_filename] = []

            for idx, sentence in enumerate(sentences):
                if image_id not in hypotheses:
                    hypotheses[image_id] = []
                    references[image_id] = []
                hypotheses[image_id].append({'caption': sentence})
                prediction_save[img_filename].append(sentence)

                for ref_item in allcaps[0]:
                    # Strip special tokens before decoding the reference caption.
                    enc_ref = [
                        w.item() for w in ref_item if w.item() not in {
                            word_map['<start>'], word_map['<end>'],
                            word_map['<pad>'], word_map['<unk>']
                        }
                    ]
                    ref = ' '.join(rev_word_map[token] for token in enc_ref)
                    if ref not in gt_save[img_filename]:
                        gt_save[img_filename].append(ref)
                    references[image_id].append({'caption': ref})
                image_id += 1

        print("Calculating Evaluation Metric Scores......\n")
        avg_bleu_dict = BLEU().calculate(hypotheses, references)
        bleu4 = avg_bleu_dict['bleu_4']
        avg_cider_dict = CIDEr().calculate(hypotheses, references)
        cider = avg_cider_dict['cider']
        avg_spice_dict = SPICE().calculate(hypotheses, references)
        avg_rouge_dict = ROUGE().calculate(hypotheses, references)
        print(
            f'Evaluation results at Epoch {epoch}, BLEU-4: {bleu4}, CIDEr: {cider}, '
            f'SPICE: {avg_spice_dict["spice"]}, ROUGE: {avg_rouge_dict["rouge"]}')

    return bleu4, cider
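# A hedged sketch of using this lighter validate() for checkpoint selection
# during training: keep the weights that score the best CIDEr. The training
# step and checkpoint path are placeholders, not the project's actual trainer.
def _example_select_best_checkpoint(train_one_epoch, val_loader, model, word_map,
                                    num_epochs, beam_size=3):
    best_cider = 0.0
    for epoch in range(num_epochs):
        train_one_epoch(model, epoch)  # hypothetical per-epoch training step
        bleu4, cider = validate(val_loader, model, word_map, beam_size, epoch,
                                beam_search_type='greedy')
        if cider > best_cider:
            best_cider = cider
            torch.save(model.state_dict(), f'best_model_epoch{epoch}.pth')
    return best_cider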