                                 'val - aBot', metric, xlabel='Epochs')

            if 'logProbsMean' in rankMetrics:
                logProbsMean = params['CELossCoeff'] * rankMetrics['logProbsMean']
                viz.linePlot(iterId, logProbsMean, 'aBotLoss', 'val CE')

                if params['trainMode'] == 'sl-abot':
                    valLoss = logProbsMean
                    viz.linePlot(iterId, valLoss, 'loss', 'val loss')

        if qBot:
            print("qBot Validation:")
            rankMetrics, roundMetrics = rankQBot(qBot, dataset, 'val')
            for metric, value in rankMetrics.items():
                viz.linePlot(
                    epochId, value, 'val - qBot', metric, xlabel='Epochs')
            viz.linePlot(iterId, epochId, 'iter x epoch', 'epochs')

            if 'logProbsMean' in rankMetrics:
                logProbsMean = params['CELossCoeff'] * rankMetrics['logProbsMean']
                viz.linePlot(iterId, logProbsMean, 'qBotLoss', 'val CE')
                           split, scoringFunction=utils.maskedNll)
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - ABot Rank'
            viz.linePlot(iterId, value, plotName, metric, xlabel='Iterations')
            logging.info("Metric \"{}\": {}".format(metric, value))

    # if params['evalModeList'] == 'QBotRank':
    if 'QBotRank' in params['evalModeList']:
        print("Performing QBotRank evaluation")
        rankMetrics, roundRanks = rankQBot(qBot, dataset, split, verbose=1)
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - QBot Rank'
            viz.linePlot(iterId, value, plotName, metric, xlabel='Iterations')

        for r in range(numRounds + 1):
            for metric, value in roundRanks[r].items():
                plotName = '[Iter %d] %s - QABots Rank Roundwise' % \
                    (iterId, splitName)
                viz.linePlot(r, value, plotName, metric, xlabel='Round')

    # if params['evalModeList'] == 'QABotsRank':
    if 'QABotsRank' in params['evalModeList']:
        print("Performing QABotsRank evaluation")
        outputPredFile = "data/visdial/visdial/output_predictions_rollout.h5"
        rankMetrics, roundRanks = rankQABots(qBot,
def run_dialog(params, dataset, split, aBot, qBot=None, beamSize=1):
    assert aBot is not None or (qBot is not None and aBot is not None), \
        "Must provide either an A-Bot alone or both Q-Bot and A-Bot when generating dialog"
    rankMetrics, _ = rankQBot(qBot, dataset, 'val')

    old_split = dataset.split
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    train_questions = set()

    dataset.split = 'train'
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=False,
        num_workers=0,
        collate_fn=dataset.collate_fn)

    ind2word = dataset.ind2word
    to_str_gt = lambda w: str(" ".join([ind2word[x] for x in filter(
        lambda x: x > 0, w.data.cpu().numpy())]))  # .encode('utf-8','ignore')
    to_str_pred = lambda w, l: str(" ".join([ind2word[x] for x in list(filter(
        lambda x: x > 0, w.data.cpu().numpy()))][:l.data.cpu()[0]]))  # .encode('utf-8','ignore')

    # append all questions in train in a set to calculate downstream metrics
    for idx, batch in enumerate(dataloader):
        gtQuestions = Variable(batch['ques'], requires_grad=False)
        gtQuesLens = Variable(batch['ques_len'], requires_grad=False)
        if gtQuesLens.shape[0] < batchSize:
            break
        # iterate through the batch and add to the set
        for j in range(batchSize):
            for rnd in range(numRounds):
                question_str = to_str_pred(gtQuestions[j, rnd, :], gtQuesLens[j, rnd])
                train_questions.add(question_str[8:])

    print("train questions len:", len(train_questions))

    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=False,
        num_workers=0,
        collate_fn=dataset.collate_fn)

    text = {'data': []}
    if '%s_img_fnames' % split not in dataset.data.keys():
        print("[Error] Need coco directory and info as input "
              "to -cocoDir and -cocoInfo arguments for locating "
              "coco image files.")
        print("Exiting dialogDump without saving files.")
        return None

    getImgFileName = lambda x: dataset.data['%s_img_fnames' % split][x]
    getImgId = lambda x: int(getImgFileName(x)[:-4][-12:])

    similarity_scores_mean = Variable(torch.zeros(numRounds))
    norm_difference_scores_mean = Variable(torch.zeros(numRounds))
    norm_scores_mean = Variable(torch.zeros(numRounds))
    huber_scores_mean = Variable(torch.zeros(numRounds))

    if params["useGPU"]:
        similarity_scores_mean = similarity_scores_mean.cuda()
        norm_difference_scores_mean = norm_difference_scores_mean.cuda()
        norm_scores_mean = norm_scores_mean.cuda()
        huber_scores_mean = huber_scores_mean.cuda()

    tot_idx = 0
    output_dialog = True
    tot_examples = 0
    unique_questions = 0
    unique_questions_list = []
    mutual_overlap_list = []
    ent_1_list = []
    ent_2_list = []
    dist_1_list = []
    dist_2_list = []
    avg_precision_list = []
    bleu_metric = 0
    novel_questions = 0
    oscillating_questions_cnt = 0
    per_round_bleu = np.zeros(numRounds)
    ent_1 = 0
    ent_2 = 0

    for idx, batch in enumerate(dataloader):
        print("current batch:", idx)
        if idx > 3:
            output_dialog = False
        tot_idx = tot_idx + 1
        imgIds = [getImgId(x) for x in batch['index']]
        dialog = [{'dialog': [], 'image_id': imgId} for imgId in imgIds]

        if dataset.useGPU:
            batch = {
                key: v.cuda() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }

        image = Variable(batch['img_feat'], volatile=True)
        caption = Variable(batch['cap'], volatile=True)
        # ignoring the last (incomplete) batch
        if caption.size()[0] < batchSize:
            break
        captionLens = Variable(batch['cap_len'], volatile=True)

        if qBot is None:  # A-Bot alone needs ground truth dialog
            gtQuestions = Variable(batch['ques'], volatile=True)
            gtQuesLens = Variable(batch['ques_len'], volatile=True)
            gtAnswers = Variable(batch['ans'], volatile=True)
            gtAnsLens = Variable(batch['ans_len'], volatile=True)

        if aBot:
            aBot.eval(), aBot.reset()
            aBot.observe(
                -1, image=image, caption=caption, captionLens=captionLens)
        if qBot:
            qBot.eval(), qBot.reset()
            qBot.observe(-1, caption=caption, captionLens=captionLens)
        questions = []

        for j in range(batchSize):
            caption_str = to_str_gt(caption[j])[8:-6]
            dialog[j]['caption'] = caption_str

        past_dialog_hidden = None
        cur_dialog_hidden = None
        question_str_list = [[] for _ in range(batchSize)]
        gt_questions_str = [[] for _ in range(batchSize)]

        gtQuestions = Variable(batch['ques'], volatile=True)
        gtQuesLens = Variable(batch['ques_len'], volatile=True)
        gtAnswers = Variable(batch['ans'], volatile=True)
        gtAnsLens = Variable(batch['ans_len'], volatile=True)

        for round in range(numRounds):
            if aBot is not None and qBot is None:
                aBot.observe(
                    round,
                    ques=gtQuestions[:, round],
                    quesLens=gtQuesLens[:, round])
                aBot.observe(
                    round,
                    ans=gtAnswers[:, round],
                    ansLens=gtAnsLens[:, round])
                _ = aBot.forward()
                answers, ansLens = aBot.forwardDecode(
                    inference='greedy', beamSize=beamSize)
            elif aBot is not None and qBot is not None:
                questions, quesLens = qBot.forwardDecode(
                    beamSize=beamSize, inference='greedy')
                qBot.observe(round, ques=questions, quesLens=quesLens)
                aBot.observe(round, ques=questions, quesLens=quesLens)
                answers, ansLens = aBot.forwardDecode(
                    beamSize=beamSize, inference='greedy')
                aBot.observe(round, ans=answers, ansLens=ansLens)
                qBot.observe(round, ans=answers, ansLens=ansLens)
                qBot.encoder()

            cur_dialog_hidden = qBot.encoder.dialogHiddens[-1][0]
            if round == 0:
                past_dialog_hidden = qBot.encoder.dialogHiddens[-1][0]
            cos = nn.CosineSimilarity(dim=1, eps=1e-6)
            similarity_scores = cos(cur_dialog_hidden, past_dialog_hidden)
            norm_difference_scores = torch.abs(
                torch.norm(cur_dialog_hidden, p=2, dim=1) -
                torch.norm(past_dialog_hidden, p=2, dim=1))
            # calculate norm of the current dialog state
            norm_scores = torch.norm(cur_dialog_hidden, p=2, dim=1)
            # calculate Huber loss / difference at consecutive rounds with Huber threshold = 0.1
            threshold = 0.1
            norm_differences = torch.abs(cur_dialog_hidden - past_dialog_hidden)
            l2_mask = norm_differences <= threshold
            norm_differences_new = 0.5 * norm_differences * norm_differences * (l2_mask == 1).float()
            l1_mask = norm_differences > threshold
            norm_differences_new = norm_differences_new + (
                ((l1_mask == 1).float()) *
                (threshold * (norm_differences - (0.5 * threshold))))
            huber_scores = torch.sum(norm_differences_new, dim=1)

            past_dialog_hidden = cur_dialog_hidden
            similarity_scores_mean[round] = (
                similarity_scores_mean[round] + torch.mean(similarity_scores))
            norm_difference_scores_mean[round] = (
                norm_difference_scores_mean[round] + torch.mean(norm_difference_scores))
            norm_scores_mean[round] = norm_scores_mean[round] + torch.mean(norm_scores)
            huber_scores_mean[round] = huber_scores_mean[round] + torch.mean(huber_scores)

            for j in range(batchSize):
                question_str = to_str_pred(questions[j], quesLens[j]) \
                    if qBot is not None else to_str_gt(gtQuestions[j])
                gt_question_str = to_str_pred(gtQuestions[j, round, :], gtQuesLens[j, round])
                gt_questions_str[j].append(gt_question_str[8:])
                question_str_list[j].append(question_str[8:])
                answer_str = to_str_pred(answers[j], ansLens[j])
                if output_dialog:
                    if round == 0:
                        norm_score = float(norm_scores[j])
                        dialog[j]['dialog'].append({
                            "answer": answer_str[8:],
                            "question": question_str[8:] + ":" +
                                        "N:%.2f" % norm_score + " "
                        })  # "8:" for indexing out initial <START>
                    else:
                        similarity_score = float(similarity_scores[j])
                        norm_difference_score = float(norm_difference_scores[j])
                        norm_score = float(norm_scores[j])
                        huber_score = float(huber_scores[j])
                        dialog[j]['dialog'].append({
"answer": answer_str[8:], "question": question_str[8:] + ":" + "C:%.2f" % similarity_score + ";" + "NP:%.2f" % norm_difference_score + "H:%.2f" % huber_score + ";" + "N:%.2f" % norm_score + " " }) # "8:" for indexing out initial <START> per_round_bleu_batch = np.zeros((numRounds, batchSize)) for j in range(batchSize): # calculate bleu scores for each question str, with other questions as references to calculate # mutual overlap # also calculate round by round bleu score unigrams = [] bigrams = [] avg_bleu_score = 0 for rnd in range(numRounds): # Novel sentences metric cur_ques = question_str_list[j][rnd] gt_ques = gt_questions_str[j][rnd] if cur_ques not in train_questions: novel_questions += 1 # question oscillation metrics if rnd >= 2: if cur_ques == question_str_list[j][rnd-2]: oscillating_questions_cnt += 1 # bleu/mutual overlap metric references = [] for k in range(numRounds): if rnd != k: references.append(nltk.word_tokenize(question_str_list[j][k])) avg_bleu_score += sentence_bleu(references,nltk.word_tokenize(cur_ques)) per_round_bleu_batch[rnd][j] = sentence_bleu([nltk.word_tokenize(gt_ques)], nltk.word_tokenize(cur_ques)) unigrams.extend(list(ngrams(nltk.word_tokenize(cur_ques),1))) bigrams.extend(list(ngrams(nltk.word_tokenize(cur_ques),2))) avg_bleu_score /= float(numRounds) mutual_overlap_list.append(avg_bleu_score) bleu_metric += avg_bleu_score tot_tokens = len(unigrams) unigram_ctr = Counter(unigrams) bigram_ctr = Counter(bigrams) cur_ent_1 = get_entropy_ctr(unigram_ctr) ent_1 += cur_ent_1 ent_1_list.append(cur_ent_1) cur_ent_2 = get_entropy_ctr(bigram_ctr) ent_2 += cur_ent_2 ent_2_list.append(cur_ent_2) dist_1 = len(unigram_ctr.keys())/float(tot_tokens) dist_2 = len(bigram_ctr.keys())/float(tot_tokens) dist_1_list.append(dist_1) dist_2_list.append(dist_2) cur_unique_ques = len(set(question_str_list[j])) unique_questions += cur_unique_ques unique_questions_list.append(cur_unique_ques) # dialog[j]['caption'] += ':' + str(cur_unique_ques) tot_examples += batchSize if output_dialog: text['data'].extend(dialog) per_round_bleu += np.sum(per_round_bleu_batch,axis=1) avg_precision_list.extend(np.mean(per_round_bleu_batch,axis=0).tolist()) similarity_scores_mean = similarity_scores_mean * (1.0/tot_idx) norm_difference_scores_mean = norm_difference_scores_mean * (1.0/tot_idx) norm_scores_mean = norm_scores_mean *(1.0/tot_idx) huber_scores_mean = huber_scores_mean *(1.0/tot_idx) print("Mean Cos Similarity Scores:", similarity_scores_mean) print("Mean Difference of Norms Scores:", norm_difference_scores_mean) print("Mean Norm of Dialog State:", norm_scores_mean) print("Mean Huber Loss(Norm of differences):", huber_scores_mean) text['opts'] = { 'qbot': params['qstartFrom'], 'abot': params['startFrom'], 'backend': 'cudnn', 'beamLen': 20, 'beamSize': beamSize, 'decoder': params['decoder'], 'encoder': params['encoder'], 'gpuid': 0, 'imgNorm': params['imgNorm'], 'inputImg': params['inputImg'], 'inputJson': params['inputJson'], 'inputQues': params['inputQues'], 'loadPath': 'checkpoints/', 'maxThreads': 1, 'resultPath': 'dialog_output/results', 'sampleWords': 0, 'temperature': 1, 'useHistory': True, 'useIm': True, } unique_questions_arr = np.array(unique_questions_list) # converting metrics to numpy arrays similarity_scores_mean = similarity_scores_mean.cpu().data.numpy().tolist() norm_difference_scores_mean = norm_difference_scores_mean.cpu().data.numpy().tolist() norm_scores_mean = norm_scores_mean.cpu().data.numpy().tolist() huber_scores_mean = huber_scores_mean.cpu().data.numpy().tolist() 
    bleu_metric /= float(tot_examples)
    ent_1 /= float(tot_examples)
    ent_2 /= float(tot_examples)
    per_round_bleu = per_round_bleu / float(tot_examples)

    print("tot unique questions: ", unique_questions)
    print("tot examples: ", tot_examples)
    print("avg unique questions per example: ", float(unique_questions) / tot_examples)
    print("std unique questions per example: ", float(np.std(unique_questions_arr)))
    print("Mutual Overlap (Bleu Metric): ", bleu_metric)
    print("tot novel questions: ", novel_questions)
    tot_questions = tot_examples * numRounds
    print("tot questions: ", tot_questions)
    print("avg novel questions: ", float(novel_questions) / float(tot_questions))
    print("avg oscillating questions count", float(oscillating_questions_cnt) / tot_questions)
    print("oscillating questions count", oscillating_questions_cnt)

    dataset.split = old_split

    ret_metrics = {}
    ret_metrics["tot_unique_questions"] = unique_questions
    ret_metrics["tot_examples"] = tot_examples
    ret_metrics["mean_unique_questions"] = int((float(unique_questions) / tot_examples) * 100) / 100.0
    ret_metrics["std_unique_questions"] = int(float(np.std(unique_questions_arr)) * 100) / 100.0
    ret_metrics["similarity_scores_mean"] = similarity_scores_mean
    ret_metrics["norm_difference_scores_mean"] = norm_difference_scores_mean
    ret_metrics["norm_scores_mean"] = norm_scores_mean
    ret_metrics["huber_scores_mean"] = huber_scores_mean
    ret_metrics["mutual_overlap_score"] = bleu_metric
    ret_metrics["tot_novel_questions"] = novel_questions
    ret_metrics["avg_novel_questions"] = float(novel_questions) / float(tot_questions)
    ret_metrics["tot_questions"] = tot_questions
    ret_metrics['NLL'] = rankMetrics['logProbsMean']
    ret_metrics["average_precision"] = np.mean(per_round_bleu)
    ret_metrics["per_round_precision"] = per_round_bleu.tolist()
    ret_metrics["ent_1"] = ent_1
    ret_metrics["ent_2"] = ent_2
    ret_metrics["dist_1"] = np.mean(dist_1_list)
    ret_metrics["dist_2"] = np.mean(dist_2_list)

    # 95% confidence intervals over the per-example metric lists
    ret_metrics["average_precision_CI"] = (1.96 * np.std(avg_precision_list)) / math.sqrt(len(avg_precision_list))
    ret_metrics["ent_1_CI"] = (1.96 * np.std(ent_1_list)) / math.sqrt(len(ent_1_list))
    ret_metrics["ent_2_CI"] = (1.96 * np.std(ent_2_list)) / math.sqrt(len(ent_2_list))
    ret_metrics["unique_questions_CI"] = (1.96 * np.std(unique_questions_list)) / math.sqrt(len(unique_questions_list))
    ret_metrics["mutual_overlap_CI"] = (1.96 * np.std(mutual_overlap_list)) / math.sqrt(len(mutual_overlap_list))
    ret_metrics["dist_1_CI"] = (1.96 * np.std(dist_1_list)) / math.sqrt(len(dist_1_list))
    ret_metrics["dist_2_CI"] = (1.96 * np.std(dist_2_list)) / math.sqrt(len(dist_2_list))

    return text, ret_metrics
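
# --- Illustrative helper (added sketch, not part of the original file) ---
# run_dialog() above computes the Ent-1 / Ent-2 diversity metrics through a
# helper named `get_entropy_ctr`, which is assumed to be defined elsewhere in
# this module. The sketch below shows what that helper is assumed to compute:
# the Shannon entropy (in nats) of the normalized n-gram frequency distribution
# held in a collections.Counter. The name `_entropy_from_counter` is
# hypothetical and chosen so it does not clash with the real implementation.
def _entropy_from_counter(ctr):
    """Shannon entropy of the normalized count distribution in `ctr`."""
    total = float(sum(ctr.values()))
    probs = [count / total for count in ctr.values()]
    return -sum(p * math.log(p) for p in probs if p > 0)

# Example usage, mirroring how the unigram Counter is built in run_dialog():
#   unigram_ctr = Counter(ngrams(nltk.word_tokenize("is it a cat ?"), 1))
#   ent_1 = _entropy_from_counter(unigram_ctr)
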
def main(params):
    aqmSetting = None
    if ("AQMBotRank" in params["evalModeList"]
            or "AQMdialog" in params["evalModeList"]
            or "AQMdemo" in params["evalModeList"]):
        aqmSetting = getAQMSetting(params)

    # setup dataloader
    dlparams = params.copy()
    dlparams['useIm'] = True
    dlparams['useHistory'] = True
    dlparams['numRounds'] = 10
    splits = ['val', 'test']

    dataset = VisDialDataset(dlparams, splits)

    # Transferring dataset parameters
    transfer = ['vocabSize', 'numOptions', 'numRounds']
    for key in transfer:
        if hasattr(dataset, key):
            params[key] = getattr(dataset, key)

    if 'numRounds' not in params:
        params['numRounds'] = 10

    # Always load checkpoint parameters with continue flag
    params['continue'] = True

    excludeParams = ['batchSize', 'visdomEnv', 'startFrom', 'qstartFrom', 'trainMode',
                     'evalModeList', 'inputImg', 'inputQues', 'inputJson', 'evalTitle',
                     'beamSize', 'enableVisdom', 'visdomServer', 'visdomServerPort',
                     'randomCaption', 'zeroCaption', 'numImg', 'numQ', 'numA', 'alpha',
                     'qbeamSize', 'gamma', 'delta', 'lambda', 'onlyGuesser', 'randQ',
                     'gen1Q', 'gtQ', 'randA', 'noHistory', 'slGuesser',
                     'resampleEveryDialog']

    aBot = None
    qBot = None
    aqmBot = None

    # load aBot
    print('load aBot')
    if params['startFrom']:
        aBot, loadedParams, _ = utils.loadModel(params, 'abot', overwrite=True)
        assert aBot.encoder.vocabSize == dataset.vocabSize, "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        aBot.eval()
        # Retaining certain dataloader parameters
        for key in excludeParams:
            params[key] = dlparams[key]

    # load qBot
    print('load qBot')
    if params['qstartFrom'] and not params['aqmstartFrom']:
        qBot, loadedParams, _ = utils.loadModel(params, 'qbot', overwrite=True)
        assert qBot.encoder.vocabSize == params['vocabSize'], "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        qBot.eval()
        # Retaining certain dataloader parameters
        for key in excludeParams:
            params[key] = dlparams[key]

    # load aqmBot
    print('load AQM-Bot')
    if params['aqmstartFrom']:  # abot of AQM
        assert params['qstartFrom']  # qbot of AQM
        aqmBot, loadedParams, _ = utils.loadModel(params, 'AQM-qbot', overwrite=True)
        assert aqmBot.questioner.encoder.vocabSize == params['vocabSize'], "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        aqmBot.eval()
        # load qBot of AQM
        for key in excludeParams:
            params[key] = dlparams[key]
        aqmQ, loadedParams, _ = utils.loadModel(params, 'qbot', overwrite=True)
        assert aqmQ.encoder.vocabSize == params['vocabSize'], "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        aqmQ.eval()
        for key in excludeParams:
            params[key] = dlparams[key]
        aqmBot.setQuestioner(aqmQ)
    elif params['aqmQStartFrom']:
        from visdial.models.aqm_questioner import AQMQuestioner
        aqmBot = AQMQuestioner()
        aqmBot.eval()

        params['qstartFrom'] = params['aqmQStartFrom']
        aqmQ, loadedParams, _ = utils.loadModel(params, 'qbot', overwrite=True)
        assert aqmQ.encoder.vocabSize == params['vocabSize'], "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        aqmQ.eval()
        for key in excludeParams:
            params[key] = dlparams[key]
        aqmBot.setQuestioner(aqmQ)

        params['startFrom'] = params['aqmAStartFrom']
        aqmA, loadedParams, _ = utils.loadModel(params, 'abot', overwrite=True)
        assert aqmA.encoder.vocabSize == dataset.vocabSize, "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        aqmA.eval()
        aqmBot.setAppAnswerer(aqmA)
        for key in excludeParams:
            params[key] = dlparams[key]

    pprint.pprint(params)
    # viz.addText(pprint.pformat(params, indent=4))
    print("Running evaluation!")

    numRounds = params['numRounds']
    if 'ckpt_iterid' in params:
        iterId = params['ckpt_iterid'] + 1
    else:
        iterId = -1

    if 'test' in splits:
        split = 'test'
        splitName = 'test - {}'.format(params['evalTitle'])
    else:
        split = 'val'
        splitName = 'full Val - {}'.format(params['evalTitle'])
    print("Using split %s" % split)
    dataset.split = split

    if 'ABotRank' in params['evalModeList']:
        if params['aqmstartFrom']:
            aBot = aqmBot.appAnswerer
            print('evaluating appBot of AQM')
        print("Performing ABotRank evaluation")
        rankMetrics = rankABot(
            aBot, dataset, split,
            scoringFunction=utils.maskedNll,
            expLowerLimit=params['expLowerLimit'],
            expUpperLimit=params['expUpperLimit'])
        print(rankMetrics)
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - ABot Rank'
            # viz.linePlot(iterId, value, plotName, metric, xlabel='Iterations')

    if 'QBotRank' in params['evalModeList']:
        print("Performing QBotRank evaluation")
        rankMetrics, roundRanks = rankQBot(
            qBot, dataset, split,
            expLowerLimit=params['expLowerLimit'],
            expUpperLimit=params['expUpperLimit'],
            verbose=1)
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - QBot Rank'
            # viz.linePlot(iterId, value, plotName, metric, xlabel='Iterations')

        for r in range(numRounds + 1):
            for metric, value in roundRanks[r].items():
                plotName = '[Iter %d] %s - QABots Rank Roundwise' % \
                    (iterId, splitName)
                # viz.linePlot(r, value, plotName, metric, xlabel='Round')

    if 'QABotsRank' in params['evalModeList']:
        print("Performing QABotsRank evaluation")
        outputPredFile = "data/visdial/visdial/output_predictions_rollout.h5"
        rankMetrics, roundRanks = rankQABots(
            qBot, aBot, dataset, split,
            beamSize=params['beamSize'],
            expLowerLimit=params['expLowerLimit'],
            expUpperLimit=params['expUpperLimit'],
            zeroCaption=params['zeroCaption'],
            randomCaption=params['randomCaption'],
            numRounds=params['runRounds'])
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - QABots Rank'
            # viz.linePlot(iterId, value, plotName, metric, xlabel='Iterations')

        for r in range(numRounds + 1):
            for metric, value in roundRanks[r].items():
                plotName = '[Iter %d] %s - QBot All Metrics vs Round' % \
                    (iterId, splitName)
                # viz.linePlot(r, value, plotName, metric, xlabel='Round')

    if 'AQMBotRank' in params['evalModeList']:
        print("Performing AQMBotRank evaluation")
        outputPredFile = "data/visdial/visdial/output_predictions_rollout.h5"
        rankMetrics, roundRanks = AQMRunner(
            aqmBot, aBot, dataset, split,
            beamSize=params['beamSize'],
            realQA=params['aqmRealQA'],
            saveLogs=params['saveLogs'],
            showQA=params['showQA'],
            expLowerLimit=params['expLowerLimit'],
            expUpperLimit=params['expUpperLimit'],
            selectedBatchIdxs=params['selectedBatchIdxs'],
            numRounds=params['runRounds'],
            lda=params['lambda'],
            onlyGuesser=params['onlyGuesser'],
            numQ=params['numQ'],
            qbeamSize=params['qbeamSize'],
            numImg=params['numImg'],
            alpha=params['alpha'],
            numA=params['numA'],
            randQ=params['randQ'],
            randA=params['randA'],
            zeroCaption=params['zeroCaption'],
            randomCaption=params['randomCaption'],
            gamma=params['gamma'],
            delta=params['delta'],
            gen1Q=params['gen1Q'],
            gtQ=params['gtQ'],
            noHistory=params['noHistory'],
            slGuesser=params['slGuesser'],
            resampleEveryDialog=params['resampleEveryDialog'],
            aqmSetting=aqmSetting,
        ).rankQuestioner()
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - QABots Rank'
            # viz.linePlot(iterId, value, plotName, metric, xlabel='Iterations')

        for r in range(numRounds + 1):
            for metric, value in roundRanks[r].items():
                plotName = '[Iter %d] %s - QBot All Metrics vs Round' % \
                    (iterId, splitName)
                # viz.linePlot(r, value, plotName, metric, xlabel='Round')

    if 'dialog' in params['evalModeList']:
        print("Performing dialog generation...")
        split = 'test'
        outputFolder = "dialog_output/results"
        os.makedirs(outputFolder, exist_ok=True)
        outputPath = os.path.join(outputFolder, "results.json")
        dialogDump(
            params, dataset, split,
            aBot=aBot,
            qBot=qBot,
            expLowerLimit=params['expLowerLimit'],
            expUpperLimit=params['expUpperLimit'],
            beamSize=params['beamSize'],
            savePath=outputPath)

    if 'AQMdialog' in params['evalModeList']:
        print("Performing AQM dialog generation...")
        split = 'test'
        AQMRunner(
            aqmBot, aBot, dataset, split,
            beamSize=params['beamSize'],
            realQA=params['aqmRealQA'],
            saveLogs=params['saveLogs'],
            showQA=params['showQA'],
            expLowerLimit=params['expLowerLimit'],
            expUpperLimit=params['expUpperLimit'],
            selectedBatchIdxs=params['selectedBatchIdxs'],
            numRounds=params['runRounds'],
            lda=params['lambda'],
            onlyGuesser=params['onlyGuesser'],
            numQ=params['numQ'],
            qbeamSize=params['qbeamSize'],
            numImg=params['numImg'],
            alpha=params['alpha'],
            numA=params['numA'],
            randQ=params['randQ'],
            randA=params['randA'],
            zeroCaption=params['zeroCaption'],
            randomCaption=params['randomCaption'],
            gamma=params['gamma'],
            delta=params['delta'],
            gen1Q=params['gen1Q'],
            gtQ=params['gtQ'],
            noHistory=params['noHistory'],
            slGuesser=params['slGuesser'],
            resampleEveryDialog=params['resampleEveryDialog'],
            aqmSetting=aqmSetting,
        ).dialogDump(params)
    if 'ABotRank' in params['evalModeList']:
        print("Performing ABotRank evaluation")
        rankMetrics = rankABot(aBot, dataset, split, scoringFunction=utils.maskedNll)
        print(rankMetrics)
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - ABot Rank'

    if 'QBotRank' in params['evalModeList']:
        print("Performing QBotRank evaluation")
        rankMetrics, roundRanks = rankQBot(
            qBot, dataset, split, verbose=1,
            exampleLimit=1400 * params['batchSize'])
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - QBot Rank'

        for r in range(numRounds + 1):
            for metric, value in roundRanks[r].items():
                plotName = '[Iter %d] %s - QABots Rank Roundwise' % \
                    (iterId, splitName)

    if 'QABotsRank' in params['evalModeList']:
        print("Performing QABotsRank evaluation")
        outputPredFile = "/hhd/lvxinyu/visdial-pytorch/data/visdial/visdial/output_predictions_rollout.h5"
        rankMetrics, roundRanks = rankQABots(qBot, aBot,