def read_res_file(res_fname, format):
    """Read a search-engine results file and group relevance labels by question.

    Args:
        res_fname: Path of the results file; every line is parsed into
            ``(qid, aid, relevant, ir_score)`` by ``ResFileReader.read_line``.
        format: Passed unchanged to the ``ResFileReader`` constructor.

    Returns:
        A ``defaultdict(list)`` mapping each question id to the relevance
        labels of its candidates, ordered by descending search engine score.
    """
    logging.info("Processing file: %s with search engine ranks" % res_fname)
    lineReader = ResFileReader(format)

    ir = defaultdict(list)
    # The context manager closes the file even when a line fails to parse.
    with open(res_fname) as res_file:
        for line_res in res_file:
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            ir[qid].append((relevant, ir_score))

    # Sort based on the search engine score (largest to smallest).
    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only);
    # only existing keys are reassigned, so the dict size never changes here.
    for qid, resList in ir.items():
        ranked = sorted(resList, key=itemgetter(1), reverse=True)
        ir[qid] = [rel for rel, score in ranked]
    return ir
def read_res_pred_files(res_fname, pred_fname, format, verbose=True,
                        reranking_th=0.0, ignore_noanswer=False):
    """Read a results file and a prediction file in lockstep and rank the labels.

    Line *i* of ``pred_fname`` is parsed as the float prediction score of the
    candidate found on line *i* of ``res_fname``. Reading stops at the end of
    the shorter file.

    Args:
        res_fname: Path of the results file, parsed with
            ``ResFileReader.read_line``.
        pred_fname: Path of the prediction file (one float per line).
        format: Passed unchanged to the ``ResFileReader`` constructor.
        verbose: When true, writes one ``"<qid> <improvement>"`` line per
            question to ``pred_fname + ".analyzis"``.
        reranking_th: A question is re-ranked by prediction score only when
            its best prediction score is at least this value; otherwise its
            candidates keep their file order.
        ignore_noanswer: When true, drops questions whose candidates are all
            labelled ``"false"``.

    Returns:
        ``(ir, svm)``: two ``defaultdict(list)`` objects mapping question id
        to relevance labels, ordered by descending search engine score and by
        the (possibly skipped) prediction-score re-ranking respectively.
    """
    lineReader = ResFileReader(format)

    ir, svm = defaultdict(list), defaultdict(list)
    # Both inputs are closed as soon as reading finishes or fails.
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for line_res, line_pred in zip(res_file, pred_file):
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            pred_score = float(line_pred.strip())
            ir[qid].append((relevant, ir_score, aid))
            svm[qid].append((relevant, pred_score, aid))

    analyze_file = open(pred_fname + ".analyzis", "w") if verbose else None
    try:
        # Remove questions that contain no correct answer.
        if ignore_noanswer:
            # Iterate over a snapshot of the keys: deleting while iterating a
            # live keys() view raises RuntimeError on Python 3.
            for qid in list(ir.keys()):
                if all(relevant == "false" for relevant, _, _ in ir[qid]):
                    del ir[qid]
                    del svm[qid]

        for qid in ir:
            # Sort by IR score.
            ir_sorted = sorted(ir[qid], key=itemgetter(1), reverse=True)

            # Sort by SVM prediction score, unless the best score is below
            # the re-ranking threshold.
            svm_sorted = svm[qid]
            max_score = max(score for rel, score, aid in svm_sorted)
            if max_score >= reranking_th:
                svm_sorted = sorted(svm_sorted, key=itemgetter(1), reverse=True)

            if verbose:
                before = find_correct_answer_position(ir_sorted)
                after = find_correct_answer_position(svm_sorted)
                impr = analyze_reranking_improvement(before, after)
                analyze_file.write("%s %s\n" % (qid, str(impr)))

            # Keep only the labels, in ranked order.
            ir[qid] = [rel for rel, score, aid in ir_sorted]
            svm[qid] = [rel for rel, score, aid in svm_sorted]
    finally:
        # Close the report even if ranking or a helper raised.
        if analyze_file is not None:
            analyze_file.close()

    return ir, svm
def read_res_pred_files(res_fname, pred_fname, format, verbose=True,
                        reranking_th=0.0, ignore_noanswer=False):
    """Pair result lines with prediction scores and return ranked labels.

    The two files are consumed line by line in parallel: each line of
    ``res_fname`` is parsed by ``ResFileReader.read_line`` and the matching
    line of ``pred_fname`` is converted to a float score. Extra lines in the
    longer file are ignored.

    Args:
        res_fname: Path of the results file.
        pred_fname: Path of the prediction file (one float per line).
        format: Passed unchanged to the ``ResFileReader`` constructor.
        verbose: When true, a per-question improvement report is written to
            ``pred_fname + ".analyzis"``.
        reranking_th: Minimum best prediction score a question needs for its
            candidates to be re-sorted by prediction score.
        ignore_noanswer: When true, questions with only ``"false"`` labels
            are removed from both results.

    Returns:
        ``(ir, svm)``: question id -> list of relevance labels, ranked by
        search engine score (``ir``) and by prediction score (``svm``).
    """
    lineReader = ResFileReader(format)

    ir, svm = defaultdict(list), defaultdict(list)
    # "with" guarantees both input handles are released.
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for line_res, line_pred in zip(res_file, pred_file):
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            pred_score = float(line_pred.strip())
            ir[qid].append((relevant, ir_score, aid))
            svm[qid].append((relevant, pred_score, aid))

    analyze_file = None
    try:
        if verbose:
            analyze_file = open(pred_fname + ".analyzis", "w")

        # Remove questions that contain no correct answer.
        if ignore_noanswer:
            # Collect first, delete afterwards: mutating the dict while
            # iterating over it fails on Python 3.
            unanswered = [
                qid for qid, candidates in ir.items()
                if all(relevant == "false" for relevant, _, _ in candidates)
            ]
            for qid in unanswered:
                del ir[qid]
                del svm[qid]

        for qid in ir:
            # Sort by IR score.
            ir_sorted = sorted(ir[qid], key=itemgetter(1), reverse=True)

            # Sort by SVM prediction score when the threshold is reached;
            # otherwise the candidates stay in file order.
            svm_sorted = svm[qid]
            max_score = max(score for rel, score, aid in svm_sorted)
            if max_score >= reranking_th:
                svm_sorted = sorted(svm_sorted, key=itemgetter(1), reverse=True)

            if verbose:
                before = find_correct_answer_position(ir_sorted)
                after = find_correct_answer_position(svm_sorted)
                impr = analyze_reranking_improvement(before, after)
                analyze_file.write("%s %s\n" % (qid, str(impr)))

            ir[qid] = [rel for rel, score, aid in ir_sorted]
            svm[qid] = [rel for rel, score, aid in svm_sorted]
    finally:
        # The report is closed on success and on failure alike.
        if analyze_file is not None:
            analyze_file.close()

    return ir, svm
def read_res_file(res_fname, format):
    """Load search-engine results and return per-question ranked labels.

    Args:
        res_fname: Path of the results file; each line is split into
            ``(qid, aid, relevant, ir_score)`` by ``ResFileReader.read_line``.
        format: Passed unchanged to the ``ResFileReader`` constructor.

    Returns:
        A ``defaultdict(list)`` mapping question id to the relevance labels
        of its candidates, sorted by search engine score, largest first.
    """
    logging.info("Processing file: %s with search engine ranks" % res_fname)
    lineReader = ResFileReader(format)

    ir = defaultdict(list)
    # Open through a context manager so the handle is not leaked.
    with open(res_fname) as res_file:
        for line_res in res_file:
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            ir[qid].append((relevant, ir_score))

    # Sort based on the search engine score (largest to smallest).
    for qid, resList in ir.items():
        ir[qid] = [
            rel
            for rel, score in sorted(resList, key=itemgetter(1), reverse=True)
        ]
    return ir
def read_truth_file(infile, format, cut_truth_map_at_N=None):
    """Build a ``(qid, aid) -> relevant`` map from a truth file.

    Args:
        infile: Path of the truth file; each line is parsed with
            ``ResFileReader.read_line_trec``.
        format: Passed unchanged to the ``ResFileReader`` constructor.
        cut_truth_map_at_N: When ``None`` every line is kept. Otherwise only
            the first N lines of each run of consecutive lines sharing the
            same question id are kept; a question id that reappears after a
            different one starts a new run.

    Returns:
        A dict mapping ``(qid, aid)`` tuples to the parsed relevance label.
    """
    truth = {}
    r = ResFileReader(format)
    prevqid = ""
    idcou = 0  # position of the current line within its run of equal qids
    # The context manager closes the file even when parsing raises.
    with open(infile, 'r') as truth_file:
        for line in truth_file:
            qid, aid, relevant, ir_score = r.read_line_trec(line)
            if cut_truth_map_at_N is None:
                truth[(qid, aid)] = relevant
                continue

            if qid == prevqid:
                idcou += 1
            else:
                prevqid = qid
                idcou = 1
            if idcou <= cut_truth_map_at_N:
                truth[(qid, aid)] = relevant
    return truth
def read_res_file(res_fname, format):
    """Read a search-engine results file into per-question ranked labels.

    Args:
        res_fname: Path of the results file; every line is parsed into
            ``(qid, aid, relevant, ir_score)`` by ``ResFileReader.read_line``.
        format: Passed unchanged to the ``ResFileReader`` constructor.

    Returns:
        A ``defaultdict(list)`` mapping each question id to the relevance
        labels of its candidates, ordered by descending search engine score.
    """
    logging.info("Processing file: %s with search engine ranks" % res_fname)
    lineReader = ResFileReader(format)

    ir = defaultdict(list)
    # The previous-question bookkeeping only guarded a no-op branch, so it
    # was dropped; the file is now closed deterministically.
    with open(res_fname) as res_file:
        for line_res in res_file:
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            ir[qid].append((relevant, ir_score))

    # Sort based on the search engine score (largest to smallest).
    # items() works on Python 2 and 3 alike, unlike iteritems().
    for qid, resList in ir.items():
        ranked = sorted(resList, key=itemgetter(1), reverse=True)
        ir[qid] = [rel for rel, score in ranked]
    return ir
def read_res_file(res_fname, format):
    """Group the relevance labels of a results file by question, best first.

    Args:
        res_fname: Path of the results file; each line yields
            ``(qid, aid, relevant, ir_score)`` via ``ResFileReader.read_line``.
        format: Passed unchanged to the ``ResFileReader`` constructor.

    Returns:
        A ``defaultdict(list)`` keyed by question id whose values are the
        candidates' relevance labels sorted by search engine score, largest
        first.
    """
    logging.info("Processing file: %s with search engine ranks" % res_fname)
    lineReader = ResFileReader(format)

    ir = defaultdict(list)
    # Reading inside "with" releases the handle even on a parse error. The
    # former last-question tracking had no effect and is gone.
    with open(res_fname) as res_file:
        for line_res in res_file:
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            ir[qid].append((relevant, ir_score))

    # Sort based on the search engine score (largest to smallest).
    # Assigning to existing keys does not resize the dict, so iterating
    # items() is safe on Python 3 as well as Python 2.
    for qid, resList in ir.items():
        ir[qid] = [
            rel
            for rel, score in sorted(resList, key=itemgetter(1), reverse=True)
        ]
    return ir
def get_tp_fp_tn_fn(res_fname, pred_fname, ignore_noanswer=False,
                    ignore_allanswer=False):
    """Count confusion-matrix cells from a results file and a prediction file.

    The files are read in parallel, stopping at the end of the shorter one.
    A candidate is treated as relevant when its parsed label equals
    ``"true"`` and as predicted positive when its prediction score is
    strictly greater than ``0.0``.

    Args:
        res_fname: Path of the results file, parsed with
            ``ResFileReader.read_line``.
        pred_fname: Path of the prediction file (one float per line).
        ignore_noanswer: Accepted for interface compatibility; not used.
        ignore_allanswer: Accepted for interface compatibility; not used.

    Returns:
        The tuple ``(tp, fp, tn, fn)``.
    """
    lineReader = ResFileReader()

    tp = fp = tn = fn = 0
    # Both inputs are closed when the block exits, normally or on error.
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for line_res, line_pred in zip(res_file, pred_file):
            # Process the line from the res file.
            qid, aid, relevant_string, ir_score = lineReader.read_line(line_res)
            relevant = (relevant_string == "true")
            pred_score = float(line_pred.strip())

            if pred_score > 0.0:
                if relevant:
                    tp += 1
                else:
                    fp += 1
            # Explicit comparison (not a bare else) so a NaN score, which
            # fails both tests, is still counted nowhere.
            elif pred_score <= 0.0:
                if relevant:
                    fn += 1
                else:
                    tn += 1

    return tp, fp, tn, fn
def read_res_pred_files(res_fname, pred_fname, format, verbose=True,
                        reranking_th=-100.0, ignore_noanswer=False,
                        truth_map=None):
    """Read results and predictions, optionally relabelled from a truth map.

    The two files are read in parallel, stopping at the end of the shorter
    one. Each results line is parsed with ``ResFileReader.read_line``; the
    matching prediction line is converted to a float score.

    Args:
        res_fname: Path of the results file.
        pred_fname: Path of the prediction file (one float per line).
        format: Passed unchanged to the ``ResFileReader`` constructor.
        verbose: When true, writes ``pred_fname + ".analyzis"``
            (``"<qid> <improvement>"``) and ``pred_fname + ".correctpos"``
            (``"<qid> <before> <after>"``) and prints the number of
            questions read.
        reranking_th: A question is re-ranked by prediction score only when
            its best prediction score is at least this value.
        ignore_noanswer: When true, drops questions whose candidates are all
            ``"false"`` or all ``"true"``.
        truth_map: Optional mapping ``(qid, aid) -> label``. When given, the
            mapped label replaces the one from the file and candidates
            missing from the map are skipped. When ``None`` (the default)
            every candidate is kept with the label from the file.

    Returns:
        ``(ir, svm)``: question id -> list of relevance labels, ranked by
        search engine score and by prediction score respectively.
    """
    lineReader = ResFileReader(format)

    ir, svm = defaultdict(list), defaultdict(list)
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for line_res, line_pred in zip(res_file, pred_file):
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            # Guard the default: "in None" would raise TypeError.
            if truth_map is not None:
                # NOTE(review): candidates absent from the gold standard are
                # assumed to be skipped rather than kept -- confirm.
                if (qid, aid) not in truth_map:
                    continue
                # The gold-standard label overrides the label in the file.
                relevant = truth_map[(qid, aid)]
            pred_score = float(line_pred.strip())
            ir[qid].append((relevant, ir_score, aid))
            svm[qid].append((relevant, pred_score, aid))

    analyze_file = info_file = None
    try:
        if verbose:
            analyze_file = open(pred_fname + ".analyzis", "w")
            info_file = open(pred_fname + ".correctpos", "w")
            # NOTE(review): assumed to be part of the verbose output --
            # confirm. The call form prints the same text on Python 2 and 3.
            print("Annotations for %d question read" % (len(ir)))

        # Remove questions whose labels are uniform (no correct answer, or
        # nothing but correct answers).
        if ignore_noanswer:
            # Snapshot the keys: deleting while iterating a live keys() view
            # raises RuntimeError on Python 3.
            for qid in list(ir.keys()):
                candidates = ir[qid]
                if all(relevant == "false" for relevant, _, _ in candidates) or all(
                        relevant == "true" for relevant, _, _ in candidates):
                    del ir[qid]
                    del svm[qid]

        for qid in ir:
            # Sort by IR score.
            ir_sorted = sorted(ir[qid], key=itemgetter(1), reverse=True)

            # Sort by SVM prediction score when the threshold is reached;
            # otherwise the candidates stay in file order.
            svm_sorted = svm[qid]
            max_score = max(score for rel, score, aid in svm_sorted)
            if max_score >= reranking_th:
                svm_sorted = sorted(svm_sorted, key=itemgetter(1), reverse=True)

            if verbose:
                before = find_correct_answer_position(ir_sorted)
                after = find_correct_answer_position(svm_sorted)
                impr = analyze_reranking_improvement(before, after)
                analyze_file.write("%s %s\n" % (qid, str(impr)))
                info_file.write("%s %s %s\n" % (qid, str(before), str(after)))

            ir[qid] = [rel for rel, score, aid in ir_sorted]
            svm[qid] = [rel for rel, score, aid in svm_sorted]
    finally:
        # Close whichever report files were opened, even after an error.
        for report in (analyze_file, info_file):
            if report is not None:
                report.close()

    return ir, svm
def read_res_pred_files(res_fname, pred_fname, format, verbose=True,
                        reranking_th=0.0, ignore_noanswer=False):
    """Read aligned results/prediction files, validate them and rank labels.

    Both files are parsed line by line with ``ResFileReader.read_line`` and
    must describe the same ``(qid, aid)`` on every line, with labels limited
    to ``"true"`` and ``"false"``. On the first violation an error message is
    printed and the process exits with status 1 via ``SystemExit``. Reading
    stops at the end of the shorter file.

    Args:
        res_fname: Path of the results (gold label / search engine) file.
        pred_fname: Path of the prediction file, in the same line format.
        format: Passed unchanged to both ``ResFileReader`` constructors.
        verbose: When true, writes ``pred_fname + ".analysis"``
            (``"<qid> <improvement>"``) and ``pred_fname + ".correctpos"``
            (``"<qid> <before> <after>"``).
        reranking_th: A question is re-ranked by prediction score only when
            its best prediction score is at least this value.
        ignore_noanswer: When true, drops questions whose candidates are all
            labelled ``"false"``.

    Returns:
        ``(ir, svm_last, conf_matrix)``: two ``defaultdict(list)`` objects
        mapping question id to relevance labels (ranked by search engine
        score and by prediction score), plus
        ``conf_matrix[gold_label][predicted_label]`` counts taken over every
        line read, before any question is dropped.

    Raises:
        SystemExit: On an id mismatch or a label other than
            ``"true"``/``"false"``.
    """
    lineReader = ResFileReader(format)
    lineReader_pred = ResFileReader(format)

    ir, svm = defaultdict(list), defaultdict(list)
    conf_matrix = {
        'true': {'true': 0, 'false': 0},
        'false': {'true': 0, 'false': 0},
    }
    # The inputs are closed on every exit path, including SystemExit.
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for lineNo, (line_res, line_pred) in enumerate(zip(res_file, pred_file), 1):
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            pred_qid, pred_aid, pred_relevant, pred_score = lineReader_pred.read_line(
                line_pred)

            # raise SystemExit(1) replaces quit(): quit() is an interactive
            # helper installed by the site module and exits with status 0.
            if (qid != pred_qid) or (aid != pred_aid):
                print('ERROR: ID mismatch on line ' + str(lineNo) + ':')
                print('in ' + res_fname + ' we have (' + qid + ',' + aid + '),')
                print('but in ' + pred_fname + ' we have (' + pred_qid + ',' + pred_aid + ')')
                raise SystemExit(1)

            if (relevant != 'true') and (relevant != 'false'):
                print('ERROR: wrong label on line ' + str(lineNo) + ' in ' + res_fname + ': "' + relevant + '"')
                print('Allowed values are only "true" and "false"')
                raise SystemExit(1)

            if (pred_relevant != 'true') and (pred_relevant != 'false'):
                print('ERROR: wrong label on line ' + str(lineNo) + ' in ' + pred_fname + ': "' + pred_relevant + '"')
                print('Allowed values are only "true" and "false"')
                raise SystemExit(1)

            ir[qid].append((relevant, ir_score, aid))
            svm[qid].append((relevant, pred_score, aid))
            conf_matrix[relevant][pred_relevant] += 1

    svm_last = defaultdict(list)
    analyze_file = info_file = None
    try:
        if verbose:
            analyze_file = open(pred_fname + ".analysis", "w")
            info_file = open(pred_fname + ".correctpos", "w")

        # Remove questions that contain no correct answer.
        if ignore_noanswer:
            for qid in list(ir.keys()):
                candidates = ir[qid]
                if all(relevant == "false" for relevant, _, _ in candidates):
                    del ir[qid]
                    del svm[qid]

        for qid in ir:
            # Sort by IR score.
            ir_sorted = sorted(ir[qid], key=itemgetter(1), reverse=True)

            # Sort by SVM prediction score when the threshold is reached;
            # otherwise the candidates stay in file order.
            svm_sorted = svm[qid]
            max_score = max(score for rel, score, aid in svm_sorted)
            if max_score >= reranking_th:
                svm_sorted = sorted(svm_sorted, key=itemgetter(1), reverse=True)

            if verbose:
                before = find_correct_answer_position(ir_sorted)
                after = find_correct_answer_position(svm_sorted)
                impr = analyze_reranking_improvement(before, after)
                analyze_file.write("%s %s\n" % (qid, str(impr)))
                info_file.write("%s %s %s\n" % (qid, str(before), str(after)))

            ir[qid] = [rel for rel, score, aid in ir_sorted]
            svm_last[qid] = [rel for rel, score, aid in svm_sorted]
    finally:
        # Close whichever report files were opened, even after an error.
        for report in (analyze_file, info_file):
            if report is not None:
                report.close()

    return ir, svm_last, conf_matrix
def read_res_pred_files(res_fname, pred_fname, format, verbose=True,
                        reranking_th=0.0, ignore_noanswer=False):
    """Validate aligned results/prediction files and return ranked labels.

    Each pair of lines is parsed with ``ResFileReader.read_line``. The two
    files must agree on ``(qid, aid)`` line by line and may only use the
    labels ``"true"`` and ``"false"``; otherwise an error is printed and the
    process exits with status 1 via ``SystemExit``. Reading stops at the end
    of the shorter file.

    Args:
        res_fname: Path of the results (gold label / search engine) file.
        pred_fname: Path of the prediction file, in the same line format.
        format: Passed unchanged to both ``ResFileReader`` constructors.
        verbose: When true, writes ``pred_fname + ".analysis"``
            (``"<qid> <improvement>"``) and ``pred_fname + ".correctpos"``
            (``"<qid> <before> <after>"``).
        reranking_th: Minimum best prediction score a question needs for its
            candidates to be re-sorted by prediction score.
        ignore_noanswer: When true, questions with only ``"false"`` labels
            are removed from ``ir`` and ``svm``.

    Returns:
        ``(ir, svm, conf_matrix)``: question id -> relevance labels ranked by
        search engine score (``ir``) and by prediction score (``svm``), plus
        ``conf_matrix[gold_label][predicted_label]`` counts over every line
        read, before any question is dropped.

    Raises:
        SystemExit: On an id mismatch or a label other than
            ``"true"``/``"false"``.
    """
    lineReader = ResFileReader(format)
    lineReader_pred = ResFileReader(format)

    ir, svm = defaultdict(list), defaultdict(list)
    conf_matrix = {'true': {'true': 0, 'false': 0},
                   'false': {'true': 0, 'false': 0}}
    # "with" releases both inputs on every exit path, including SystemExit.
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for lineNo, (line_res, line_pred) in enumerate(zip(res_file, pred_file), 1):
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            pred_qid, pred_aid, pred_relevant, pred_score = lineReader_pred.read_line(line_pred)

            # quit() is meant for interactive sessions and reports success;
            # an explicit SystemExit(1) signals the failure to the caller.
            if (qid != pred_qid) or (aid != pred_aid):
                print('ERROR: ID mismatch on line ' + str(lineNo) + ':')
                print('in ' + res_fname + ' we have (' + qid + ',' + aid + '),')
                print('but in ' + pred_fname + ' we have (' + pred_qid + ',' + pred_aid + ')')
                raise SystemExit(1)

            if (relevant != 'true') and (relevant != 'false'):
                print('ERROR: wrong label on line ' + str(lineNo) + ' in ' + res_fname + ': "' + relevant + '"')
                print('Allowed values are only "true" and "false"')
                raise SystemExit(1)

            if (pred_relevant != 'true') and (pred_relevant != 'false'):
                print('ERROR: wrong label on line ' + str(lineNo) + ' in ' + pred_fname + ': "' + pred_relevant + '"')
                print('Allowed values are only "true" and "false"')
                raise SystemExit(1)

            ir[qid].append((relevant, ir_score, aid))
            svm[qid].append((relevant, pred_score, aid))
            conf_matrix[relevant][pred_relevant] += 1

    analyze_file = info_file = None
    try:
        if verbose:
            analyze_file = open(pred_fname + ".analysis", "w")
            info_file = open(pred_fname + ".correctpos", "w")

        # Remove questions that contain no correct answer.
        if ignore_noanswer:
            # Snapshot the keys: deleting while iterating a live keys() view
            # raises RuntimeError on Python 3.
            for qid in list(ir.keys()):
                candidates = ir[qid]
                if all(relevant == "false" for relevant, _, _ in candidates):
                    del ir[qid]
                    del svm[qid]

        for qid in ir:
            # Sort by IR score.
            ir_sorted = sorted(ir[qid], key=itemgetter(1), reverse=True)

            # Sort by SVM prediction score when the threshold is reached;
            # otherwise the candidates stay in file order.
            svm_sorted = svm[qid]
            max_score = max(score for rel, score, aid in svm_sorted)
            if max_score >= reranking_th:
                svm_sorted = sorted(svm_sorted, key=itemgetter(1), reverse=True)

            if verbose:
                before = find_correct_answer_position(ir_sorted)
                after = find_correct_answer_position(svm_sorted)
                impr = analyze_reranking_improvement(before, after)
                analyze_file.write("%s %s\n" % (qid, str(impr)))
                info_file.write("%s %s %s\n" % (qid, str(before), str(after)))

            ir[qid] = [rel for rel, score, aid in ir_sorted]
            svm[qid] = [rel for rel, score, aid in svm_sorted]
    finally:
        # Close whichever report files were opened, even after an error.
        for report in (analyze_file, info_file):
            if report is not None:
                report.close()

    return ir, svm, conf_matrix