Example #1
0
def read_res_file(res_fname, format):
    """Read a search-engine results file and rank relevance labels per question.

    Args:
        res_fname: Path of the results file; it is read line by line.
        format: Passed unchanged to ``ResFileReader`` to select how each
            line is parsed.

    Returns:
        A ``defaultdict(list)`` mapping each question id to the relevance
        labels of its candidates, ordered by search-engine score from
        largest to smallest.
    """
    logging.info("Processing file: %s with search engine ranks", res_fname)
    lineReader = ResFileReader(format)

    ir = defaultdict(list)
    # The context manager closes the file even when a line fails to parse.
    with open(res_fname) as res_file:
        for line_res in res_file:
            # Process the line from the res file (the answer id is not needed here).
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            ir[qid].append((relevant, ir_score))

    # Sort based on the search engine score (largest to smallest).
    # ``items()`` works on Python 2 and 3 (``iteritems()`` is Python 2 only);
    # only values of existing keys are replaced, so the dict never changes
    # size while it is being iterated.
    for qid, resList in ir.items():
        ranked = sorted(resList, key=itemgetter(1), reverse=True)
        ir[qid] = [rel for rel, score in ranked]
    return ir
Example #2
0
File: ev.py Project: pkroynitp/Iyas
def read_res_pred_files(res_fname,
                        pred_fname,
                        format,
                        verbose=True,
                        reranking_th=0.0,
                        ignore_noanswer=False):
    """Read aligned result and prediction files and rank labels per question.

    The two files are paired line by line with ``zip``, so surplus lines in
    the longer file are ignored. Each prediction line must contain a single
    float.

    Args:
        res_fname: Path of the search-engine results file.
        pred_fname: Path of the prediction-score file. When ``verbose`` is
            true, ``pred_fname + ".analyzis"`` is (over)written as well.
        format: Passed unchanged to ``ResFileReader``.
        verbose: When true, write one ``"<qid> <improvement>"`` line per
            question to the analysis file.
        reranking_th: A question is re-sorted by prediction score only when
            its best prediction score is at least this value; otherwise its
            candidates stay in file order.
        ignore_noanswer: When true, drop questions whose candidates are all
            labelled ``"false"``.

    Returns:
        ``(ir, svm)``: two ``defaultdict(list)`` objects mapping question id
        to relevance labels, ordered by search-engine score and by
        prediction score respectively.
    """
    lineReader = ResFileReader(format)

    ir, svm = defaultdict(list), defaultdict(list)
    # Context managers guarantee both inputs are closed, even on parse errors.
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for line_res, line_pred in zip(res_file, pred_file):
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            pred_score = float(line_pred.strip())
            ir[qid].append((relevant, ir_score, aid))
            svm[qid].append((relevant, pred_score, aid))

    # Remove questions that contain no correct answer.
    if ignore_noanswer:
        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        for qid in list(ir.keys()):
            if all(relevant == "false" for relevant, _, _ in ir[qid]):
                del ir[qid]
                del svm[qid]

    analyze_file = None
    try:
        if verbose:
            analyze_file = open(pred_fname + ".analyzis", "w")

        for qid in ir:
            # Sort by IR score.
            ir_sorted = sorted(ir[qid], key=itemgetter(1), reverse=True)

            # Sort by SVM prediction score, but only when the best score
            # reaches the re-ranking threshold.
            svm_sorted = svm[qid]
            max_score = max(score for rel, score, aid in svm_sorted)
            if max_score >= reranking_th:
                svm_sorted = sorted(svm_sorted, key=itemgetter(1), reverse=True)

            if analyze_file is not None:
                before = find_correct_answer_position(ir_sorted)
                after = find_correct_answer_position(svm_sorted)
                impr = analyze_reranking_improvement(before, after)
                analyze_file.write("%s %s\n" % (qid, str(impr)))

            # Keep only the relevance labels, in ranked order.
            ir[qid] = [rel for rel, score, aid in ir_sorted]
            svm[qid] = [rel for rel, score, aid in svm_sorted]
    finally:
        # Close the analysis file even if ranking or analysis fails midway.
        if analyze_file is not None:
            analyze_file.close()

    return ir, svm
Example #3
0
def read_res_pred_files(res_fname, pred_fname, format, verbose=True,
                        reranking_th=0.0,
                        ignore_noanswer=False):
    """Rank each question's candidates by IR score and by prediction score.

    Lines of ``res_fname`` and ``pred_fname`` are consumed in lockstep via
    ``zip`` (extra lines in the longer file are ignored); every prediction
    line is parsed as one float.

    Args:
        res_fname: Path of the search-engine results file.
        pred_fname: Path of the prediction-score file. With ``verbose`` set,
            ``pred_fname + ".analyzis"`` is written too.
        format: Forwarded to ``ResFileReader``.
        verbose: Write a ``"<qid> <improvement>"`` line per question to the
            analysis file.
        reranking_th: Minimum best prediction score a question needs before
            its candidates are re-sorted by prediction score; below it they
            keep their file order.
        ignore_noanswer: Drop questions where every candidate is labelled
            ``"false"``.

    Returns:
        ``(ir, svm)``: ``defaultdict(list)`` objects mapping question id to
        relevance labels in IR-score order and prediction-score order.
    """
    lineReader = ResFileReader(format)

    ir, svm = defaultdict(list), defaultdict(list)
    # ``with`` closes both input files, including when a line cannot be parsed.
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for line_res, line_pred in zip(res_file, pred_file):
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            pred_score = float(line_pred.strip())
            ir[qid].append((relevant, ir_score, aid))
            svm[qid].append((relevant, pred_score, aid))

    # Remove questions that contain no correct answer.
    if ignore_noanswer:
        # ``list(...)`` takes a snapshot so entries can be deleted safely;
        # on Python 3 deleting during iteration of ``keys()`` raises.
        for qid in list(ir.keys()):
            candidates = ir[qid]
            if all(relevant == "false" for relevant, _, _ in candidates):
                del ir[qid]
                del svm[qid]

    analyze_file = None
    try:
        if verbose:
            analyze_file = open(pred_fname + ".analyzis", "w")

        for qid in ir:
            # Sort by IR score.
            ir_sorted = sorted(ir[qid], key=itemgetter(1), reverse=True)

            # Sort by SVM prediction score when the threshold is reached.
            svm_sorted = svm[qid]
            max_score = max(score for rel, score, aid in svm_sorted)
            if max_score >= reranking_th:
                svm_sorted = sorted(svm_sorted, key=itemgetter(1), reverse=True)

            if analyze_file is not None:
                before = find_correct_answer_position(ir_sorted)
                after = find_correct_answer_position(svm_sorted)
                impr = analyze_reranking_improvement(before, after)
                analyze_file.write("%s %s\n" % (qid, str(impr)))

            # Reduce the ranked tuples to their relevance labels.
            ir[qid] = [rel for rel, score, aid in ir_sorted]
            svm[qid] = [rel for rel, score, aid in svm_sorted]
    finally:
        # Never leave the analysis file open, even after an error.
        if analyze_file is not None:
            analyze_file.close()

    return ir, svm
Example #4
0
def read_res_file(res_fname, format):
    """Load a results file and return relevance labels ranked per question.

    Args:
        res_fname: Path of the search-engine results file.
        format: Forwarded to ``ResFileReader`` to choose the line parser.

    Returns:
        A ``defaultdict(list)`` keyed by question id; each value lists the
        relevance labels of that question's candidates sorted by descending
        search-engine score.
    """
    logging.info("Processing file: %s with search engine ranks", res_fname)
    lineReader = ResFileReader(format)

    ir = defaultdict(list)
    # Open through a context manager so the handle is released promptly,
    # including when parsing raises.
    with open(res_fname) as res_file:
        for line_res in res_file:
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            ir[qid].append((relevant, ir_score))

    # Sort based on the search engine score (largest to smallest).
    # Replacing values of existing keys does not resize the dict, so this is
    # safe while iterating ``items()``.
    for qid, resList in ir.items():
        ranked = sorted(resList, key=itemgetter(1), reverse=True)
        ir[qid] = [rel for rel, score in ranked]
    return ir
Example #5
0
def read_truth_file(infile, format, cut_truth_map_at_N=None):
    """Build a ``(qid, aid) -> relevance`` map from a truth file.

    Args:
        infile: Path of the truth file; each line is parsed with
            ``ResFileReader.read_line_trec``.
        format: Forwarded to ``ResFileReader``.
        cut_truth_map_at_N: When not ``None``, keep at most this many
            entries per run of consecutive lines sharing the same question
            id. The counter restarts whenever the question id changes, so a
            question id that reappears later in the file starts a new run.

    Returns:
        A dict mapping ``(qid, aid)`` tuples to the relevance value read
        from the file. A later line with the same key overwrites an earlier
        one.
    """
    truth = dict()
    r = ResFileReader(format)
    prevqid = ""
    idcou = 0  # position of the current line within its run of equal qids
    # The context manager closes the file even if a line fails to parse.
    with open(infile, 'r') as truth_file:
        for line in truth_file:
            qid, aid, relevant, ir_score = r.read_line_trec(line)
            if cut_truth_map_at_N is not None:
                if qid == prevqid:
                    idcou += 1
                else:
                    prevqid = qid
                    idcou = 1
                if idcou > cut_truth_map_at_N:
                    # Beyond the per-question cut-off: skip this entry.
                    continue
            truth[(qid, aid)] = relevant

    return truth
Example #6
0
def read_res_file(res_fname, format):
    """Read a results file and return relevance labels ranked per question.

    Args:
        res_fname: Path of the search-engine results file.
        format: Passed to ``ResFileReader`` to select the line parser.

    Returns:
        A ``defaultdict(list)`` mapping question id to the relevance labels
        of its candidates, sorted by search-engine score from largest to
        smallest.
    """
    logging.info("Processing file: %s with search engine ranks", res_fname)
    lineReader = ResFileReader(format)

    ir = defaultdict(list)

    # ``with`` closes the input file even when a line cannot be parsed.
    with open(res_fname) as res_file:
        for line_res in res_file:
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            ir[qid].append((relevant, ir_score))

    # Sort based on the search engine score (largest to smallest).
    # ``items()`` is available on Python 2 and 3, unlike ``iteritems()``;
    # assigning to existing keys keeps the dict size constant during the loop.
    for qid, resList in ir.items():
        ranked = sorted(resList, key=itemgetter(1), reverse=True)
        ir[qid] = [rel for rel, score in ranked]

    return ir
Example #7
0
def read_res_file(res_fname, format):
    """Parse a search-engine results file into ranked labels per question.

    Args:
        res_fname: Path of the results file to read.
        format: Handed to ``ResFileReader`` to pick the line format.

    Returns:
        A ``defaultdict(list)`` whose keys are question ids and whose values
        are the candidates' relevance labels in descending order of
        search-engine score.
    """
    logging.info("Processing file: %s with search engine ranks", res_fname)
    lineReader = ResFileReader(format)

    ir = defaultdict(list)

    # Use a context manager so the file handle is always released.
    with open(res_fname) as res_file:
        for line_res in res_file:
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            ir[qid].append((relevant, ir_score))

    # Sort based on the search engine score (largest to smallest).
    # ``iteritems()`` does not exist on Python 3; ``items()`` works on both.
    # Only values of existing keys are rebound, which is safe mid-iteration.
    for qid, resList in ir.items():
        ranked = sorted(resList, key=itemgetter(1), reverse=True)
        ir[qid] = [rel for rel, score in ranked]

    return ir
Example #8
0
def get_tp_fp_tn_fn(res_fname,
                    pred_fname,
                    ignore_noanswer=False,
                    ignore_allanswer=False):
    """Count true/false positives and negatives of the predictions.

    Lines of the two files are paired with ``zip`` (extra lines in the
    longer file are ignored). A candidate counts as gold-positive when its
    label in the results file is the string ``"true"`` and as
    predicted-positive when its prediction score is strictly greater than
    ``0.0``.

    Args:
        res_fname: Path of the results file carrying the gold labels.
        pred_fname: Path of the prediction file, one float per line.
        ignore_noanswer: Accepted for interface compatibility; not used by
            this function.
        ignore_allanswer: Accepted for interface compatibility; not used by
            this function.

    Returns:
        The tuple ``(tp, fp, tn, fn)``.
    """
    lineReader = ResFileReader()
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    # Context managers close both files even if a line fails to parse.
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for line_res, line_pred in zip(res_file, pred_file):
            # Process the line from the res file.
            qid, aid, relevant_string, ir_score = lineReader.read_line(line_res)
            relevant = (relevant_string == "true")
            pred_score = float(line_pred.strip())
            if pred_score > 0.0:
                if relevant:
                    tp += 1
                else:
                    fp += 1
            elif pred_score <= 0.0:
                # Explicit test rather than a bare ``else`` so that a NaN
                # score is counted in no cell.
                if relevant:
                    fn += 1
                else:
                    tn += 1
    return tp, fp, tn, fn
Example #9
0
def read_res_pred_files(res_fname,
                        pred_fname,
                        format,
                        verbose=True,
                        reranking_th=-100.0,
                        ignore_noanswer=False,
                        truth_map=None):
    """Read aligned result/prediction files, relabel from a truth map, rank.

    The two files are paired line by line with ``zip`` (surplus lines in the
    longer file are ignored); each prediction line must hold one float.

    Args:
        res_fname: Path of the search-engine results file.
        pred_fname: Path of the prediction-score file. With ``verbose`` set,
            ``pred_fname + ".analyzis"`` and ``pred_fname + ".correctpos"``
            are written as well.
        format: Forwarded to ``ResFileReader``.
        verbose: Write per-question improvement and before/after position
            lines to the two output files.
        reranking_th: A question is re-sorted by prediction score only when
            its best prediction score is at least this value; otherwise its
            candidates keep their file order.
        ignore_noanswer: When true, drop questions whose candidates are all
            labelled ``"false"`` or all labelled ``"true"``.
        truth_map: Optional mapping ``(qid, aid) -> relevance``. When given,
            candidates missing from it are skipped and the remaining ones
            take their label from it. When ``None`` (the default), every
            candidate is kept with the label read from ``res_fname``;
            previously the default raised ``TypeError``.
            NOTE(review): confirm that keeping everything is the intended
            meaning of "no truth map".

    Returns:
        ``(ir, svm)``: ``defaultdict(list)`` objects mapping question id to
        relevance labels in IR-score order and prediction-score order.
    """
    lineReader = ResFileReader(format)

    ir, svm = defaultdict(list), defaultdict(list)
    # Context managers close both inputs even when parsing fails.
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for line_res, line_pred in zip(res_file, pred_file):
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            if truth_map is not None:
                key = (qid, aid)
                if key not in truth_map:
                    # Not found in the gold standard annotations: skip it.
                    continue
                # The gold annotation overrides the label from the file.
                relevant = truth_map[key]
            pred_score = float(line_pred.strip())
            ir[qid].append((relevant, ir_score, aid))
            svm[qid].append((relevant, pred_score, aid))

    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("Annotations for %d question read" % (len(ir)))

    # Remove questions whose candidates all carry the same label.
    if ignore_noanswer:
        # Snapshot the keys: deleting while iterating the live key view
        # raises RuntimeError on Python 3.
        for qid in list(ir.keys()):
            candidates = ir[qid]
            all_false = all(relevant == "false" for relevant, _, _ in candidates)
            all_true = all(relevant == "true" for relevant, _, _ in candidates)
            if all_false or all_true:
                del ir[qid]
                del svm[qid]

    analyze_file = info_file = None
    try:
        if verbose:
            analyze_file = open(pred_fname + ".analyzis", "w")
            info_file = open(pred_fname + ".correctpos", "w")

        for qid in ir:
            # Sort by IR score.
            ir_sorted = sorted(ir[qid], key=itemgetter(1), reverse=True)

            # Sort by SVM prediction score when the threshold is reached.
            svm_sorted = svm[qid]
            max_score = max(score for rel, score, aid in svm_sorted)
            if max_score >= reranking_th:
                svm_sorted = sorted(svm_sorted, key=itemgetter(1), reverse=True)

            if verbose:
                before = find_correct_answer_position(ir_sorted)
                after = find_correct_answer_position(svm_sorted)
                impr = analyze_reranking_improvement(before, after)
                analyze_file.write("%s %s\n" % (qid, str(impr)))
                info_file.write("%s %s %s\n" % (qid, str(before), str(after)))

            # Keep only the relevance labels, in ranked order.
            ir[qid] = [rel for rel, score, aid in ir_sorted]
            svm[qid] = [rel for rel, score, aid in svm_sorted]
    finally:
        # Close whatever was opened, even if an error interrupted the loop.
        for handle in (analyze_file, info_file):
            if handle is not None:
                handle.close()

    return ir, svm
Example #10
0
def read_res_pred_files(res_fname,
                        pred_fname,
                        format,
                        verbose=True,
                        reranking_th=0.0,
                        ignore_noanswer=False):
    """Read gold and predicted result files, validate them, and rank labels.

    Both files are parsed with ``ResFileReader(format)`` and paired line by
    line with ``zip`` (surplus lines in the longer file are ignored). Every
    pair must agree on question id and answer id, and both labels must be
    ``"true"`` or ``"false"``.

    Args:
        res_fname: Path of the gold/search-engine results file.
        pred_fname: Path of the predictions file. With ``verbose`` set,
            ``pred_fname + ".analysis"`` and ``pred_fname + ".correctpos"``
            are written as well.
        format: Forwarded to ``ResFileReader`` for both files.
        verbose: Write per-question improvement and before/after position
            lines to the two output files.
        reranking_th: A question is re-sorted by prediction score only when
            its best prediction score is at least this value; otherwise its
            candidates keep their file order.
        ignore_noanswer: When true, drop questions whose candidates are all
            labelled ``"false"``.

    Returns:
        ``(ir, svm_last, conf_matrix)``: two ``defaultdict(list)`` objects
        mapping question id to gold labels in IR-score order and in
        prediction-score order, plus ``conf_matrix[gold][predicted]``
        counts over all lines read (before any question is dropped).

    Raises:
        SystemExit: After printing an error message, when ids mismatch or a
            label is invalid. The exit status is 1; the former ``quit()``
            call terminated with status 0 and depends on the ``site``
            module being loaded.
    """
    lineReader = ResFileReader(format)
    lineReader_pred = ResFileReader(format)

    ir, svm = defaultdict(list), defaultdict(list)
    conf_matrix = {
        'true': {
            'true': 0,
            'false': 0
        },
        'false': {
            'true': 0,
            'false': 0
        }
    }
    # Context managers close both inputs on every exit path, including the
    # SystemExit raised by the validation below.
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for lineNo, (line_res, line_pred) in enumerate(zip(res_file, pred_file), 1):
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            pred_qid, pred_aid, pred_relevant, pred_score = lineReader_pred.read_line(
                line_pred)

            if (qid != pred_qid) or (aid != pred_aid):
                print('ERROR: ID mismatch on line ' + str(lineNo) + ':')
                print('in ' + res_fname + ' we have (' + qid + ',' + aid + '),')
                print('but in ' + pred_fname + ' we have (' + pred_qid + ',' +
                      pred_aid + ')')
                raise SystemExit(1)

            if relevant not in ('true', 'false'):
                print('ERROR: wrong label on line ' + str(lineNo) + ' in ' +
                      res_fname + ': "' + relevant + '"')
                print('Allowed values are only "true" and "false"')
                raise SystemExit(1)

            if pred_relevant not in ('true', 'false'):
                print('ERROR: wrong label on line ' + str(lineNo) + ' in ' +
                      pred_fname + ': "' + pred_relevant + '"')
                print('Allowed values are only "true" and "false"')
                raise SystemExit(1)

            ir[qid].append((relevant, ir_score, aid))
            svm[qid].append((relevant, pred_score, aid))
            conf_matrix[relevant][pred_relevant] += 1

    # Remove questions that contain no correct answer.
    if ignore_noanswer:
        # Snapshot of the keys so entries can be deleted during the loop.
        for qid in list(ir.keys()):
            candidates = ir[qid]
            if all(relevant == "false" for relevant, _, _ in candidates):
                del ir[qid]
                del svm[qid]

    svm_last = defaultdict(list)
    analyze_file = info_file = None
    try:
        if verbose:
            analyze_file = open(pred_fname + ".analysis", "w")
            info_file = open(pred_fname + ".correctpos", "w")

        for qid in ir:
            # Sort by IR score.
            ir_sorted = sorted(ir[qid], key=itemgetter(1), reverse=True)

            # Sort by SVM prediction score when the threshold is reached.
            svm_sorted = svm[qid]
            max_score = max(score for rel, score, aid in svm_sorted)
            if max_score >= reranking_th:
                svm_sorted = sorted(svm_sorted, key=itemgetter(1), reverse=True)

            if verbose:
                before = find_correct_answer_position(ir_sorted)
                after = find_correct_answer_position(svm_sorted)
                impr = analyze_reranking_improvement(before, after)
                analyze_file.write("%s %s\n" % (qid, str(impr)))
                info_file.write("%s %s %s\n" % (qid, str(before), str(after)))

            # Keep only the relevance labels, in ranked order. The ranked
            # prediction labels go into a separate dict; ``svm`` keeps its
            # raw tuples.
            ir[qid] = [rel for rel, score, aid in ir_sorted]
            svm_last[qid] = [rel for rel, score, aid in svm_sorted]
    finally:
        # Close whatever was opened, even if an error interrupted the loop.
        for handle in (analyze_file, info_file):
            if handle is not None:
                handle.close()

    return ir, svm_last, conf_matrix
Example #11
0
def read_res_pred_files(res_fname, pred_fname, format, verbose=True,
                        reranking_th=0.0,
                        ignore_noanswer=False):
    """Validate paired gold/prediction files and rank labels per question.

    Both files are parsed with ``ResFileReader(format)`` and consumed in
    lockstep via ``zip`` (extra lines in the longer file are ignored). Each
    pair of lines must carry the same question id and answer id, and both
    labels must be ``"true"`` or ``"false"``.

    Args:
        res_fname: Path of the gold/search-engine results file.
        pred_fname: Path of the predictions file. With ``verbose`` set,
            ``pred_fname + ".analysis"`` and ``pred_fname + ".correctpos"``
            are written too.
        format: Forwarded to ``ResFileReader`` for both files.
        verbose: Write per-question improvement and before/after position
            lines to the two output files.
        reranking_th: Minimum best prediction score a question needs before
            its candidates are re-sorted by prediction score; below it they
            keep their file order.
        ignore_noanswer: Drop questions where every candidate is labelled
            ``"false"``.

    Returns:
        ``(ir, svm, conf_matrix)``: ``defaultdict(list)`` objects mapping
        question id to gold labels in IR-score order and prediction-score
        order, plus ``conf_matrix[gold][predicted]`` counts over all lines
        read (before any question is dropped).

    Raises:
        SystemExit: After printing an error message, when ids mismatch or a
            label is invalid. The exit status is 1; the former ``quit()``
            call terminated with status 0 and depends on the ``site``
            module being loaded.
    """
    lineReader = ResFileReader(format)
    lineReader_pred = ResFileReader(format)

    ir, svm = defaultdict(list), defaultdict(list)
    conf_matrix = {'true' : {'true' : 0, 'false' : 0}, 'false' : {'true' : 0, 'false' : 0}}
    # ``with`` closes both inputs on every exit path, including the
    # SystemExit raised by the validation below.
    with open(res_fname) as res_file, open(pred_fname) as pred_file:
        for lineNo, (line_res, line_pred) in enumerate(zip(res_file, pred_file), 1):
            # Process the line from the res file.
            qid, aid, relevant, ir_score = lineReader.read_line(line_res)
            pred_qid, pred_aid, pred_relevant, pred_score = lineReader_pred.read_line(line_pred)

            if (qid != pred_qid) or (aid != pred_aid):
                print('ERROR: ID mismatch on line ' + str(lineNo) + ':')
                print('in ' + res_fname + ' we have (' + qid + ',' + aid + '),')
                print('but in ' + pred_fname + ' we have (' + pred_qid + ',' + pred_aid + ')')
                raise SystemExit(1)

            if relevant not in ('true', 'false'):
                print('ERROR: wrong label on line ' + str(lineNo) + ' in ' + res_fname + ': "' + relevant + '"')
                print('Allowed values are only "true" and "false"')
                raise SystemExit(1)

            if pred_relevant not in ('true', 'false'):
                print('ERROR: wrong label on line ' + str(lineNo) + ' in ' + pred_fname + ': "' + pred_relevant + '"')
                print('Allowed values are only "true" and "false"')
                raise SystemExit(1)

            ir[qid].append((relevant, ir_score, aid))
            svm[qid].append((relevant, pred_score, aid))
            conf_matrix[relevant][pred_relevant] += 1

    # Remove questions that contain no correct answer.
    if ignore_noanswer:
        # ``list(...)`` snapshots the keys; deleting while iterating the live
        # ``keys()`` view raises RuntimeError on Python 3.
        for qid in list(ir.keys()):
            candidates = ir[qid]
            if all(relevant == "false" for relevant, _, _ in candidates):
                del ir[qid]
                del svm[qid]

    analyze_file = info_file = None
    try:
        if verbose:
            analyze_file = open(pred_fname + ".analysis", "w")
            info_file = open(pred_fname + ".correctpos", "w")

        for qid in ir:
            # Sort by IR score.
            ir_sorted = sorted(ir[qid], key=itemgetter(1), reverse=True)

            # Sort by SVM prediction score when the threshold is reached.
            svm_sorted = svm[qid]
            max_score = max(score for rel, score, aid in svm_sorted)
            if max_score >= reranking_th:
                svm_sorted = sorted(svm_sorted, key=itemgetter(1), reverse=True)

            if verbose:
                before = find_correct_answer_position(ir_sorted)
                after = find_correct_answer_position(svm_sorted)
                impr = analyze_reranking_improvement(before, after)
                analyze_file.write("%s %s\n" % (qid, str(impr)))
                info_file.write("%s %s %s\n" % (qid, str(before), str(after)))

            # Reduce the ranked tuples to their relevance labels.
            ir[qid] = [rel for rel, score, aid in ir_sorted]
            svm[qid] = [rel for rel, score, aid in svm_sorted]
    finally:
        # Close whatever was opened, even if an error interrupted the loop.
        for handle in (analyze_file, info_file):
            if handle is not None:
                handle.close()

    return ir, svm, conf_matrix