Ejemplo n.º 1
0
def _benno_granularity(plag_spans, detected_spans):
    '''
    Return the granularity of <detected_spans> with respect to <plag_spans>.

    Granularity, as defined in the paper, is the average number of detected
    spans that overlap a plagiarized span, taken over only those plagiarized
    spans that are overlapped by at least one detection:

        gran = (1 / |S_R|) * sum over s in S_R of |C_s|

    where S_R is the set of plag_spans overlapping some detected span and
    C_s is the set of detected_spans overlapping plag_span s. A value of 1.0
    means each detected plagiarized span was reported as one piece.

    <plag_spans> is a sequence of plagiarized spans.
    <detected_spans> is a sequence of spans reported as plagiarized.

    Returns 1.0 when nothing was detected or when no detection overlaps any
    plagiarized span; otherwise returns a float >= 1.0.
    '''
    if len(detected_spans) == 0:
        return 1.0

    util = BaseUtility()

    # overlap_counts holds |C_s| for every plag_span s in S_R. Each
    # plagiarized span contributes exactly ONE entry, however many detections
    # hit it -- adding it once per overlapping detection would weight a span
    # hit k times by k in both the numerator and the denominator and inflate
    # the average.
    overlap_counts = []
    for pspan in plag_spans:
        # Only the sign of util.overlap is used here (presumably it returns
        # the size of the overlap -- confirm against BaseUtility).
        hits = sum(1 for dspan in detected_spans
                   if util.overlap(pspan, dspan) > 0)
        if hits:
            overlap_counts.append(hits)

    # No plagiarized span was hit at all: S_R is empty.
    if not overlap_counts:
        return 1.0

    # float() keeps true division regardless of interpreter version.
    return float(sum(overlap_counts)) / len(overlap_counts)
Ejemplo n.º 2
0
def _deprecated_benno_precision_and_recall(plag_spans, detected_spans):
    '''
    Deprecated: precision and recall computed the way the competition
    specified them.

    NOTE (nj): this definition doesn't seem to make a ton of sense. When
    choosing a threshold, it is in our best interest to call everything
    non-plagiarized and collect prec and recall values of 1.0 for all the
    non-plagiarized documents. We could build a corpus of docs that all
    contain plag., but that doesn't seem to be in the spirit of detection
    in general.

    Paper referred to is "Overview of the 1st International Competition on
    Plagiarism Detection".
    <plag_spans> (set S in paper) is a list of (start_char, end_char) spans
    that are plagiarized.
    <detected_spans> (set R in paper) is a list of (start_char, end_char)
    spans that we flagged as plagiarized.

    Returns the tuple (prec, recall).
    '''
    util = BaseUtility()

    def _mean_overlap_ratio(spans, others):
        # For every span, add up overlap(span, other) / len(span) across all
        # <others>, then average the grand total over the number of <spans>.
        total = 0.0
        for span in spans:
            span_len = float(span[1] - span[0])
            for other in others:
                total += util.overlap(span, other) / span_len
        return total / len(spans)

    # Edge cases -- defined according to performance_measures script provided online
    # http://www.uni-weimar.de/medien/webis/research/events/pan-09/pan09-code/pan09-plagiarism-detection-performance-measures.py
    nothing_plagiarized = len(plag_spans) == 0
    nothing_detected = len(detected_spans) == 0

    if nothing_plagiarized and nothing_detected:
        return 1.0, 1.0
    if nothing_plagiarized or nothing_detected:
        return 0.0, 0.0

    # Recall is averaged over the plagiarized spans, precision over the
    # detected spans.
    recall = _mean_overlap_ratio(plag_spans, detected_spans)
    prec = _mean_overlap_ratio(detected_spans, plag_spans)

    return prec, recall
Ejemplo n.º 3
0
def _benno_precision_and_recall(plag_spans, detected_spans):
    '''
    Compute (prec, recall) of <detected_spans> against <plag_spans>.

    Paper referred to is "Overview of the 1st International Competition on
    Plagiarism Detection".
    <plag_spans> (set S in paper) is a list of (start_char, end_char) spans
    that are plagiarized.
    <detected_spans> (set R in paper) is a list of (start_char, end_char)
    spans that we flagged as plagiarized.

    Edge cases: with no plagiarized spans there is no notion of recall, so
    recall is None. With no detected spans there is no notion of precision,
    so prec is None.

    Returns the tuple (prec, recall).
    '''
    util = BaseUtility()

    def _mean_overlap_ratio(spans, others):
        # For every span, add up overlap(span, other) / len(span) across all
        # <others>, then average the grand total over the number of <spans>.
        total = 0.0
        for span in spans:
            span_len = float(span[1] - span[0])
            for other in others:
                total += util.overlap(span, other) / span_len
        return total / len(spans)

    # Recall is defined over all plagiarized spans.
    recall = None
    if len(plag_spans) != 0:
        recall = _mean_overlap_ratio(plag_spans, detected_spans)

    # Precision is defined over all detected spans.
    prec = None
    if len(detected_spans) != 0:
        prec = _mean_overlap_ratio(detected_spans, plag_spans)

    return prec, recall