Example #1
def process_run(args, run_file_name, annotation, description, thresh):
    '''
    compute scores and generate output files for a single run
    
    :returns dict: max_scores for this one run
    '''
    ## Generate confusion matrices from a run for each target_id
    ## and for each step of the confidence cutoff
    stats = build_confusion_matrix(
        os.path.join(args.run_dir, run_file_name) + '.gz',
        annotation, args.cutoff_step, args.unan_is_true, args.include_training,
        thresh=thresh,
        require_positives=args.require_positives,
        debug=args.debug)

    compile_and_average_performance_metrics(stats)

    max_scores = find_max_scores(stats)

    log(json.dumps(stats, indent=4, sort_keys=True))

    base_output_filepath = os.path.join(
        args.run_dir, 
        run_file_name + '-' + description)

    output_filepath = base_output_filepath + '.csv'
    write_performance_metrics(output_filepath, stats)

    ## Output a graph of the key performance statistics
    graph_filepath = base_output_filepath + '.png'
    write_graph(graph_filepath, stats)

    return max_scores
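
A minimal usage sketch for process_run, assuming an argparse-style args namespace that carries the fields the function reads (run_dir, cutoff_step, unan_is_true, include_training, require_positives, debug); the run name, entity URL, and description string below are placeholders, not values from the original runs.

from argparse import Namespace

## hypothetical settings; real values come from the scorer's argument parser
args = Namespace(run_dir='runs', cutoff_step=50, unan_is_true=False,
                 include_training=False, require_positives=True, debug=False)

## the annotation dict would normally come from load_annotation(...)
annotation = {('1328057520-abc123', 'https://kb.example.org/entity/1'): True}

## scores the run stored at runs/myteam-system1.gz and writes
## runs/myteam-system1-vital-cutoff-50.csv and .png alongside it
max_scores = process_run(args, 'myteam-system1', annotation,
                         description='vital-cutoff-50', thresh=2)
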
Example #2
def process_run(args, run_file_name, annotation, description, thresh):
    '''
    compute scores and generate output files for a single run
    
    :returns dict: max_scores for this one run
    '''
    ## Generate confusion matrices from a run for each target_id
    ## and for each step of the confidence cutoff
    stats = build_confusion_matrix(os.path.join(args.run_dir, run_file_name) +
                                   '.gz',
                                   annotation,
                                   args.cutoff_step,
                                   args.unan_is_true,
                                   args.include_training,
                                   thresh=thresh,
                                   require_positives=args.require_positives,
                                   debug=args.debug)

    compile_and_average_performance_metrics(stats)

    max_scores = find_max_scores(stats)

    log(json.dumps(stats, indent=4, sort_keys=True))

    base_output_filepath = os.path.join(args.run_dir,
                                        run_file_name + '-' + description)

    output_filepath = base_output_filepath + '.csv'
    write_performance_metrics(output_filepath, stats)

    ## Output a graph of the key performance statistics
    graph_filepath = base_output_filepath + '.png'
    write_graph(graph_filepath, stats)

    return max_scores
Example #3
def score_all_runs(args, description, reject):
    '''
    score all the runs in the specified runs dir using the various
    filters and configuration settings

    :param description: string used for file names
    :param reject: callable to rejects truth data
    '''
    if args.include_neutral:
        thresh = 0
    elif args.include_useful:
        thresh = 1
    else:
        thresh = 2

    ## Load in the annotation data
    annotation = load_annotation(args.annotation, thresh,
                                 args.min_len_clean_visible, reject,
                                 require_positives=args.require_positives
                                 )
    log( 'This assumes that all run file names end in .gz' )

    #import gc
    #from guppy import hpy
    #hp = hpy()
    
    run_count = 0
    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            continue
        
        if args.run_name_filter and not run_file.startswith(args.run_name_filter):
            continue

        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log( 'processing: %s.gz' % run_file_name )
        
        max_scores = process_run(args, run_file_name, annotation, description, thresh)

        ## split into team name and create stats file
        team_id, system_id = run_file_name.split('-')
        team_scores[team_id][system_id] = max_scores

        #gc.collect()
        #log(str(hp.heap()))

        run_count += 1
        #if run_count > 2:
        #    break

    ## When folder is finished running output a high level summary of the scores to overview.csv
    write_team_summary(description, team_scores)
Example #4
def score_confusion_matrix_OVERLAP(CM, DOCS_TPs, annotation, positives,
                                cutoff_step_size=50, unannotated_is_TN=False,
                                debug=False):
    '''
    construct OVERLAP_TPs by excluding from DOCS_TPs those assertions
    that do not overlap any string identified by an assessor
    '''
    cutoffs = range(0, 999, cutoff_step_size)

    OVERLAP_TPs = list()

    for rec in DOCS_TPs:
        (stream_id, target_id, conf, rating, contains_mention, 
         date_hour, slot_type, runs_equiv_id, start_byte, end_byte) = rec

        start_byte = int(start_byte)
        end_byte = int(end_byte)

        for true_equiv_id, equiv_class in annotation[stream_id][target_id][slot_type].items():
            offsets = equiv_class['stream_ids'][stream_id][1]
            overlaps = False
            for offset in offsets:
                assert isinstance(offset[0], int)
                assert isinstance(offset[1], int)
                if start_byte <= offset[1] and end_byte >= offset[0]:
                    overlaps = True
                    break
            log('(%d, %d) compared to offsets %r\n' % (start_byte, end_byte, offsets))

            if not overlaps:
                increment_CM(False, conf, cutoffs, CM, OVERLAP, unannotated_is_TN)
                
            #log('found one!!  system equiv_id (%r) --> assessors equiv_id (%r)'
            #    % (runs_equiv_id, true_equiv_id))
            rec = list(rec)
            rec[7] = (runs_equiv_id, true_equiv_id)
            rec = tuple(rec)
            OVERLAP_TPs.append(rec)

            increment_CM(True, conf, cutoffs, CM, OVERLAP, unannotated_is_TN)

    correct_FN(CM, OVERLAP, positives)

    return CM, OVERLAP_TPs
Example #5
def load_annotation(path_to_annotation_file, include_useful, include_neutral, min_len_clean_visible, reject):
    '''
    Loads the annotation file into a dict
    
    path_to_annotation_file: string filesystem path to the annotation file
    include_useful: true to include docs marked useful as well as vital
    include_neutral: true to also include docs marked neutral
    min_len_clean_visible: int, minimum clean_visible length for a doc to be kept

    reject:  callable that returns boolean given a target_id
    '''
    annotation_file = csv.reader(open(path_to_annotation_file, 'r'), delimiter='\t')

    annotation = dict()
    for row in annotation_file:
       ## Skip comments
       if row[0][0] == "#":
           continue 
       
       stream_id = row[2]
       target_id = row[3]
       rating = int(row[5])

       if len(row) == 12:
           ## only the later versions of the truth data carried this
           ## twelfth column for excluding documents with insufficient
           ## clean_visible to be judged.  We use a default cutoff of
           ## 100 bytes which means removing these counts below:
           #              (stream_id, target_id) pairs:  34921 above, and 15767 below 100 bytes of clean_visible
           # (assessor_id, stream_id, target_id) pairs:  47446 above, and 19948 below 100 bytes of clean_visible
           len_clean_visible = int(row[11])
           if len_clean_visible < min_len_clean_visible:
               log('excluding stream_id=%s for len(clean_visible)=%d' % (stream_id, len_clean_visible))
               continue

       if reject(target_id):
           log('excluding truth data for %s' % target_id)
           continue

       if include_neutral:
           thresh = 0
       elif include_useful:
           thresh = 1
       else:
           thresh = 2
       
       ## Add the stream_id and target_id to a hashed dictionary
       ## 0 means that it's not vital, 1 means that it is vital
              
       if (stream_id, target_id) in annotation:
           ## 2 means the annotators gave it a yes for vitality
           if rating < thresh:
                annotation[(stream_id, target_id)] = False
       else:
           ## store bool values in the annotation index
           annotation[(stream_id, target_id)] = rating >= thresh 

    num_true = sum(map(int, annotation.values()))
    log('loaded annotation to create a dict of %d (stream_id, target_id) pairs with %d True' % (len(annotation), num_true))
    if num_true == 0:
        sys.exit('found no true positives given the filters')
    return annotation
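
The vital/useful/neutral switches in load_annotation collapse to a single integer threshold compared against each row's rating; a small sketch of just that mapping, using illustrative ratings only.

def rating_threshold(include_useful, include_neutral):
    ## mirrors the branch above: 0 is the loosest filter (neutral and up),
    ## 1 keeps useful and vital, and 2 (the default) keeps vital only
    if include_neutral:
        return 0
    elif include_useful:
        return 1
    else:
        return 2

## a rating counts as a positive when it meets or exceeds the threshold
thresh = rating_threshold(include_useful=True, include_neutral=False)
for rating in (-1, 0, 1, 2):
    print('rating %d -> positive: %r' % (rating, rating >= thresh))
## prints False, False, True, True
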
Example #6
def score_confusion_matrix_DATE_HOUR(CM, FILL_TPs, annotation, positives,
                                cutoff_step_size=50, unannotated_is_TN=False,
                                debug=False):
    '''
    construct DATE_HOUR_TPs by excluding from FILL_TPs those
    assertions that happen after the first one
    '''
    cutoffs = range(0, 999, cutoff_step_size)

    ## FILL_TPs are already in date_hour order, so we only have to
    ## count the first one for each equiv_id
    seen = set()
    DATE_HOUR_TPs = list()

    log('considering %d unique DATE_HOUR assertions' % len(FILL_TPs))
    for rec in FILL_TPs:
        (stream_id, target_id, conf, rating, contains_mention, 
         date_hour, slot_type, equiv_id, start_byte, end_byte) = rec

        if positives[DATE_HOUR].get(target_id, 0) == 0:
            log('ignoring assertion on entity for which no DATE_HOUR positives are known: %s' % target_id)
            continue

        if equiv_id in seen:
            increment_CM(False, conf=conf, cutoffs=cutoffs, CM=CM, mode=DATE_HOUR, 
                         target_id=target_id, 
                         unannotated_is_TN=unannotated_is_TN)
            continue

        ## this way of filtering is inadequate -- should be giving
        ## partial credit for finding slot fill late
        seen.add(equiv_id)
        DATE_HOUR_TPs.append(rec)

        increment_CM(True, conf=conf, cutoffs=cutoffs, CM=CM, mode=DATE_HOUR, 
                     target_id=target_id, 
                     unannotated_is_TN=unannotated_is_TN)

        for cutoff in CM[DATE_HOUR][target_id]:
            ## Then subtract the number of TP at each cutoff
            ## (since FN+TP==True things in annotation set)
            CM[DATE_HOUR][target_id][cutoff]['FN'] = \
                positives[DATE_HOUR][target_id] - CM[DATE_HOUR][target_id][cutoff]['TP']

    log('considering %d DATE_HOUR_TPs' % len(DATE_HOUR_TPs))
    return CM, DATE_HOUR_TPs
Example #7
def load_annotation(path_to_annotation_file, reject, slot_type_filter=None):
    '''
    Loads the SSF truth data from its JSON format on disk
    
    path_to_annotation_file: string file system path to the JSON annotation file
    
    reject:  callable that returns boolean given a target_id
    '''
    try:
        native_annotation = json.load(open(path_to_annotation_file))
    except Exception, exc:
        sys.exit( 'failed to open %r:\n%s' % (path_to_annotation_file, traceback.format_exc(exc)) )

    for target_id in native_annotation.keys():
        if reject(target_id):
            log('excluding truth data for %s' % target_id)
            native_annotation.pop(target_id)

    ## invert the annotation file to have a stream_id index pointing
    ## to target_ids point to slot_types pointing to slot fills,
    ## instead of the reverse
    annotation = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

    for target_id, slots in native_annotation.items():
        for slot_type, fills in slots.items():
            if slot_type_filter and slot_type != slot_type_filter:
                log('excluding truth data for %s' % slot_type)
                continue
            elif slot_type in ['SignificantOther', 'Children']:
                log('excluding truth data for %s because not part of official slot inventory.  To score this, use --slot-type' % slot_type)
                continue
Example #8
def score_confusion_matrix(path_to_run_file, annotation, cutoff_step, unannotated_is_TN, include_training, debug):
    '''
    This function generates the confusion matrix (number of true/false
    positives and true/false negatives).

    path_to_run_file: str, filesystem path to the run submission
    annotation: dict, containing the annotation data
    cutoff_step: int, increment between cutoffs
    unannotated_is_TN: boolean, true to count unannotated as negatives
    include_training: boolean, true to include training documents

    returns a confusion matrix dictionary for each target_id
    '''
    
    ## Open the run file    
    if path_to_run_file.endswith('.gz'):
        run_file = gzip.open(path_to_run_file, 'r')
    else:
        run_file = open(path_to_run_file, 'r')
        
    ## Create a dictionary containing the confusion matrix (CM)
    cutoffs = range(0, 999, cutoff_step)
    CM = dict()

    ## count the total number of assertions per entity
    num_assertions = {}

    ## Iterate through every row of the run
    for onerow in run_file:
        ## Skip Comments         
        if onerow.startswith('#') or len(onerow.strip()) == 0:
            continue

        row = onerow.split()
        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]
        score = int(float(row[4]))

        if target_id not in num_assertions:
            num_assertions[target_id] = {'total': 0,
                                       'in_TTR': 0,
                                       'in_ETR': 0,
                                       'in_annotation_set': 0}

        ## keep track of total number of assertions per entity
        num_assertions[target_id]['total'] += 1
        if timestamp <= END_OF_FEB_2012:
            num_assertions[target_id]['in_TTR'] += 1
        else:
            num_assertions[target_id]['in_ETR'] += 1

        ## If the entity has not been seen yet, create a confusion matrix for it
        if not target_id in CM:
            CM[target_id] = dict()
            for cutoff in cutoffs:
                CM[target_id][cutoff] = dict(TP=0, FP=0, FN=0, TN=0)     

        if (not include_training) and (timestamp <= END_OF_FEB_2012):
            continue   
        
        in_annotation_set = (stream_id, target_id) in annotation

        if in_annotation_set:
            num_assertions[target_id]['in_annotation_set'] += 1

        
        ## In the annotation set and useful
        if in_annotation_set and annotation[(stream_id, target_id)]:            
            for cutoff in cutoffs:                
                if score > cutoff:
                    ## If above the cutoff: true-positive
                    CM[target_id][cutoff]['TP'] += 1                    
                   
        ## In the annotation set and non-useful                       
        elif in_annotation_set and not annotation[(stream_id, target_id)]:
            for cutoff in cutoffs:
                if score > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1            
        ## Not in the annotation set, so it's a negative (if flag is true)
        elif unannotated_is_TN:
            for cutoff in cutoffs:
                if score > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1    
    
    ## Correct FN for things in the annotation set that are NOT in the run
    ## First, calculate number of true things in the annotation set
    annotation_positives = defaultdict(int)
    for key in annotation:
        stream_id = key[0]
        timestamp = int(stream_id.split('-')[0])

        if (not include_training) and (timestamp <= END_OF_FEB_2012):
            continue 

        target_id = key[1]
        annotation_positives[target_id] += annotation[(stream_id,target_id)]
        
    for target_id in CM:
        for cutoff in CM[target_id]:
            ## Then subtract the number of TP at each cutoff
            ## (since FN+TP==True things in annotation set)
            CM[target_id][cutoff]['FN'] = annotation_positives[target_id] - CM[target_id][cutoff]['TP']

    if debug:
        log( 'showing assertion counts:' )
        log( json.dumps(num_assertions, indent=4, sort_keys=True) )

    return CM
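
The returned CM[target_id][cutoff] cells feed metric functions referenced elsewhere in these examples (performance_metrics, compile_and_average_performance_metrics) but not shown here; the sketch below shows how precision, recall, and F_1 would typically be derived from one such cell, and is not the project's own metric code.

def precision_recall_f1(cell):
    '''cell is one CM[target_id][cutoff] dict of TP/FP/FN/TN counts'''
    tp, fp, fn = cell['TP'], cell['FP'], cell['FN']
    precision = tp / float(tp + fp) if (tp + fp) else 0.0
    recall = tp / float(tp + fn) if (tp + fn) else 0.0
    if precision + recall == 0:
        return precision, recall, 0.0
    return precision, recall, 2 * precision * recall / (precision + recall)

## example cell: 8 hits, 2 false alarms, 4 missed positives
print('%.3f %.3f %.3f' % precision_recall_f1(dict(TP=8, FP=2, FN=4, TN=100)))
## -> 0.800 0.667 0.727
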
Example #9
def score_all_runs(args, description, reject):
    '''
    score all the runs in the specified runs dir using the various
    filters and configuration settings

    :param description: string used for file names
    :param reject: callable to rejects truth data
    '''
    if args.include_neutral:
        thresh = 0
    elif args.include_useful:
        thresh = 1
    else:
        thresh = 2

    ## Load in the annotation data
    annotation = load_annotation(args.annotation, thresh,
                                 args.min_len_clean_visible, reject,
                                 require_positives=args.require_positives
                                 )
    log( 'This assumes that all run file names end in .gz' )
    annotationWriter = open('validassessments.csv', 'w')
    for ((stream_id, target_id) , is_pos) in annotation.iteritems():
        #'dde6ec  1332929640-c50cda6bee1564a599ae620d8918382e     http://en.wikipedia.org/wiki/Atacocha   1000    1       1332929640'
        timestamp = int(stream_id.split('-')[0])
        assessment = 1 if is_pos else 0
        annotationWriter.write('reserved\t%s\t%s\t1000\t%d\t%s\n'%(stream_id, target_id, assessment, timestamp))
    annotationWriter.close()
    #import gc
    #from guppy import hpy
    #hp = hpy()
    
    run_count = 0
    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            print 'ignoring %s because it does not end in .gz' % run_file
            continue
        
        if args.run_name_filter and not run_file.startswith(args.run_name_filter):
            print 'filename filter set to %s, but does not match %s'%(args.run_name_filter, run_file)
            continue

        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log( 'processing: %s.gz' % run_file_name )
        print( 'processing: %s.gz' % run_file_name )

        max_scores = process_run(args, run_file_name, annotation, description, thresh)

        ## split into team name and create stats file
        team_id, system_id = run_file_name.split('-')
        team_scores[team_id][system_id] = max_scores

        #gc.collect()
        #log(str(hp.heap()))

        run_count += 1
        #if run_count > 2:
        #    break

    ## When folder is finished running output a high level summary of the scores to overview.csv
    write_team_summary(description, team_scores)
Example #10
def build_confusion_matrix(path_to_run_file, annotation, cutoff_step, unannotated_is_TN, include_training, debug, thresh=2, require_positives=False):
    '''
    This function generates the confusion matrix (number of true/false
    positives and true/false negatives).

    path_to_run_file: str, filesystem path to the run submission
    annotation: dict, containing the annotation data
    cutoff_step: int, increment between cutoffs
    unannotated_is_TN: boolean, true to count unannotated as negatives
    include_training: boolean, true to include training documents
    thresh: int, minimum assessor rating for an assertion to be scored
    require_positives: boolean, true to skip assertions on entities that
        have no known true positives

    returns a confusion matrix dictionary for each target_id
    '''
    
    ## Open the run file    
    if path_to_run_file.endswith('.gz'):
        run_file = gzip.open(path_to_run_file, 'r')
    else:
        run_file = open(path_to_run_file, 'r')
        
    ## Create a dictionary containing the confusion matrix (CM)
    cutoffs = range(0, 999, cutoff_step)
    CM = dict()

    ## count the total number of assertions per entity
    num_assertions = {}

    num_positives = defaultdict(int)
    for (stream_id, target_id), is_positive in annotation.items():
        ## compute total counts of number of positives for each target_id
        if is_positive:
            num_positives[target_id] += 1

        ## make sure that the confusion matrix has entries for all entities
        if target_id not in CM:
            CM[target_id] = dict()
            for cutoff in cutoffs:
                CM[target_id][cutoff] = dict(TP=0, FP=0, FN=0, TN=0)     

    ## Iterate through every row of the run and construct a
    ## de-duplicated run summary
    run_set = dict()
    for onerow in run_file:
        ## Skip Comments         
        if onerow.startswith('#') or len(onerow.strip()) == 0:
            continue

        row = onerow.split()
        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]
        conf = int(float(row[4]))
        assert 0 < conf <= 1000
        row[4] = conf

        rating = int(row[5])
        assert -1 <= rating <= 2
        row[5] = rating

        #log('ratings:  %r <?> %r' % (rating, thresh))
        if rating < thresh:
            log('ignoring assertion below the rating threshold: %r < %r' % (rating, thresh))
            continue

        if require_positives and num_positives.get(target_id, 0) == 0:
            log('ignoring assertion on entity for which no CCR positives are known: %s' % target_id)
            continue

        assertion_key = (stream_id, target_id)
        if assertion_key in run_set:
            other_row = run_set[assertion_key]
            if other_row[4] > conf:
                log('ignoring a duplicate row with lower conf: %d > %d'
                    % (other_row[4], conf))
                continue

            if other_row[4] == conf:
                ## compare rating level
                if other_row[5] != rating:
                    log('same conf, different rating:\n%r\n%r\ntaking higher rating' % (row, other_row))
                    ## accept higher rating
                    if other_row[5] > rating:
                        continue

        #log('got a row: %r' % (row,))
        run_set[assertion_key] = row

    log('considering %d assertions' % len(run_set))
    run_set = run_set.values()
    while run_set:
        row = run_set.pop()

        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]
        conf = row[4]
        rating = row[5]

        if target_id not in num_assertions:
            num_assertions[target_id] = {'total': 0,
                                       'in_TTR': 0,
                                       'in_ETR': 0,
                                       'in_annotation_set': 0}

        ## keep track of total number of assertions per entity
        num_assertions[target_id]['total'] += 1
        if timestamp <= END_OF_FEB_2012:
            num_assertions[target_id]['in_TTR'] += 1
        else:
            num_assertions[target_id]['in_ETR'] += 1

        if (not include_training) and (timestamp <= END_OF_FEB_2012):
            continue   
        
        in_annotation_set = (stream_id, target_id) in annotation

        if in_annotation_set:
            num_assertions[target_id]['in_annotation_set'] += 1

        
        ## In the annotation set and useful
        if in_annotation_set and annotation[(stream_id, target_id)]:            
            for cutoff in cutoffs:                
                if conf > cutoff:
                    ## If above the cutoff: true-positive
                    CM[target_id][cutoff]['TP'] += 1                    
                   
        ## In the annotation set and non-useful                       
        elif in_annotation_set and not annotation[(stream_id, target_id)]:
            for cutoff in cutoffs:
                if conf > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1            
        ## Not in the annotation set, so it's a negative (if flag is true)
        elif unannotated_is_TN:
            for cutoff in cutoffs:
                if conf > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1    
    
    ## Correct FN for things in the annotation set that are NOT in the run
    ## First, calculate number of true things in the annotation set
    annotation_positives = defaultdict(int)
    for stream_id, target_id in annotation:
        timestamp = int(stream_id.split('-')[0])

        if (not include_training) and (timestamp <= END_OF_FEB_2012):
            continue 

        annotation_positives[target_id] += int(annotation[(stream_id,target_id)])
        
    for target_id in CM:
        for cutoff in CM[target_id]:
            ## Then subtract the number of TP at each cutoff
            ## (since FN+TP==True things in annotation set)
            #log('annotation_positives[%s] = %d' % (target_id, annotation_positives[target_id]))
            #log('CN[%s][cutoff=%d] = %r' % (target_id, cutoff, CM[target_id][cutoff]))

            CM[target_id][cutoff]['FN'] = annotation_positives[target_id] - CM[target_id][cutoff]['TP']

            #log('CN[%s][cutoff=%d] = %r' % (target_id, cutoff, CM[target_id][cutoff]))
            assert annotation_positives[target_id] >= CM[target_id][cutoff]['TP'], \
                "how did we get more TPs than available annotation_positives[target_id=%s] = %d >= %d = CM[target_id][cutoff=%f]['TP']" \
                % (target_id, annotation_positives[target_id], CM[target_id][cutoff]['TP'], cutoff)

    log( 'showing assertion counts:' )
    log( json.dumps(num_assertions, indent=4, sort_keys=True) )

    return CM
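
The de-duplication pass in build_confusion_matrix keeps one row per (stream_id, target_id), preferring higher conf and, on ties, the higher rating; a standalone sketch of just that preference rule, with a made-up pair of rows.

def keep_better(existing, candidate):
    '''return the preferred of two run rows for the same
    (stream_id, target_id) key, comparing conf (index 4) first and
    rating (index 5) on ties -- a sketch of the rule, not the
    scorer's own code'''
    if candidate[4] > existing[4]:
        return candidate
    if candidate[4] == existing[4] and candidate[5] > existing[5]:
        return candidate
    return existing

row_a = ['team', 'q', '1328057520-abc', 'ent1', 700, 1]
row_b = ['team', 'q', '1328057520-abc', 'ent1', 700, 2]
print(keep_better(row_a, row_b))   ## row_b wins: equal conf, higher rating
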
Example #11
def load_annotation(path_to_annotation_file, thresh, min_len_clean_visible, reject, require_positives=False, any_up=False):
    '''
    Loads the annotation file into a dict
    
    path_to_annotation_file: string filesystem path to the annotation file
    thresh: int, minimum assessor rating treated as a true positive
    (0 also keeps neutral, 1 keeps useful and vital, 2 keeps vital only)

    :param min_len_clean_visible: minimum length of the clean_visible,
    which is in the twelfth column of the expanded truth data file

    :param reject:  callable that returns boolean given a target_id

    :param require_positives: if set to True, reject any target entity
    for which no true positives exist.
    '''
    assert -1 <= thresh <= 2, thresh

    annotation_file = csv.reader(open(path_to_annotation_file, 'r'), delimiter='\t')

    annotation = dict()
    for row in annotation_file:
       ## Skip comments
       if row[0][0] == "#":
           continue 
       
       stream_id = row[2]
       target_id = row[3]
       rating = int(row[5])
       assert -1 <= rating <=2, rating

       if len(row) == 12:
           ## only the later versions of the truth data carried this
           ## twelfth column for excluding documents with insufficient
           ## clean_visible to be judged.  We use a default cutoff of
           ## 100 bytes which means removing these counts below:
           #              (stream_id, target_id) pairs:  34921 above, and 15767 below 100 bytes of clean_visible
           # (assessor_id, stream_id, target_id) pairs:  47446 above, and 19948 below 100 bytes of clean_visible
           len_clean_visible = int(row[11])
           if len_clean_visible < min_len_clean_visible:
               log('excluding stream_id=%s for len(clean_visible)=%d' % (stream_id, len_clean_visible))
               continue

       if reject(target_id):
           log('excluding truth data for %s' % target_id)
           continue

       ## Add the stream_id and target_id to a hashed dictionary
       ## 0 means that it's not vital, 1 means that it is vital
              
       if (stream_id, target_id) in annotation:
           ## if rating is below threshold, then some assessor viewed
           ## it as not good enough, so be conservative and downgrade
           if not any_up and rating < thresh:
               ## default any_up=False means that if *any* assessor
               ## voted *against* the assertion, then *exclude* it
               annotation[(stream_id, target_id)] = False

           elif any_up and rating >= thresh:
               ## any_up means that if *any* assessor voted *for* the
               ## assertion, then *include* it
               annotation[(stream_id, target_id)] = True
       else:
           ## store bool values in the annotation index
           annotation[(stream_id, target_id)] = rating >= thresh 

    has_true = set()
    for (stream_id, target_id), is_true in annotation.items():
        if is_true:
            has_true.add(target_id)

    if require_positives:
        for stream_id, target_id in annotation.keys():
            if target_id not in has_true:
                log('rejecting %s for lack of any true positives -- because require_positives=True' % target_id)
                annotation.pop( (stream_id, target_id) )

    log('%d target_ids have at least one true positive' % len(has_true))

    num_true = sum(map(int, annotation.values()))
    log('loaded annotation to create a dict of %d (stream_id, target_id) pairs with %d True' % (len(annotation), num_true))
    if num_true == 0:
        sys.exit('found no true positives given the filters')
    return annotation
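
When several assessor rows cover the same (stream_id, target_id) pair, the default policy is pessimistic (any rating below thresh demotes the pair to False) while any_up=True is optimistic (any rating at or above thresh promotes it to True); a compact sketch of both policies over a list of ratings, illustrative only.

def aggregate_ratings(ratings, thresh, any_up=False):
    '''collapse several assessor ratings for one (stream_id, target_id)
    pair into one boolean -- illustrative only'''
    if any_up:
        return any(r >= thresh for r in ratings)
    return all(r >= thresh for r in ratings)

ratings = [2, 1, 2]   ## two assessors said vital, one said useful
print(aggregate_ratings(ratings, thresh=2))               ## False (default, pessimistic)
print(aggregate_ratings(ratings, thresh=2, any_up=True))  ## True  (any_up, optimistic)
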
Example #12
        help='beginning of string of filename to filter runs that get considered')
    args = parser.parse_args()

    accepted_target_ids = set()
    if args.group or args.entity_type:
        if not args.topics_path:
            sys.exit('must specify --topics-path to use --group')
        targets = json.load(open(args.topics_path))['targets']
        for targ in targets:
            if targ['group'] == args.group or targ['entity_type'] == args.entity_type:
                accepted_target_ids.add(targ['target_id'])
    
    description = make_description(args)

    ## construct reject callable
    def reject(target_id):
        if args.reject_twitter and 'twitter.com' in target_id:
            return True
        if args.reject_wikipedia and 'wikipedia.org' in target_id:
            return True
        if args.group or args.entity_type:
            if target_id not in accepted_target_ids:
                return True  ## i.e. reject it
        return False

    score_all_runs(args, description, reject)

    elapsed = time.time() - start_time
    log('finished after %d seconds at %r'
        % (elapsed, datetime.utcnow()))
Example #13
def build_confusion_matrix(path_to_run_file,
                           annotation,
                           cutoff_step,
                           unannotated_is_TN,
                           include_training,
                           debug,
                           thresh=2,
                           require_positives=0):
    '''
    This function generates the confusion matrix (number of true/false
    positives and true/false negatives).

    path_to_run_file: str, filesystem path to the run submission
    annotation: dict, containing the annotation data from *after* the cutoff
    cutoff_step: int, increment between cutoffs
    unannotated_is_TN: boolean, true to count unannotated as negatives
    include_training: boolean, true to include training documents
    thresh: int, minimum assessor rating for an assertion to be scored
    require_positives: int, minimum number of known true positives an
        entity must have for its assertions to be scored

    returns a confusion matrix dictionary for each target_id
    '''

    ## Open the run file
    if path_to_run_file.endswith('.gz'):
        run_file = gzip.open(path_to_run_file, 'r')
    else:
        run_file = open(path_to_run_file, 'r')

    ## Create a dictionary containing the confusion matrix (CM)
    cutoffs = range(0, 999, cutoff_step)
    CM = dict()

    ## count the total number of assertions per entity
    num_assertions = {}

    num_positives = defaultdict(int)
    for (stream_id, target_id), is_positive in annotation.items():
        ## compute total counts of number of positives for each target_id
        if is_positive:
            num_positives[target_id] += 1

        ## make sure that the confusion matrix has entries for all entities
        if target_id not in CM:
            CM[target_id] = dict()
            for cutoff in cutoffs:
                CM[target_id][cutoff] = dict(TP=0, FP=0, FN=0, TN=0)

    ## Iterate through every row of the run and construct a
    ## de-duplicated run summary
    run_set = dict()
    for onerow in run_file:
        ## Skip Comments
        if onerow.startswith('#') or len(onerow.strip()) == 0:
            continue

        row = onerow.split()
        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]
        conf = int(float(row[4]))
        assert 0 < conf <= 1000
        row[4] = conf

        rating = int(row[5])
        assert -1 <= rating <= 2
        row[5] = rating

        #log('ratings:  %r <?> %r' % (rating, thresh))
        if rating < thresh:
            log('ignoring assertion below the rating threshold: %r < %r' %
                (rating, thresh))
            continue

        if num_positives.get(target_id, 0) < require_positives:
            log('ignoring assertion on entity for which no CCR positives are known: %s'
                % target_id)
            continue

        assertion_key = (stream_id, target_id)
        if assertion_key in run_set:
            other_row = run_set[assertion_key]
            if other_row[4] > conf:
                log('ignoring a duplicate row with lower conf: %d > %d' %
                    (other_row[4], conf))
                continue

            if other_row[4] == conf:
                ## compare rating level
                if other_row[5] != rating:
                    log('same conf, different rating:\n%r\n%r\ntaking higher rating'
                        % (row, other_row))
                    ## accept higher rating
                    if other_row[5] > rating:
                        continue

        #log('got a row: %r' % (row,))
        run_set[assertion_key] = row

    log('considering %d assertions' % len(run_set))
    run_set = run_set.values()
    while run_set:
        row = run_set.pop()

        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]
        conf = row[4]
        rating = row[5]

        if target_id not in num_assertions:
            num_assertions[target_id] = {
                'total': 0,
                'in_ETR': 0,
                'in_annotation_set': 0
            }

        ## keep track of total number of assertions per entity
        num_assertions[target_id]['total'] += 1
        num_assertions[target_id]['in_ETR'] += 1

        in_annotation_set = (stream_id, target_id) in annotation

        if in_annotation_set:
            num_assertions[target_id]['in_annotation_set'] += 1

        ## In the annotation set and useful
        if in_annotation_set and annotation[(stream_id, target_id)]:
            for cutoff in cutoffs:
                if conf > cutoff:
                    ## If above the cutoff: true-positive
                    CM[target_id][cutoff]['TP'] += 1

        ## In the annotation set and non-useful
        elif in_annotation_set and not annotation[(stream_id, target_id)]:
            for cutoff in cutoffs:
                if conf > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1
        ## Not in the annotation set, so it's a negative (if flag is true)
        elif unannotated_is_TN:
            for cutoff in cutoffs:
                if conf > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1

    ## Correct FN for things in the annotation set that are NOT in the run
    ## First, calculate number of true things in the annotation set
    annotation_positives = defaultdict(int)
    for stream_id, target_id in annotation:
        timestamp = int(stream_id.split('-')[0])

        annotation_positives[target_id] += int(annotation[(stream_id,
                                                           target_id)])

    for target_id in CM:
        for cutoff in CM[target_id]:
            ## Then subtract the number of TP at each cutoff
            ## (since FN+TP==True things in annotation set)
            #log('annotation_positives[%s] = %d' % (target_id, annotation_positives[target_id]))
            #log('CN[%s][cutoff=%d] = %r' % (target_id, cutoff, CM[target_id][cutoff]))

            CM[target_id][cutoff]['FN'] = annotation_positives[target_id] - CM[
                target_id][cutoff]['TP']

            #log('CN[%s][cutoff=%d] = %r' % (target_id, cutoff, CM[target_id][cutoff]))
            assert annotation_positives[target_id] >= CM[target_id][cutoff]['TP'], \
                "how did we get more TPs than available annotation_positives[target_id=%s] = %d >= %d = CM[target_id][cutoff=%f]['TP']" \
                % (target_id, annotation_positives[target_id], CM[target_id][cutoff]['TP'], cutoff)

    log('showing assertion counts:')
    log(json.dumps(num_assertions, indent=4, sort_keys=True))

    return CM
Example #14
def score_confusion_matrix_FILL(CM, OVERLAP_TPs, annotation, positives,
                           unannotated_is_TN=False,
                           cutoff_step_size=50, debug=False):
    '''
    construct FILL_TPs by excluding from OVERLAP_TPs those assertions
    that either:

       1) re-use an earlier (run)equiv_id that was not associated with
       the same (truth)equiv_id from the truth set

       2) fail to re-use an earlier (run)equiv_id that _was_
       associated with a (truth)equiv_id from the truth set

    '''
    cutoffs = range(0, 999, cutoff_step_size)

    FILL_TPs = dict()

    runs_to_true = dict()
    true_to_runs = dict()

    log('considering %d unique FILL assertions' % len(OVERLAP_TPs))
    for rec in OVERLAP_TPs:
        (stream_id, target_id, conf, rating, contains_mention, date_hour, 
         slot_type, (runs_equiv_id, true_equiv_id), start_byte, end_byte) = rec

        if positives[FILL].get(target_id, 0) == 0:
            log('ignoring assertion on entity for which no FILL positives are known: %s' % target_id)
            continue

        ## this is a tri-state variable
        FILL_correct = None

        if runs_equiv_id not in runs_to_true and true_equiv_id not in true_to_runs:
            runs_to_true[runs_equiv_id] = true_equiv_id
            true_to_runs[true_equiv_id] = runs_equiv_id

        else:

            ## check failure mode #1 in __doc__ string
            if runs_equiv_id in runs_to_true:
                ## run has previously asserted this equiv_id
                if true_equiv_id == runs_to_true[runs_equiv_id]:
                    FILL_correct = True
                else:
                    FILL_correct = False

            ## check failure mode #2 in __doc__ string
            if true_equiv_id in true_to_runs:
                if runs_equiv_id == true_to_runs[true_equiv_id]:
                    if FILL_correct is not False:
                        FILL_correct = True
                else:
                    FILL_correct = False

        if FILL_correct in [True, None]:

            assertion_key = (stream_id, target_id, slot_type, true_equiv_id)
            if assertion_key in FILL_TPs:
                other_row = FILL_TPs[assertion_key]
                if other_row[4] > conf:
                    log('ignoring a duplicate row with lower conf: %d > %d'
                        % (other_row[4], conf))
                    continue

            FILL_TPs[assertion_key] = rec

        increment_CM(FILL_correct, conf=conf, cutoffs=cutoffs, CM=CM, mode=FILL, 
                     target_id=target_id, 
                     unannotated_is_TN=unannotated_is_TN)

    correct_FN(CM, FILL, positives)

    FILL_TPs = FILL_TPs.values()

    return CM, FILL_TPs
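
The FILL check above enforces a one-to-one mapping between the run's equiv_ids and the truth set's; below is a sketch of just that bookkeeping, returning the same tri-state value as FILL_correct, with made-up ids.

def check_fill(runs_equiv_id, true_equiv_id, runs_to_true, true_to_runs):
    '''returns True/False/None like FILL_correct above: None the first
    time a pair is seen, otherwise consistency with earlier pairs --
    a sketch of the bookkeeping, not the scorer's own code'''
    if runs_equiv_id not in runs_to_true and true_equiv_id not in true_to_runs:
        runs_to_true[runs_equiv_id] = true_equiv_id
        true_to_runs[true_equiv_id] = runs_equiv_id
        return None
    correct = None
    if runs_equiv_id in runs_to_true:
        correct = (runs_to_true[runs_equiv_id] == true_equiv_id)
    if true_equiv_id in true_to_runs:
        if true_to_runs[true_equiv_id] != runs_equiv_id:
            correct = False
        elif correct is not False:
            correct = True
    return correct

r2t, t2r = {}, {}
print(check_fill('sys-1', 'truth-A', r2t, t2r))   ## None: first sighting
print(check_fill('sys-1', 'truth-B', r2t, t2r))   ## False: run id re-used for a new truth id
print(check_fill('sys-1', 'truth-A', r2t, t2r))   ## True: consistent re-use
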
Example #15
def score_confusion_matrix_DOCS(run_file_handle, annotation, positives,
                           cutoff_step_size=50, unannotated_is_TN=False, debug=False):
    '''
    read a run submission and generate a confusion matrix (number of
    true/false positives and true/false negatives) for DOCS mode
    evaluation.  Generate a confusion matrix for each cutoff step and
    each mode.
    
    run_file_handle: file-like object, an open handle on the run submission
    annotation: dict, containing the annotation data
    cutoff_step_size: int, increment between cutoffs
    unannotated_is_TN: boolean, true to count unannotated as negatives
    
    returns a confusion matrix dictionary for each target_id 
    '''
    ## Create a dictionary containing the confusion matrix (CM)
    cutoffs = range(0, 999, cutoff_step_size)

    def init_confusion_matrix():
        return dict(TP=0, FP=0, FN=0, TN=0)

    ## confusion matrix is mode-->target_id-->cutoff-->2-by-2 matrix
    CM = {mode: defaultdict(lambda: defaultdict(init_confusion_matrix))
          for mode in MODES}

    for stream_id in annotation:
        for target_id in annotation[stream_id]:
            for mode in MODES:
                ## make sure that the confusion matrix has entries for all entities
                if target_id not in CM[mode]:
                    CM[mode][target_id] = dict()
                    for cutoff in cutoffs:
                        CM[mode][target_id][cutoff] = dict(TP=0, FP=0, FN=0, TN=0)

    ## count the total number of assertions per entity
    num_assertions = {}

    ## keep assertions that are in the annotation set, because this is
    ## much smaller than the entire run submission.  We will pass this
    ## to the four evaluation steps beyond DOCS.
    DOCS_TPs = list()

    ## Iterate through every row of the run and construct a
    ## de-duplicated run summary
    run_set = dict()
    for assertion_key, row in assertions(run_file_handle):
        conf = row[4]

        stream_id, target_id, slot_type = assertion_key
        if positives[DOCS].get(target_id, 0) == 0:
            #log('ignoring assertion on entity for which no DOCS positives are known: %s' % target_id)
            continue

        if assertion_key in run_set:
            other_row = run_set[assertion_key]
            if other_row[4] > conf:
                log('ignoring a duplicate row with lower conf: %d > %d'
                    % (other_row[4], conf))
                continue

        #log('got a row: %r' % (row,))
        run_set[assertion_key] = row

    log('considering %d unique DOCS assertions' % len(run_set))
    for row in run_set.values():

        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]
        conf = row[4]
        rating = row[5]

        contains_mention = int(row[6])
        date_hour = row[7]
        slot_type = row[8]
        equiv_id = row[9]
        start_byte, end_byte = row[10].split('-')
        start_byte = int(start_byte)
        end_byte = int(end_byte)


        if target_id not in num_assertions:
            num_assertions[target_id] = {'total': 0,
                                         'is_annotated_TP': 0}

        ## keep track of total number of assertions per entity
        num_assertions[target_id]['total'] += 1
        
        ## all modes start with DOCS, so is_annotated_TP means that
        ## the system has a DOCS-TP above some conf threshold
        is_annotated_TP = False
        if stream_id in annotation:
            if target_id in annotation[stream_id]:
                if slot_type in annotation[stream_id][target_id]:
                    is_annotated_TP = True
                    rec = (stream_id, target_id, conf, rating, contains_mention, date_hour, slot_type, equiv_id, start_byte, end_byte)
                    DOCS_TPs.append( rec )
                    #log('TP: %r' % (rec,))

        if is_annotated_TP:
            num_assertions[target_id]['is_annotated_TP'] += 1

        increment_CM(is_annotated_TP, conf=conf, cutoffs=cutoffs, CM=CM, 
                     mode=DOCS, 
                     target_id=target_id, unannotated_is_TN=unannotated_is_TN)

    correct_FN(CM, DOCS, positives)

    if debug:
        print 'showing assertion counts:'
        print json.dumps(num_assertions, indent=4, sort_keys=True)

    ## sort by date_hour
    DOCS_TPs.sort(key=itemgetter(5))

    return CM, DOCS_TPs
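
increment_CM and correct_FN are called throughout the SSF scorers but are not defined in these excerpts; the following is a hypothetical reconstruction inferred from the call sites and from the inline confusion-matrix updates in build_confusion_matrix, not the project's actual helpers.

def increment_CM(is_TP, conf, cutoffs, CM, mode, target_id,
                 unannotated_is_TN=False):
    '''hypothetical sketch: walk every cutoff and bump the appropriate
    TP/FP/TN cell for one assertion, mirroring the inline updates in
    build_confusion_matrix'''
    for cutoff in cutoffs:
        cell = CM[mode][target_id][cutoff]
        if is_TP:
            if conf > cutoff:
                cell['TP'] += 1
        elif is_TP is False:
            if conf > cutoff:
                cell['FP'] += 1
            else:
                cell['TN'] += 1
        elif unannotated_is_TN:
            ## is_TP is None: treat unannotated assertions as negatives
            if conf > cutoff:
                cell['FP'] += 1
            else:
                cell['TN'] += 1

def correct_FN(CM, mode, positives):
    '''hypothetical sketch: FN + TP must equal the number of known
    positives for each target_id, so recompute FN at every cutoff'''
    for target_id in CM[mode]:
        for cutoff in CM[mode][target_id]:
            cell = CM[mode][target_id][cutoff]
            cell['FN'] = positives[mode].get(target_id, 0) - cell['TP']
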
Example #16
def score_all_runs(args, description, reject):
    '''
    score all the runs in the specified runs dir using the various
    filters and configuration settings

    :param description: string used for file names
    :param reject: callable to rejects truth data
    '''
    ## Load in the annotation data
    annotation = load_annotation(args.annotation, args.include_useful, args.include_neutral, 
                                 args.min_len_clean_visible, reject)
    log( 'This assumes that all run file names end in .gz' )

    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            continue
        
        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log( 'processing: %s.gz' % run_file_name )
        
        ## Generate the confusion matrix for a run
        CM = score_confusion_matrix(
            os.path.join(args.run_dir, run_file), 
            annotation, args.cutoff_step, args.unan_is_true, args.include_training,
            debug=args.debug)

        ## Generate performance metrics for a run
        Scores = performance_metrics(CM)
        
        ## Generate the average metrics
        (CM['average'], Scores['average']) = full_run_metrics(CM, Scores, args.use_micro_averaging)

        max_scores = find_max_scores(Scores)

        ## split into team name and create stats file
        team_id, system_id = run_file_name.split('-')
        team_scores[team_id][system_id] = max_scores

        ## Print the top F-Score
        log( '   max(avg(F_1)): %.3f' % max_scores['average']['F'] )
        log( '   max(F_1(avg(P), avg(R))): %.3f' % max_scores['average']['F_recomputed'] )
        log( '   max(avg(SU)):  %.3f' % max_scores['average']['SU'] )
        
        base_output_filepath = os.path.join(
            args.run_dir, 
            run_file_name + '-' + description)

        output_filepath = base_output_filepath + '.csv'
        write_performance_metrics(output_filepath, CM, Scores)
        log( ' wrote metrics table to %s' % output_filepath )
        
        if not plt:
            log( ' not generating plot, because could not import matplotlib' )
        else:
            ## Output a graph of the key performance statistics
            graph_filepath = base_output_filepath + '.png'
            write_graph(graph_filepath, Scores['average'])
            log( ' wrote plot image to %s' % graph_filepath )

    ## When folder is finished running output a high level summary of the scores to overview.csv
    write_team_summary(description, team_scores)
Example #17
                    pooled_assertion_keys=None):
    '''
    Loads the SSF truth data from its JSON format on disk
    
    path_to_annotation_file: string file system path to the JSON annotation file
    
    reject:  callable that returns boolean given a target_id
    '''
    try:
        native_annotation = json.load(open(path_to_annotation_file))
    except Exception, exc:
        sys.exit( 'failed to open %r:\n%s' % (path_to_annotation_file, traceback.format_exc(exc)) )

    for target_id in native_annotation.keys():
        if reject(target_id):
            log('excluding truth data for %s' % target_id)
            native_annotation.pop(target_id)

    ## invert the annotation file to have a stream_id index pointing
    ## to target_ids point to slot_types pointing to slot fills,
    ## instead of the reverse
    # stream_id --> target_id --> slot_types
    annotation = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

    unofficial_slots = ['SignificantOther', 'Children']

    for target_id, slots in native_annotation.items():
        for slot_type, fills in slots.items():
            if slot_type_filter and slot_type != slot_type_filter:
                log('excluding truth data for %s' % slot_type)
                continue
Example #18
def score_all_runs(args, description, reject):
    '''
    score all the runs in the specified runs dir using the various
    filters and configuration settings

    :param description: string used for file names
    :param reject: callable to rejects truth data
    '''
    if args.include_neutral:
        thresh = 0
    elif args.include_useful:
        thresh = 1
    else:
        thresh = 2

    ## Load in the annotation data
    annotation = load_annotation(
        args.annotation,
        thresh,
        args.min_len_clean_visible,
        reject,
        require_positives=args.require_positives,
        any_up=args.any_up,
        restricted_entity_list=args.restricted_entity_list,
    )
    log('This assumes that all run file names end in .gz')

    #import gc
    #from guppy import hpy
    #hp = hpy()

    run_count = 0
    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            continue

        if args.run_name_filter and not run_file.startswith(
                args.run_name_filter):
            continue

        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log('processing: %s.gz' % run_file_name)

        try:
            max_scores = process_run(args, run_file_name, annotation,
                                     description, thresh)

            ## split into team name and create stats file
            team_id, system_id = run_file_name.split('-')
            team_scores[team_id][system_id] = max_scores

        except Exception, exc:
            logger.critical('died on %s', run_file_name, exc_info=True)
            sys.exit(str(exc))

        #gc.collect()
        #log(str(hp.heap()))

        run_count += 1
Example #19
    accepted_target_ids = set()
    if args.group or args.entity_type:
        if not args.topics_path:
            sys.exit('must specify --topics-path to use --group')
        targets = json.load(open(args.topics_path))['targets']
        for targ in targets:
            if ('group' in targ and targ.get('group')
                    == args.group) or targ['entity_type'] == args.entity_type:
                accepted_target_ids.add(targ['target_id'])

    if args.restricted_entity_list:
        args.restricted_entity_list = set(
            open(args.restricted_entity_list).read().splitlines())
        log('loaded %d entities into restricted_entity_list:\n%s' %
            (len(args.restricted_entity_list), '\n'.join(
                args.restricted_entity_list)))

    description = make_description(args)

    ## construct reject callable
    def reject(target_id):
        if args.reject_twitter and 'twitter.com' in target_id:
            return True
        if args.reject_wikipedia and 'wikipedia.org' in target_id:
            return True
        if args.group or args.entity_type:
            if target_id not in accepted_target_ids:
                return True  ## i.e. reject it
        return False
Example #20
def score_confusion_matrix_DOCS(run_file_handle, annotation, positives,
                           cutoff_step_size=50, unannotated_is_TN=False, debug=False):
    '''
    read a run submission and generate a confusion matrix (number of
    true/false positives and true/false negatives) for DOCS mode
    evaluation.  Generate a confusion matrix for each cutoff step and
    each mode.
    
    run_file_handle: file-like object, an open handle on the run submission
    annotation: dict, containing the annotation data
    cutoff_step_size: int, increment between cutoffs
    unannotated_is_TN: boolean, true to count unannotated as negatives
    
    returns a confusion matrix dictionary for each target_id 
    '''
    ## Create a dictionary containing the confusion matrix (CM)
    cutoffs = range(0, 999, cutoff_step_size)

    def init_confusion_matrix():
        return dict(TP=0, FP=0, FN=0, TN=0)

    ## confusion matrix is mode-->target_id-->cutoff-->2-by-2 matrix
    CM = {mode: defaultdict(lambda: defaultdict(init_confusion_matrix))
          for mode in MODES}

    ## count the total number of assertions per entity
    num_assertions = {}

    ## keep assertions that are in the annotation set, because this is
    ## much smaller than the entire run submission.  We will pass this
    ## to the four evaluation steps beyond DOCS.
    DOCS_TPs = list()

    ## Iterate through every row of the run
    for onerow in run_file_handle:
        ## Skip Comments         
        if onerow.startswith('#') or len(onerow.strip()) == 0:
            continue

        row = onerow.split()
        assert len(row) == 11, row
        try:
            stream_id = row[2]
            timestamp = int(stream_id.split('-')[0])
            target_id = row[3]
            conf = int(float(row[4]))

            rating = int(row[5])
            contains_mention = int(row[6])
            date_hour = row[7]
            slot_type = row[8]
            equiv_id = row[9]
            start_byte, end_byte = row[10].split('-')
            start_byte = int(start_byte)
            end_byte = int(end_byte)

        except Exception, exc:
            print repr(row)
            sys.exit(traceback.format_exc(exc))

        if target_id not in num_assertions:
            num_assertions[target_id] = {'total': 0,
                                         'is_annotated_TP': 0}

        ## keep track of total number of assertions per entity
        num_assertions[target_id]['total'] += 1
        
        ## all modes start with DOCS, so is_annotated_TP means that
        ## the system has a DOCS-TP above some conf threshold
        is_annotated_TP = False
        if stream_id in annotation:
            if target_id in annotation[stream_id]:
                if slot_type in annotation[stream_id][target_id]:
                    is_annotated_TP = True
                    rec = (stream_id, target_id, conf, rating, contains_mention, date_hour, slot_type, equiv_id, start_byte, end_byte)
                    DOCS_TPs.append( rec )
                    log('TP: %r' % (rec,))

        if is_annotated_TP:
            num_assertions[target_id]['is_annotated_TP'] += 1

        increment_CM(is_annotated_TP, conf, cutoffs, CM, DOCS, target_id, unannotated_is_TN)
Example #21
def ssf_runs(args):
    '''
    yield file handles for all of the SSF runs
    '''

    log( 'This assumes that all run file names end in .gz' )
    run_count = 0
    for run_file_name in os.listdir(args.run_dir):
        if not run_file_name.endswith('.gz'):
            log( 'ignoring: %s' % run_file_name )
            continue

        if args.run_name_filter and not run_file_name.startswith(args.run_name_filter):
            log( 'ignoring: %s' % run_file_name)
            continue

        ## Open the run file    
        run_file_path = os.path.join(args.run_dir, run_file_name)
        if run_file_path.endswith('.gz'):
            run_file_handle = gzip.open(run_file_path, 'r')
        else:
            run_file_handle =      open(run_file_path, 'r')

        first_line = run_file_handle.readline()
        assert first_line.startswith('#')
        try:
            filter_run = json.loads(first_line[1:])
        except:
            sys.exit('failed to get JSON out of: %r' % first_line[1:])
        
        ### many CCR runs, including some from the organizers, have task_id
        ### set to SSF :-(, so we must detect this.
        ## read to first non-comment line
        second_line = None
        while not second_line:
            second_line = run_file_handle.readline()
            if second_line.strip().startswith('#'):
                second_line = None

        if 'NULL' in second_line or filter_run['task_id'] != 'kba-ssf-2013':
            log( 'ignoring non-SSF run: %s' % run_file_name )
            continue

        ## Open run file again now that we verified it is SSF
        run_file_path = os.path.join(args.run_dir, run_file_name)
        if run_file_path.endswith('.gz'):
            run_file_handle = gzip.open(run_file_path, 'r')
        else:
            run_file_handle =      open(run_file_path, 'r')

        log( 'processing: %s' % run_file_name )
        log( json.dumps(filter_run, indent=4, sort_keys=True) )

        yield run_file_name, run_file_handle
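
## A minimal usage sketch of the generator above; the consumer function
## name is hypothetical, and it assumes only the args.run_dir and
## args.run_name_filter attributes that ssf_runs itself reads.
def sketch_iterate_ssf_runs(args):
    for run_file_name, run_file_handle in ssf_runs(args):
        ## ssf_runs has already verified the '#'-prefixed JSON header,
        ## e.g. a first line of the form  #{"task_id": "kba-ssf-2013", ...}
        for line in run_file_handle:
            if line.startswith('#') or not line.strip():
                continue
            row = line.split()
            assert len(row) == 11, row  ## same 11-column layout as above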
Example #22
0
def score_confusion_matrix_OVERLAP(CM, DOCS_TPs, annotation, positives,
                                cutoff_step_size=50, unannotated_is_TN=False,
                                debug=False):
    '''
    construct OVERLAP_TPs by excluding from DOCS_TPs those assertions
    that do not overlap any string identified by an assessor
    '''
    cutoffs = range(0, 999, cutoff_step_size)

    OVERLAP_TPs = dict()

    log('considering %d DOCS true-positive assertions for OVERLAP matching' % len(DOCS_TPs))
    for rec in DOCS_TPs:
        (stream_id, target_id, conf, rating, contains_mention, 
         date_hour, slot_type, runs_equiv_id, start_byte, end_byte) = rec

        if positives[OVERLAP].get(target_id, 0) == 0:
            log('ignoring assertion on entity for which no OVERLAP positives are known: %s' % target_id)
            continue

        start_byte = int(start_byte)
        end_byte = int(end_byte)

        for true_equiv_id, equiv_class in annotation[stream_id][target_id][slot_type].items():
            offsets = equiv_class['stream_ids'][stream_id][1]
            overlaps = False
            for offset in offsets:
                assert isinstance(offset[0], int)
                assert isinstance(offset[1], int)

                ## we could/should be much stricter here, 10x is a big window
                true_len = offset[1] - offset[0]
                runs_len = end_byte - start_byte
                if start_byte <= offset[1] and end_byte >= offset[0] and runs_len < 10 * true_len:
                    overlaps = True
                    break

            #log('(%d, %d) compared to offsets %r\n' % (start_byte, end_byte, offsets))

            if not overlaps:
                increment_CM(False, conf=conf, cutoffs=cutoffs, CM=CM,
                             mode=OVERLAP,
                             target_id=target_id, unannotated_is_TN=unannotated_is_TN)
                ## without an overlap this assertion cannot be an OVERLAP TP
                continue

            #log('found one!!  system equiv_id (%r) --> assessors equiv_id (%r)'
            #    % (runs_equiv_id, true_equiv_id))
            rec = list(rec)
            rec[7] = (runs_equiv_id, true_equiv_id)
            rec = tuple(rec)

            assertion_key = (stream_id, target_id, slot_type, start_byte, end_byte)
            if assertion_key in OVERLAP_TPs:
                other_row = OVERLAP_TPs[assertion_key]
                ## conf is the third field of each rec tuple
                if other_row[2] > conf:
                    log('ignoring a duplicate row with lower conf: %d > %d'
                        % (other_row[2], conf))
                    continue
                    continue

            OVERLAP_TPs[assertion_key] = rec

            increment_CM(True, conf=conf, cutoffs=cutoffs, CM=CM, 
                         mode=OVERLAP, 
                         target_id=target_id, unannotated_is_TN=unannotated_is_TN)

    correct_FN(CM, OVERLAP, positives)

    if OVERLAP_TPs:
        assert CM[OVERLAP]

    OVERLAP_TPs = OVERLAP_TPs.values()

    return CM, OVERLAP_TPs
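
## A minimal sketch of the byte-range overlap test used above, pulled out
## as a standalone helper for clarity; the 10x length window mirrors the
## inline check, and the helper name and example offsets are made up.
def sketch_overlaps(start_byte, end_byte, offset, max_len_ratio=10):
    '''True if [start_byte, end_byte] intersects the assessor offset pair
    and is not more than max_len_ratio times longer than the true span.'''
    true_len = offset[1] - offset[0]
    runs_len = end_byte - start_byte
    return (start_byte <= offset[1] and end_byte >= offset[0]
            and runs_len < max_len_ratio * true_len)

assert sketch_overlaps(100, 120, (110, 130))
assert not sketch_overlaps(100, 120, (200, 230))
assert not sketch_overlaps(0, 5000, (110, 130))   ## rejected by the 10x window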
Example #23
0
def load_annotation(path_to_annotation_file,
                    thresh,
                    min_len_clean_visible,
                    reject,
                    require_positives=False,
                    any_up=False,
                    restricted_entity_list=None):
    '''Loads the annotation file into a dict keyed on (stream_id, target_id)

    :param path_to_annotation_file: string filesystem path to the
    annotation file

    :param thresh: minimum rating for a judged document to count as a
    true positive (rating >= thresh), e.g. thresh=1 includes docs
    marked useful and vital

    :param min_len_clean_visible: minimum length of the clean_visible
    text, which is in the twelfth column of the expanded truth data
    file

    :param reject: callable that returns a boolean given a target_id;
    entities for which it returns True are excluded

    :param require_positives: if nonzero, reject any target entity
    with fewer than this many true positives

    :param any_up: if True, keep a (stream_id, target_id) pair when
    *any* assessor rated it at or above thresh; by default a pair is
    downgraded if *any* assessor rated it below thresh

    :param restricted_entity_list: a list of target_id strings that
    are the only ones allowed in the annotation

    '''
    assert -1 <= thresh <= 2, thresh

    annotation_file = csv.reader(open(path_to_annotation_file, 'r'),
                                 delimiter='\t')

    annotation = dict()
    for row in annotation_file:
        ## Skip comments
        if row[0][0] == "#":
            continue

        stream_id = row[2]
        target_id = row[3]
        rating = int(row[5])
        assert -1 <= rating <= 2, rating

        if len(row) == 12:
            ## only the later versions of the truth data carried this
            ## twelfth column for excluding documents with insufficient
            ## clean_visible text to be judged.  We use a default cutoff
            ## of 100 bytes, which removes these counts:
            #              (stream_id, target_id) pairs:  34921 above, and 15767 below 100 bytes of clean_visible
            # (assessor_id, stream_id, target_id) pairs:  47446 above, and 19948 below 100 bytes of clean_visible
            len_clean_visible = int(row[11])
            if len_clean_visible < min_len_clean_visible:
                log('excluding stream_id=%s for len(clean_visible)=%d' %
                    (stream_id, len_clean_visible))
                continue

        if reject(target_id):
            log('excluding truth data for %s' % target_id)
            continue

        if restricted_entity_list and target_id not in restricted_entity_list:
            log('not in restricted_entity_list: %s' % target_id)
            continue

        ## Add the (stream_id, target_id) pair to the annotation dict;
        ## the stored value is True when the rating meets the threshold
        ## and False when it does not

        if (stream_id, target_id) in annotation:
            ## if rating is below threshold, then some assessor viewed
            ## it as not good enough, so be conservative and downgrade
            if not any_up and rating < thresh:
                ## default any_up=False means that if *any* assessor
                ## voted *against* the assertion, then *exclude* it
                annotation[(stream_id, target_id)] = False

            elif any_up and rating >= thresh:
                ## any_up means that if *any* assessor voted *for* the
                ## assertion, then *include* it
                annotation[(stream_id, target_id)] = True

        else:
            ## store bool values in the annotation index
            annotation[(stream_id, target_id)] = rating >= thresh

    has_true = Counter()
    for (stream_id, target_id), is_true in annotation.items():
        if is_true:
            has_true[target_id] += 1

    if require_positives:
        for stream_id, target_id in annotation.keys():
            if has_true[target_id] < require_positives:
                log('rejecting %s for too few true positives: %d < %d = require_positives'\
                        % (target_id, has_true[target_id], require_positives))
                annotation.pop((stream_id, target_id))

    log('%d target_ids have at least one true positive' % len(has_true))

    num_true = sum(map(int, annotation.values()))
    log('loaded annotation to create a dict of %d (stream_id, target_id) pairs with %d True'
        % (len(annotation), num_true))
    if num_true == 0:
        sys.exit('found no true positives given the filters')
    return annotation
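
## A minimal sketch of a tab-separated truth-data row, showing only the
## columns load_annotation reads; the values and the columns marked
## 'unused' are made up for illustration.
sketch_truth_row = ['assessor_1',          ## row[0]: only checked for a leading '#'
                    'unused',
                    '1330000000-abc123',   ## row[2]: stream_id
                    'https://kb.example.org/Example_Entity',  ## row[3]: target_id
                    'unused',
                    '2',                   ## row[5]: rating in [-1, 2]
                    'unused', 'unused', 'unused', 'unused', 'unused',
                    '250']                 ## row[11]: len(clean_visible), 12-column files only
assert len(sketch_truth_row) == 12
sketch_keep = int(sketch_truth_row[5]) >= 2   ## kept when rating >= thresh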