def process_run(args, run_file_name, annotation, description, thresh):
    '''
    compute scores and generate output files for a single run

    :returns dict: max_scores for this one run
    '''
    ## Generate confusion matrices from a run for each target_id
    ## and for each step of the confidence cutoff
    stats = build_confusion_matrix(
        os.path.join(args.run_dir, run_file_name) + '.gz',
        annotation, args.cutoff_step, args.unan_is_true,
        args.include_training, thresh=thresh,
        require_positives=args.require_positives,
        debug=args.debug)

    compile_and_average_performance_metrics(stats)

    max_scores = find_max_scores(stats)

    log(json.dumps(stats, indent=4, sort_keys=True))

    base_output_filepath = os.path.join(
        args.run_dir, run_file_name + '-' + description)

    output_filepath = base_output_filepath + '.csv'
    write_performance_metrics(output_filepath, stats)

    ## Output a graph of the key performance statistics
    graph_filepath = base_output_filepath + '.png'
    write_graph(graph_filepath, stats)

    return max_scores
def score_all_runs(args, description, reject):
    '''
    score all the runs in the specified runs dir using the various
    filters and configuration settings

    :param description: string used for file names
    :param reject: callable that rejects truth data
    '''
    if args.include_neutral:
        thresh = 0
    elif args.include_useful:
        thresh = 1
    else:
        thresh = 2

    ## Load in the annotation data
    annotation = load_annotation(args.annotation, thresh,
                                 args.min_len_clean_visible, reject,
                                 require_positives=args.require_positives)
    log('This assumes that all run file names end in .gz')

    #import gc
    #from guppy import hpy
    #hp = hpy()
    run_count = 0
    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            continue
        if args.run_name_filter and not run_file.startswith(args.run_name_filter):
            continue

        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log('processing: %s.gz' % run_file_name)

        max_scores = process_run(args, run_file_name, annotation,
                                 description, thresh)

        ## split into team name and create stats file
        team_id, system_id = run_file_name.split('-')
        team_scores[team_id][system_id] = max_scores

        #gc.collect()
        #log(str(hp.heap()))
        run_count += 1
        #if run_count > 2:
        #    break

    ## When the folder is finished running, output a high-level summary
    ## of the scores to overview.csv
    write_team_summary(description, team_scores)
def score_confusion_matrix_OVERLAP(CM, DOCS_TPs, annotation, positives,
                                   cutoff_step_size=50,
                                   unannotated_is_TN=False, debug=False):
    '''
    construct OVERLAP_TPs by excluding from DOCS_TPs those assertions
    that do not overlap any string identified by an assessor
    '''
    cutoffs = range(0, 999, cutoff_step_size)
    OVERLAP_TPs = list()
    for rec in DOCS_TPs:
        (stream_id, target_id, conf, rating, contains_mention, date_hour,
         slot_type, runs_equiv_id, start_byte, end_byte) = rec
        start_byte = int(start_byte)
        end_byte = int(end_byte)

        for true_equiv_id, equiv_class in annotation[stream_id][target_id][slot_type].items():
            offsets = equiv_class['stream_ids'][stream_id][1]
            overlaps = False
            for offset in offsets:
                assert isinstance(offset[0], int)
                assert isinstance(offset[1], int)
                if start_byte <= offset[1] and end_byte >= offset[0]:
                    overlaps = True
                    break
                log('(%d, %d) compared to offsets %r\n'
                    % (start_byte, end_byte, offsets))

            if not overlaps:
                increment_CM(False, conf, cutoffs, CM, OVERLAP,
                             unannotated_is_TN)
                continue

            #log('found one!! system equiv_id (%r) --> assessors equiv_id (%r)'
            #    % (runs_equiv_id, true_equiv_id))
            rec = list(rec)
            rec[7] = (runs_equiv_id, true_equiv_id)
            rec = tuple(rec)
            OVERLAP_TPs.append(rec)
            increment_CM(True, conf, cutoffs, CM, OVERLAP,
                         unannotated_is_TN)

    correct_FN(CM, OVERLAP, positives)

    return CM, OVERLAP_TPs
def load_annotation(path_to_annotation_file, include_useful, include_neutral,
                    min_len_clean_visible, reject):
    '''
    Loads the annotation file into a dict

    path_to_annotation_file: string filesystem path to the annotation file
    include_useful: true to include docs marked useful and vital
    reject: callable that returns boolean given a target_id
    '''
    annotation_file = csv.reader(open(path_to_annotation_file, 'r'),
                                 delimiter='\t')

    annotation = dict()
    for row in annotation_file:
        ## Skip comments
        if row[0][0] == "#":
            continue

        stream_id = row[2]
        target_id = row[3]
        rating = int(row[5])

        if len(row) == 12:
            ## only the later versions of the truth data carried this
            ## twelfth column for excluding documents with insufficient
            ## clean_visible to be judged.  We use a default cutoff of
            ## 100 bytes, which means removing these counts below:
            # (stream_id, target_id) pairs: 34921 above, and 15767 below 100 bytes of clean_visible
            # (assessor_id, stream_id, target_id) pairs: 47446 above, and 19948 below 100 bytes of clean_visible
            len_clean_visible = int(row[11])
            if len_clean_visible < min_len_clean_visible:
                log('excluding stream_id=%s for len(clean_visible)=%d'
                    % (stream_id, len_clean_visible))
                continue

        if reject(target_id):
            log('excluding truth data for %s' % target_id)
            continue

        if include_neutral:
            thresh = 0
        elif include_useful:
            thresh = 1
        else:
            thresh = 2

        ## Add the stream_id and target_id to a hashed dictionary
        ## 0 means that it is not vital, 1 means that it is vital
        if (stream_id, target_id) in annotation:
            ## 2 means the annotators gave it a yes for vitality
            if rating < thresh:
                annotation[(stream_id, target_id)] = False
        else:
            ## store bool values in the annotation index
            annotation[(stream_id, target_id)] = rating >= thresh

    num_true = sum(map(int, annotation.values()))
    log('loaded annotation to create a dict of %d (stream_id, target_id) '
        'pairs with %d True' % (len(annotation), num_true))
    if num_true == 0:
        sys.exit('found no true positives given the filters')

    return annotation
def score_confusion_matrix_DATE_HOUR(CM, FILL_TPs, annotation, positives,
                                     cutoff_step_size=50,
                                     unannotated_is_TN=False, debug=False):
    '''
    construct DATE_HOUR_TPs by excluding from FILL_TPs those assertions
    that happen after the first one
    '''
    cutoffs = range(0, 999, cutoff_step_size)

    ## FILL_TPs are already in date_hour order, so we only have to
    ## count the first one for each equiv_id
    seen = set()
    DATE_HOUR_TPs = list()
    log('considering %d unique DATE_HOUR assertions' % len(FILL_TPs))
    for rec in FILL_TPs:
        (stream_id, target_id, conf, rating, contains_mention, date_hour,
         slot_type, equiv_id, start_byte, end_byte) = rec

        if positives[DATE_HOUR].get(target_id, 0) == 0:
            log('ignoring assertion on entity for which no DATE_HOUR '
                'positives are known: %s' % target_id)
            continue

        if equiv_id in seen:
            increment_CM(False, conf=conf, cutoffs=cutoffs, CM=CM,
                         mode=DATE_HOUR, target_id=target_id,
                         unannotated_is_TN=unannotated_is_TN)
            continue

        ## this way of filtering is inadequate -- should be giving
        ## partial credit for finding slot fill late
        seen.add(equiv_id)
        DATE_HOUR_TPs.append(rec)
        increment_CM(True, conf=conf, cutoffs=cutoffs, CM=CM,
                     mode=DATE_HOUR, target_id=target_id,
                     unannotated_is_TN=unannotated_is_TN)

        for cutoff in CM[DATE_HOUR][target_id]:
            ## Then subtract the number of TP at each cutoff
            ## (since FN+TP==True things in annotation set)
            CM[DATE_HOUR][target_id][cutoff]['FN'] = \
                positives[DATE_HOUR][target_id] \
                - CM[DATE_HOUR][target_id][cutoff]['TP']

    log('considering %d DATE_HOUR_TPs' % len(DATE_HOUR_TPs))

    return CM, DATE_HOUR_TPs
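## increment_CM() is called throughout the SSF scorers but is not defined in
## this section.  The helper below is only an illustrative sketch inferred
## from the keyword call sites above and from the inline TP/FP/TN logic in
## build_confusion_matrix; the real implementation may differ (for example,
## in how it treats the tri-state None value passed by the FILL scorer, or
## how unannotated_is_TN is applied).
def increment_CM_sketch(is_positive, conf=None, cutoffs=None, CM=None,
                        mode=None, target_id=None, unannotated_is_TN=False):
    '''sketch only: update the 2x2 counts for one assertion at every cutoff'''
    for cutoff in cutoffs:
        if conf > cutoff:
            if is_positive:
                CM[mode][target_id][cutoff]['TP'] += 1
            else:
                CM[mode][target_id][cutoff]['FP'] += 1
        else:
            ## below the cutoff the assertion is effectively suppressed;
            ## negatives become TN, and FN is repaired later by correct_FN
            if not is_positive:
                CM[mode][target_id][cutoff]['TN'] += 1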
def load_annotation(path_to_annotation_file, reject, slot_type_filter=None):
    '''
    Loads the SSF truth data from its JSON format on disk

    path_to_annotation_file: string file system path to the JSON annotation file
    reject: callable that returns boolean given a target_id
    '''
    try:
        native_annotation = json.load(open(path_to_annotation_file))
    except Exception, exc:
        sys.exit('failed to open %r:\n%s'
                 % (path_to_annotation_file, traceback.format_exc(exc)))

    for target_id in native_annotation.keys():
        if reject(target_id):
            log('excluding truth data for %s' % target_id)
            native_annotation.pop(target_id)

    ## invert the annotation file to have a stream_id index pointing
    ## to target_ids pointing to slot_types pointing to slot fills,
    ## instead of the reverse
    annotation = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
    for target_id, slots in native_annotation.items():
        for slot_type, fills in slots.items():
            if slot_type_filter and slot_type != slot_type_filter:
                log('excluding truth data for %s' % slot_type)
                continue
            elif slot_type in ['SignificantOther', 'Children']:
                log('excluding truth data for %s because not part of official '
                    'slot inventory. To score this, use --slot-type' % slot_type)
                continue
def score_confusion_matrix(path_to_run_file, annotation, cutoff_step,
                           unannotated_is_TN, include_training, debug):
    '''
    This function generates the confusion matrix (number of
    true/false positives and true/false negatives).

    path_to_run_file: str, a filesystem link to the run submission
    annotation: dict, containing the annotation data
    cutoff_step: int, increment between cutoffs
    unannotated_is_TN: boolean, true to count unannotated as negatives
    include_training: boolean, true to include training documents

    returns a confusion matrix dictionary for each target_id
    '''
    ## Open the run file
    if path_to_run_file.endswith('.gz'):
        run_file = gzip.open(path_to_run_file, 'r')
    else:
        run_file = open(path_to_run_file, 'r')

    ## Create a dictionary containing the confusion matrix (CM)
    cutoffs = range(0, 999, cutoff_step)
    CM = dict()

    ## count the total number of assertions per entity
    num_assertions = {}

    ## Iterate through every row of the run
    for onerow in run_file:
        ## Skip comments
        if onerow.startswith('#') or len(onerow.strip()) == 0:
            continue

        row = onerow.split()
        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]
        score = int(float(row[4]))

        if target_id not in num_assertions:
            num_assertions[target_id] = {'total': 0,
                                         'in_TTR': 0,
                                         'in_ETR': 0,
                                         'in_annotation_set': 0}

        ## keep track of total number of assertions per entity
        num_assertions[target_id]['total'] += 1
        if timestamp <= END_OF_FEB_2012:
            num_assertions[target_id]['in_TTR'] += 1
        else:
            num_assertions[target_id]['in_ETR'] += 1

        ## If the entity has not been seen yet, create a confusion matrix for it
        if not target_id in CM:
            CM[target_id] = dict()
            for cutoff in cutoffs:
                CM[target_id][cutoff] = dict(TP=0, FP=0, FN=0, TN=0)

        if (not include_training) and (timestamp <= END_OF_FEB_2012):
            continue

        in_annotation_set = (stream_id, target_id) in annotation

        if in_annotation_set:
            num_assertions[target_id]['in_annotation_set'] += 1

        ## In the annotation set and useful
        if in_annotation_set and annotation[(stream_id, target_id)]:
            for cutoff in cutoffs:
                if score > cutoff:
                    ## If above the cutoff: true-positive
                    CM[target_id][cutoff]['TP'] += 1

        ## In the annotation set and non-useful
        elif in_annotation_set and not annotation[(stream_id, target_id)]:
            for cutoff in cutoffs:
                if score > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1

        ## Not in the annotation set, so it is a negative (if flag is true)
        elif unannotated_is_TN:
            for cutoff in cutoffs:
                if score > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1

    ## Correct FN for things in the annotation set that are NOT in the run.
    ## First, calculate the number of true things in the annotation set
    annotation_positives = defaultdict(int)
    for key in annotation:
        stream_id = key[0]
        timestamp = int(stream_id.split('-')[0])
        if (not include_training) and (timestamp <= 1325375999):
            continue
        target_id = key[1]
        annotation_positives[target_id] += annotation[(stream_id, target_id)]

    for target_id in CM:
        for cutoff in CM[target_id]:
            ## Then subtract the number of TP at each cutoff
            ## (since FN+TP==True things in annotation set)
            CM[target_id][cutoff]['FN'] = \
                annotation_positives[target_id] - CM[target_id][cutoff]['TP']

    if debug:
        log('showing assertion counts:')
        log(json.dumps(num_assertions, indent=4, sort_keys=True))

    return CM
def score_all_runs(args, description, reject):
    '''
    score all the runs in the specified runs dir using the various
    filters and configuration settings

    :param description: string used for file names
    :param reject: callable that rejects truth data
    '''
    if args.include_neutral:
        thresh = 0
    elif args.include_useful:
        thresh = 1
    else:
        thresh = 2

    ## Load in the annotation data
    annotation = load_annotation(args.annotation, thresh,
                                 args.min_len_clean_visible, reject,
                                 require_positives=args.require_positives)
    log('This assumes that all run file names end in .gz')

    annotationWriter = open('validassessments.csv', 'w')
    for ((stream_id, target_id), is_pos) in annotation.iteritems():
        #'dde6ec 1332929640-c50cda6bee1564a599ae620d8918382e http://en.wikipedia.org/wiki/Atacocha 1000 1 1332929640'
        timestamp = int(stream_id.split('-')[0])
        assessment = 1 if is_pos else 0
        annotationWriter.write('reserved\t%s\t%s\t1000\t%d\t%s\n'
                               % (stream_id, target_id, assessment, timestamp))
    annotationWriter.close()

    #import gc
    #from guppy import hpy
    #hp = hpy()
    run_count = 0
    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            print 'ignoring %s because it does not end in .gz' % run_file
            continue
        if args.run_name_filter and not run_file.startswith(args.run_name_filter):
            print 'filename filter set to %s, but does not match %s' \
                % (args.run_name_filter, run_file)
            continue

        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log('processing: %s.gz' % run_file_name)
        print('processing: %s.gz' % run_file_name)

        max_scores = process_run(args, run_file_name, annotation,
                                 description, thresh)

        ## split into team name and create stats file
        team_id, system_id = run_file_name.split('-')
        team_scores[team_id][system_id] = max_scores

        #gc.collect()
        #log(str(hp.heap()))
        run_count += 1
        #if run_count > 2:
        #    break

    ## When the folder is finished running, output a high-level summary
    ## of the scores to overview.csv
    write_team_summary(description, team_scores)
def build_confusion_matrix(path_to_run_file, annotation, cutoff_step,
                           unannotated_is_TN, include_training, debug,
                           thresh=2, require_positives=False):
    '''
    This function generates the confusion matrix (number of
    true/false positives and true/false negatives).

    path_to_run_file: str, a filesystem link to the run submission
    annotation: dict, containing the annotation data
    cutoff_step: int, increment between cutoffs
    unannotated_is_TN: boolean, true to count unannotated as negatives
    include_training: boolean, true to include training documents

    returns a confusion matrix dictionary for each target_id
    '''
    ## Open the run file
    if path_to_run_file.endswith('.gz'):
        run_file = gzip.open(path_to_run_file, 'r')
    else:
        run_file = open(path_to_run_file, 'r')

    ## Create a dictionary containing the confusion matrix (CM)
    cutoffs = range(0, 999, cutoff_step)
    CM = dict()

    ## count the total number of assertions per entity
    num_assertions = {}

    num_positives = defaultdict(int)
    for (stream_id, target_id), is_positive in annotation.items():
        ## compute total counts of number of positives for each target_id
        if is_positive:
            num_positives[target_id] += 1

        ## make sure that the confusion matrix has entries for all entities
        if target_id not in CM:
            CM[target_id] = dict()
            for cutoff in cutoffs:
                CM[target_id][cutoff] = dict(TP=0, FP=0, FN=0, TN=0)

    ## Iterate through every row of the run and construct a
    ## de-duplicated run summary
    run_set = dict()
    for onerow in run_file:
        ## Skip comments
        if onerow.startswith('#') or len(onerow.strip()) == 0:
            continue

        row = onerow.split()
        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]

        conf = int(float(row[4]))
        assert 0 < conf <= 1000
        row[4] = conf

        rating = int(row[5])
        assert -1 <= rating <= 2
        row[5] = rating

        #log('ratings: %r <?> %r' % (rating, thresh))
        if rating < thresh:
            log('ignoring assertion below the rating threshold: %r < %r'
                % (rating, thresh))
            continue

        if require_positives and num_positives.get(target_id, 0) == 0:
            log('ignoring assertion on entity for which no CCR positives '
                'are known: %s' % target_id)
            continue

        assertion_key = (stream_id, target_id)
        if assertion_key in run_set:
            other_row = run_set[assertion_key]
            if other_row[4] > conf:
                log('ignoring a duplicate row with lower conf: %d > %d'
                    % (other_row[4], conf))
                continue
            if other_row[4] == conf:
                ## compare rating level
                if other_row[5] != rating:
                    log('same conf, different rating:\n%r\n%r\ntaking higher rating'
                        % (row, other_row))
                ## accept higher rating
                if other_row[5] > rating:
                    continue

        #log('got a row: %r' % (row,))
        run_set[assertion_key] = row

    log('considering %d assertions' % len(run_set))

    run_set = run_set.values()
    while run_set:
        row = run_set.pop()
        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]
        conf = row[4]
        rating = row[5]

        if target_id not in num_assertions:
            num_assertions[target_id] = {'total': 0,
                                         'in_TTR': 0,
                                         'in_ETR': 0,
                                         'in_annotation_set': 0}

        ## keep track of total number of assertions per entity
        num_assertions[target_id]['total'] += 1
        if timestamp <= END_OF_FEB_2012:
            num_assertions[target_id]['in_TTR'] += 1
        else:
            num_assertions[target_id]['in_ETR'] += 1

        if (not include_training) and (timestamp <= END_OF_FEB_2012):
            continue

        in_annotation_set = (stream_id, target_id) in annotation

        if in_annotation_set:
            num_assertions[target_id]['in_annotation_set'] += 1

        ## In the annotation set and useful
        if in_annotation_set and annotation[(stream_id, target_id)]:
            for cutoff in cutoffs:
                if conf > cutoff:
                    ## If above the cutoff: true-positive
                    CM[target_id][cutoff]['TP'] += 1

        ## In the annotation set and non-useful
        elif in_annotation_set and not annotation[(stream_id, target_id)]:
            for cutoff in cutoffs:
                if conf > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1

        ## Not in the annotation set, so it is a negative (if flag is true)
        elif unannotated_is_TN:
            for cutoff in cutoffs:
                if conf > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1

    ## Correct FN for things in the annotation set that are NOT in the run.
    ## First, calculate the number of true things in the annotation set
    annotation_positives = defaultdict(int)
    for stream_id, target_id in annotation:
        timestamp = int(stream_id.split('-')[0])
        if (not include_training) and (timestamp <= END_OF_FEB_2012):
            continue
        annotation_positives[target_id] += int(annotation[(stream_id, target_id)])

    for target_id in CM:
        for cutoff in CM[target_id]:
            ## Then subtract the number of TP at each cutoff
            ## (since FN+TP==True things in annotation set)
            #log('annotation_positives[%s] = %d' % (target_id, annotation_positives[target_id]))
            #log('CM[%s][cutoff=%d] = %r' % (target_id, cutoff, CM[target_id][cutoff]))
            CM[target_id][cutoff]['FN'] = \
                annotation_positives[target_id] - CM[target_id][cutoff]['TP']
            #log('CM[%s][cutoff=%d] = %r' % (target_id, cutoff, CM[target_id][cutoff]))
            assert annotation_positives[target_id] >= CM[target_id][cutoff]['TP'], \
                "how did we get more TPs than available: " \
                "annotation_positives[target_id=%s] = %d >= %d = CM[target_id][cutoff=%d]['TP']" \
                % (target_id, annotation_positives[target_id],
                   CM[target_id][cutoff]['TP'], cutoff)

    log('showing assertion counts:')
    log(json.dumps(num_assertions, indent=4, sort_keys=True))

    return CM
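## compile_and_average_performance_metrics() consumes the confusion matrices
## built above but is not shown in this section.  The helper below is only a
## minimal sketch of deriving precision, recall, and F_1 from a single
## CM[target_id][cutoff] cell using the standard definitions; the project's
## actual metric code may differ (for example, it also reports scaled
## utility, SU).
def prf_sketch(cell):
    '''sketch only: cell is a dict with keys TP, FP, FN, TN'''
    TP, FP, FN = cell['TP'], cell['FP'], cell['FN']
    P = float(TP) / (TP + FP) if (TP + FP) else 0.0
    R = float(TP) / (TP + FN) if (TP + FN) else 0.0
    F = 2 * P * R / (P + R) if (P + R) else 0.0
    return dict(P=P, R=R, F=F)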
def load_annotation(path_to_annotation_file, thresh, min_len_clean_visible,
                    reject, require_positives=False, any_up=False):
    '''
    Loads the annotation file into a dict

    path_to_annotation_file: string filesystem path to the annotation file

    :param thresh: ratings at or above this value are treated as positive

    :param min_len_clean_visible: minimum length of the clean_visible,
    which is in the 12th column of the expanded truth data file

    :param reject: callable that returns boolean given a target_id

    :param require_positives: if set to True, reject any target entity
    for which no true positives exist.
    '''
    assert -1 <= thresh <= 2, thresh

    annotation_file = csv.reader(open(path_to_annotation_file, 'r'),
                                 delimiter='\t')

    annotation = dict()
    for row in annotation_file:
        ## Skip comments
        if row[0][0] == "#":
            continue

        stream_id = row[2]
        target_id = row[3]
        rating = int(row[5])
        assert -1 <= rating <= 2, rating

        if len(row) == 12:
            ## only the later versions of the truth data carried this
            ## twelfth column for excluding documents with insufficient
            ## clean_visible to be judged.  We use a default cutoff of
            ## 100 bytes, which means removing these counts below:
            # (stream_id, target_id) pairs: 34921 above, and 15767 below 100 bytes of clean_visible
            # (assessor_id, stream_id, target_id) pairs: 47446 above, and 19948 below 100 bytes of clean_visible
            len_clean_visible = int(row[11])
            if len_clean_visible < min_len_clean_visible:
                log('excluding stream_id=%s for len(clean_visible)=%d'
                    % (stream_id, len_clean_visible))
                continue

        if reject(target_id):
            log('excluding truth data for %s' % target_id)
            continue

        ## Add the stream_id and target_id to a hashed dictionary
        ## 0 means that it is not vital, 1 means that it is vital
        if (stream_id, target_id) in annotation:
            ## if rating is below threshold, then some assessor viewed
            ## it as not good enough, so be conservative and downgrade
            if not any_up and rating < thresh:
                ## default any_up=False means that if *any* assessor
                ## voted *against* the assertion, then *exclude* it
                annotation[(stream_id, target_id)] = False
            elif any_up and rating >= thresh:
                ## any_up means that if *any* assessor voted *for* the
                ## assertion, then *include* it
                annotation[(stream_id, target_id)] = True
        else:
            ## store bool values in the annotation index
            annotation[(stream_id, target_id)] = rating >= thresh

    has_true = set()
    for (stream_id, target_id), is_true in annotation.items():
        if is_true:
            has_true.add(target_id)

    if require_positives:
        for stream_id, target_id in annotation.keys():
            if target_id not in has_true:
                log('rejecting %s for lack of any true positives '
                    '-- because require_positives=True' % target_id)
                annotation.pop((stream_id, target_id))

    log('%d target_ids have at least one true positive' % len(has_true))

    num_true = sum(map(int, annotation.values()))
    log('loaded annotation to create a dict of %d (stream_id, target_id) '
        'pairs with %d True' % (len(annotation), num_true))
    if num_true == 0:
        sys.exit('found no true positives given the filters')

    return annotation
        help='beginning of string of filename to filter runs that get considered')
    args = parser.parse_args()

    accepted_target_ids = set()
    if args.group or args.entity_type:
        if not args.topics_path:
            sys.exit('must specify --topics-path to use --group')
        targets = json.load(open(args.topics_path))['targets']
        for targ in targets:
            if targ['group'] == args.group or targ['entity_type'] == args.entity_type:
                accepted_target_ids.add(targ['target_id'])

    description = make_description(args)

    ## construct reject callable
    def reject(target_id):
        if args.reject_twitter and 'twitter.com' in target_id:
            return True
        if args.reject_wikipedia and 'wikipedia.org' in target_id:
            return True
        if args.group or args.entity_type:
            if target_id not in accepted_target_ids:
                return True  ## i.e. reject it
        return False

    score_all_runs(args, description, reject)

    elapsed = time.time() - start_time
    log('finished after %d seconds at %r' % (elapsed, datetime.utcnow()))
def build_confusion_matrix(path_to_run_file, annotation, cutoff_step,
                           unannotated_is_TN, include_training, debug,
                           thresh=2, require_positives=0):
    '''
    This function generates the confusion matrix (number of
    true/false positives and true/false negatives).

    path_to_run_file: str, a filesystem link to the run submission
    annotation: dict, containing the annotation data from *after* the cutoff
    cutoff_step: int, increment between cutoffs
    unannotated_is_TN: boolean, true to count unannotated as negatives
    include_training: boolean, true to include training documents

    returns a confusion matrix dictionary for each target_id
    '''
    ## Open the run file
    if path_to_run_file.endswith('.gz'):
        run_file = gzip.open(path_to_run_file, 'r')
    else:
        run_file = open(path_to_run_file, 'r')

    ## Create a dictionary containing the confusion matrix (CM)
    cutoffs = range(0, 999, cutoff_step)
    CM = dict()

    ## count the total number of assertions per entity
    num_assertions = {}

    num_positives = defaultdict(int)
    for (stream_id, target_id), is_positive in annotation.items():
        ## compute total counts of number of positives for each target_id
        if is_positive:
            num_positives[target_id] += 1

        ## make sure that the confusion matrix has entries for all entities
        if target_id not in CM:
            CM[target_id] = dict()
            for cutoff in cutoffs:
                CM[target_id][cutoff] = dict(TP=0, FP=0, FN=0, TN=0)

    ## Iterate through every row of the run and construct a
    ## de-duplicated run summary
    run_set = dict()
    for onerow in run_file:
        ## Skip comments
        if onerow.startswith('#') or len(onerow.strip()) == 0:
            continue

        row = onerow.split()
        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]

        conf = int(float(row[4]))
        assert 0 < conf <= 1000
        row[4] = conf

        rating = int(row[5])
        assert -1 <= rating <= 2
        row[5] = rating

        #log('ratings: %r <?> %r' % (rating, thresh))
        if rating < thresh:
            log('ignoring assertion below the rating threshold: %r < %r'
                % (rating, thresh))
            continue

        if num_positives.get(target_id, 0) < require_positives:
            log('ignoring assertion on entity for which no CCR positives '
                'are known: %s' % target_id)
            continue

        assertion_key = (stream_id, target_id)
        if assertion_key in run_set:
            other_row = run_set[assertion_key]
            if other_row[4] > conf:
                log('ignoring a duplicate row with lower conf: %d > %d'
                    % (other_row[4], conf))
                continue
            if other_row[4] == conf:
                ## compare rating level
                if other_row[5] != rating:
                    log('same conf, different rating:\n%r\n%r\ntaking higher rating'
                        % (row, other_row))
                ## accept higher rating
                if other_row[5] > rating:
                    continue

        #log('got a row: %r' % (row,))
        run_set[assertion_key] = row

    log('considering %d assertions' % len(run_set))

    run_set = run_set.values()
    while run_set:
        row = run_set.pop()
        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]
        conf = row[4]
        rating = row[5]

        if target_id not in num_assertions:
            num_assertions[target_id] = {'total': 0,
                                         'in_ETR': 0,
                                         'in_annotation_set': 0}

        ## keep track of total number of assertions per entity
        num_assertions[target_id]['total'] += 1
        num_assertions[target_id]['in_ETR'] += 1

        in_annotation_set = (stream_id, target_id) in annotation

        if in_annotation_set:
            num_assertions[target_id]['in_annotation_set'] += 1

        ## In the annotation set and useful
        if in_annotation_set and annotation[(stream_id, target_id)]:
            for cutoff in cutoffs:
                if conf > cutoff:
                    ## If above the cutoff: true-positive
                    CM[target_id][cutoff]['TP'] += 1

        ## In the annotation set and non-useful
        elif in_annotation_set and not annotation[(stream_id, target_id)]:
            for cutoff in cutoffs:
                if conf > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1

        ## Not in the annotation set, so it is a negative (if flag is true)
        elif unannotated_is_TN:
            for cutoff in cutoffs:
                if conf > cutoff:
                    ## Above the cutoff: false-positive
                    CM[target_id][cutoff]['FP'] += 1
                else:
                    ## Below the cutoff: true-negative
                    CM[target_id][cutoff]['TN'] += 1

    ## Correct FN for things in the annotation set that are NOT in the run.
    ## First, calculate the number of true things in the annotation set
    annotation_positives = defaultdict(int)
    for stream_id, target_id in annotation:
        timestamp = int(stream_id.split('-')[0])
        annotation_positives[target_id] += int(annotation[(stream_id, target_id)])

    for target_id in CM:
        for cutoff in CM[target_id]:
            ## Then subtract the number of TP at each cutoff
            ## (since FN+TP==True things in annotation set)
            #log('annotation_positives[%s] = %d' % (target_id, annotation_positives[target_id]))
            #log('CM[%s][cutoff=%d] = %r' % (target_id, cutoff, CM[target_id][cutoff]))
            CM[target_id][cutoff]['FN'] = \
                annotation_positives[target_id] - CM[target_id][cutoff]['TP']
            #log('CM[%s][cutoff=%d] = %r' % (target_id, cutoff, CM[target_id][cutoff]))
            assert annotation_positives[target_id] >= CM[target_id][cutoff]['TP'], \
                "how did we get more TPs than available: " \
                "annotation_positives[target_id=%s] = %d >= %d = CM[target_id][cutoff=%d]['TP']" \
                % (target_id, annotation_positives[target_id],
                   CM[target_id][cutoff]['TP'], cutoff)

    log('showing assertion counts:')
    log(json.dumps(num_assertions, indent=4, sort_keys=True))

    return CM
def score_confusion_matrix_FILL(CM, OVERLAP_TPs, annotation, positives,
                                unannotated_is_TN=False,
                                cutoff_step_size=50, debug=False):
    '''
    construct FILL_TPs by excluding from OVERLAP_TPs those assertions
    that either:

      1) re-use an earlier (run)equiv_id that was not associated with
         the same (truth)equiv_id from the truth set

      2) fail to re-use an earlier (run)equiv_id that _was_ associated
         with a (truth)equiv_id from the truth set
    '''
    cutoffs = range(0, 999, cutoff_step_size)
    FILL_TPs = dict()
    runs_to_true = dict()
    true_to_runs = dict()
    log('considering %d unique FILL assertions' % len(OVERLAP_TPs))
    for rec in OVERLAP_TPs:
        (stream_id, target_id, conf, rating, contains_mention, date_hour,
         slot_type, (runs_equiv_id, true_equiv_id),
         start_byte, end_byte) = rec

        if positives[FILL].get(target_id, 0) == 0:
            log('ignoring assertion on entity for which no FILL positives '
                'are known: %s' % target_id)
            continue

        ## this is a tri-state variable
        FILL_correct = None
        if runs_equiv_id not in runs_to_true and true_equiv_id not in true_to_runs:
            runs_to_true[runs_equiv_id] = true_equiv_id
            true_to_runs[true_equiv_id] = runs_equiv_id
        else:
            ## check failure mode #1 in __doc__ string
            if runs_equiv_id in runs_to_true:
                ## run has previously asserted this equiv_id
                if true_equiv_id == runs_to_true[runs_equiv_id]:
                    FILL_correct = True
                else:
                    FILL_correct = False

            ## check failure mode #2 in __doc__ string
            if true_equiv_id in true_to_runs:
                if runs_equiv_id == true_to_runs[true_equiv_id]:
                    if FILL_correct is not False:
                        FILL_correct = True
                else:
                    FILL_correct = False

        if FILL_correct in [True, None]:
            assertion_key = (stream_id, target_id, slot_type, true_equiv_id)
            if assertion_key in FILL_TPs:
                other_row = FILL_TPs[assertion_key]
                if other_row[4] > conf:
                    log('ignoring a duplicate row with lower conf: %d > %d'
                        % (other_row[4], conf))
                    continue
            FILL_TPs[assertion_key] = rec

        increment_CM(FILL_correct, conf=conf, cutoffs=cutoffs, CM=CM,
                     mode=FILL, target_id=target_id,
                     unannotated_is_TN=unannotated_is_TN)

    correct_FN(CM, FILL, positives)

    FILL_TPs = FILL_TPs.values()

    return CM, FILL_TPs
def score_confusion_matrix_DOCS(run_file_handle, annotation, positives,
                                cutoff_step_size=50,
                                unannotated_is_TN=False, debug=False):
    '''
    read a run submission and generate a confusion matrix (number of
    true/false positives and true/false negatives) for DOCS mode
    evaluation.  Generate a confusion matrix for each cutoff step and
    each mode.

    run_file_handle: open file handle for the run submission
    annotation: dict, containing the annotation data
    cutoff_step_size: int, increment between cutoffs
    unannotated_is_TN: boolean, true to count unannotated as negatives

    returns a confusion matrix dictionary for each target_id
    '''
    ## Create a dictionary containing the confusion matrix (CM)
    cutoffs = range(0, 999, cutoff_step_size)

    def init_confusion_matrix():
        return dict(TP=0, FP=0, FN=0, TN=0)

    ## confusion matrix is mode-->target_id-->cutoff-->2-by-2 matrix
    CM = {mode: defaultdict(lambda: defaultdict(init_confusion_matrix))
          for mode in MODES}

    for stream_id in annotation:
        for target_id in annotation[stream_id]:
            for mode in MODES:
                ## make sure that the confusion matrix has entries for all entities
                if target_id not in CM[mode]:
                    CM[mode][target_id] = dict()
                    for cutoff in cutoffs:
                        CM[mode][target_id][cutoff] = dict(TP=0, FP=0, FN=0, TN=0)

    ## count the total number of assertions per entity
    num_assertions = {}

    ## keep assertions that are in the annotation set, because this is
    ## much smaller than the entire run submission.  We will pass this
    ## to the four evaluation steps beyond DOCS.
    DOCS_TPs = list()

    ## Iterate through every row of the run and construct a
    ## de-duplicated run summary
    run_set = dict()
    for assertion_key, row in assertions(run_file_handle):
        conf = row[4]
        stream_id, target_id, slot_type = assertion_key

        if positives[DOCS].get(target_id, 0) == 0:
            #log('ignoring assertion on entity for which no DOCS positives are known: %s' % target_id)
            continue

        if assertion_key in run_set:
            other_row = run_set[assertion_key]
            if other_row[4] > conf:
                log('ignoring a duplicate row with lower conf: %d > %d'
                    % (other_row[4], conf))
                continue

        #log('got a row: %r' % (row,))
        run_set[assertion_key] = row

    log('considering %d unique DOCS assertions' % len(run_set))

    for row in run_set.values():
        stream_id = row[2]
        timestamp = int(stream_id.split('-')[0])
        target_id = row[3]
        conf = row[4]
        rating = row[5]
        contains_mention = int(row[6])
        date_hour = row[7]
        slot_type = row[8]
        equiv_id = row[9]
        start_byte, end_byte = row[10].split('-')
        start_byte = int(start_byte)
        end_byte = int(end_byte)

        if target_id not in num_assertions:
            num_assertions[target_id] = {'total': 0,
                                         'is_annotated_TP': 0}

        ## keep track of total number of assertions per entity
        num_assertions[target_id]['total'] += 1

        ## all modes start with DOCS, so is_annotated_TP means that
        ## the system has a DOCS-TP above some conf threshold
        is_annotated_TP = False
        if stream_id in annotation:
            if target_id in annotation[stream_id]:
                if slot_type in annotation[stream_id][target_id]:
                    is_annotated_TP = True
                    rec = (stream_id, target_id, conf, rating, contains_mention,
                           date_hour, slot_type, equiv_id, start_byte, end_byte)
                    DOCS_TPs.append(rec)
                    #log('TP: %r' % (rec,))

        if is_annotated_TP:
            num_assertions[target_id]['is_annotated_TP'] += 1

        increment_CM(is_annotated_TP, conf=conf, cutoffs=cutoffs, CM=CM,
                     mode=DOCS, target_id=target_id,
                     unannotated_is_TN=unannotated_is_TN)

    correct_FN(CM, DOCS, positives)

    if debug:
        print 'showing assertion counts:'
        print json.dumps(num_assertions, indent=4, sort_keys=True)

    ## sort by date_hour
    DOCS_TPs.sort(key=itemgetter(5))

    return CM, DOCS_TPs
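## assertions() is used above but not defined in this section.  The generator
## below is only a sketch inferred from the older inline row parser that
## appears later in this file (the earlier score_confusion_matrix_DOCS
## variant): it yields a (stream_id, target_id, slot_type) key plus the row
## with conf and rating coerced to ints.  The real helper may do more
## validation.
def assertions_sketch(run_file_handle):
    for onerow in run_file_handle:
        ## Skip comments and blank lines
        if onerow.startswith('#') or len(onerow.strip()) == 0:
            continue
        row = onerow.split()
        assert len(row) == 11, row
        row[4] = int(float(row[4]))   # conf
        row[5] = int(row[5])          # rating
        stream_id, target_id, slot_type = row[2], row[3], row[8]
        yield (stream_id, target_id, slot_type), row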
def score_all_runs(args, description, reject):
    '''
    score all the runs in the specified runs dir using the various
    filters and configuration settings

    :param description: string used for file names
    :param reject: callable that rejects truth data
    '''
    ## Load in the annotation data
    annotation = load_annotation(args.annotation, args.include_useful,
                                 args.include_neutral,
                                 args.min_len_clean_visible, reject)
    log('This assumes that all run file names end in .gz')

    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            continue

        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log('processing: %s.gz' % run_file_name)

        ## Generate the confusion matrix for a run
        CM = score_confusion_matrix(
            os.path.join(args.run_dir, run_file),
            annotation, args.cutoff_step, args.unan_is_true,
            args.include_training, debug=args.debug)

        ## Generate performance metrics for a run
        Scores = performance_metrics(CM)

        ## Generate the average metrics
        (CM['average'], Scores['average']) = full_run_metrics(
            CM, Scores, args.use_micro_averaging)

        max_scores = find_max_scores(Scores)

        ## split into team name and create stats file
        team_id, system_id = run_file_name.split('-')
        team_scores[team_id][system_id] = max_scores

        ## Print the top F-Score
        log('  max(avg(F_1)): %.3f' % max_scores['average']['F'])
        log('  max(F_1(avg(P), avg(R))): %.3f' % max_scores['average']['F_recomputed'])
        log('  max(avg(SU)): %.3f' % max_scores['average']['SU'])

        base_output_filepath = os.path.join(
            args.run_dir, run_file_name + '-' + description)

        output_filepath = base_output_filepath + '.csv'
        write_performance_metrics(output_filepath, CM, Scores)
        log('  wrote metrics table to %s' % output_filepath)

        if not plt:
            log('  not generating plot, because could not import matplotlib')
        else:
            ## Output a graph of the key performance statistics
            graph_filepath = base_output_filepath + '.png'
            write_graph(graph_filepath, Scores['average'])
            log('  wrote plot image to %s' % graph_filepath)

    ## When the folder is finished running, output a high-level summary
    ## of the scores to overview.csv
    write_team_summary(description, team_scores)
                    pooled_assertion_keys=None):
    '''
    Loads the SSF truth data from its JSON format on disk

    path_to_annotation_file: string file system path to the JSON annotation file
    reject: callable that returns boolean given a target_id
    '''
    try:
        native_annotation = json.load(open(path_to_annotation_file))
    except Exception, exc:
        sys.exit('failed to open %r:\n%s'
                 % (path_to_annotation_file, traceback.format_exc(exc)))

    for target_id in native_annotation.keys():
        if reject(target_id):
            log('excluding truth data for %s' % target_id)
            native_annotation.pop(target_id)

    ## invert the annotation file to have a stream_id index pointing
    ## to target_ids pointing to slot_types pointing to slot fills,
    ## instead of the reverse
    # stream_id --> target_id --> slot_types
    annotation = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

    unofficial_slots = ['SignificantOther', 'Children']

    for target_id, slots in native_annotation.items():
        for slot_type, fills in slots.items():
            if slot_type_filter and slot_type != slot_type_filter:
                log('excluding truth data for %s' % slot_type)
                continue
def score_all_runs(args, description, reject):
    '''
    score all the runs in the specified runs dir using the various
    filters and configuration settings

    :param description: string used for file names
    :param reject: callable that rejects truth data
    '''
    if args.include_neutral:
        thresh = 0
    elif args.include_useful:
        thresh = 1
    else:
        thresh = 2

    ## Load in the annotation data
    annotation = load_annotation(
        args.annotation, thresh, args.min_len_clean_visible, reject,
        require_positives=args.require_positives,
        any_up=args.any_up,
        restricted_entity_list=args.restricted_entity_list,
    )
    log('This assumes that all run file names end in .gz')

    #import gc
    #from guppy import hpy
    #hp = hpy()
    run_count = 0
    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            continue
        if args.run_name_filter and not run_file.startswith(args.run_name_filter):
            continue

        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log('processing: %s.gz' % run_file_name)

        try:
            max_scores = process_run(args, run_file_name, annotation,
                                     description, thresh)

            ## split into team name and create stats file
            team_id, system_id = run_file_name.split('-')
            team_scores[team_id][system_id] = max_scores

        except Exception, exc:
            logger.critical('died on %s', run_file_name, exc_info=True)
            sys.exit(str(exc))

        #gc.collect()
        #log(str(hp.heap()))
        run_count += 1
    accepted_target_ids = set()
    if args.group or args.entity_type:
        if not args.topics_path:
            sys.exit('must specify --topics-path to use --group')
        targets = json.load(open(args.topics_path))['targets']
        for targ in targets:
            if ('group' in targ and targ.get('group') == args.group) \
                    or targ['entity_type'] == args.entity_type:
                accepted_target_ids.add(targ['target_id'])

    if args.restricted_entity_list:
        args.restricted_entity_list = set(
            open(args.restricted_entity_list).read().splitlines())
        log('loaded %d entities into restricted_entity_list:\n%s'
            % (len(args.restricted_entity_list),
               '\n'.join(args.restricted_entity_list)))

    description = make_description(args)

    ## construct reject callable
    def reject(target_id):
        if args.reject_twitter and 'twitter.com' in target_id:
            return True
        if args.reject_wikipedia and 'wikipedia.org' in target_id:
            return True
        if args.group or args.entity_type:
            if target_id not in accepted_target_ids:
                return True  ## i.e. reject it
        return False
def score_confusion_matrix_DOCS(run_file_handle, annotation, positives,
                                cutoff_step_size=50,
                                unannotated_is_TN=False, debug=False):
    '''
    read a run submission and generate a confusion matrix (number of
    true/false positives and true/false negatives) for DOCS mode
    evaluation.  Generate a confusion matrix for each cutoff step and
    each mode.

    run_file_handle: open file handle for the run submission
    annotation: dict, containing the annotation data
    cutoff_step_size: int, increment between cutoffs
    unannotated_is_TN: boolean, true to count unannotated as negatives

    returns a confusion matrix dictionary for each target_id
    '''
    ## Create a dictionary containing the confusion matrix (CM)
    cutoffs = range(0, 999, cutoff_step_size)

    def init_confusion_matrix():
        return dict(TP=0, FP=0, FN=0, TN=0)

    ## confusion matrix is mode-->target_id-->cutoff-->2-by-2 matrix
    CM = {mode: defaultdict(lambda: defaultdict(init_confusion_matrix))
          for mode in MODES}

    ## count the total number of assertions per entity
    num_assertions = {}

    ## keep assertions that are in the annotation set, because this is
    ## much smaller than the entire run submission.  We will pass this
    ## to the four evaluation steps beyond DOCS.
    DOCS_TPs = list()

    ## Iterate through every row of the run
    for onerow in run_file_handle:
        ## Skip comments
        if onerow.startswith('#') or len(onerow.strip()) == 0:
            continue

        row = onerow.split()
        assert len(row) == 11, row
        try:
            stream_id = row[2]
            timestamp = int(stream_id.split('-')[0])
            target_id = row[3]
            conf = int(float(row[4]))
            rating = int(row[5])
            contains_mention = int(row[6])
            date_hour = row[7]
            slot_type = row[8]
            equiv_id = row[9]
            start_byte, end_byte = row[10].split('-')
            start_byte = int(start_byte)
            end_byte = int(end_byte)
        except Exception, exc:
            print repr(row)
            sys.exit(traceback.format_exc(exc))

        if target_id not in num_assertions:
            num_assertions[target_id] = {'total': 0,
                                         'is_annotated_TP': 0}

        ## keep track of total number of assertions per entity
        num_assertions[target_id]['total'] += 1

        ## all modes start with DOCS, so is_annotated_TP means that
        ## the system has a DOCS-TP above some conf threshold
        is_annotated_TP = False
        if stream_id in annotation:
            if target_id in annotation[stream_id]:
                if slot_type in annotation[stream_id][target_id]:
                    is_annotated_TP = True
                    rec = (stream_id, target_id, conf, rating, contains_mention,
                           date_hour, slot_type, equiv_id, start_byte, end_byte)
                    DOCS_TPs.append(rec)
                    log('TP: %r' % (rec,))

        if is_annotated_TP:
            num_assertions[target_id]['is_annotated_TP'] += 1

        increment_CM(is_annotated_TP, conf, cutoffs, CM, DOCS, target_id,
                     unannotated_is_TN)
def ssf_runs(args):
    '''
    yield file handles for all of the SSF runs
    '''
    log('This assumes that all run file names end in .gz')
    run_count = 0
    for run_file_name in os.listdir(args.run_dir):
        if not run_file_name.endswith('.gz'):
            log('ignoring: %s' % run_file_name)
            continue
        if args.run_name_filter and not run_file_name.startswith(args.run_name_filter):
            log('ignoring: %s' % run_file_name)
            continue

        ## Open the run file
        run_file_path = os.path.join(args.run_dir, run_file_name)
        if run_file_path.endswith('.gz'):
            run_file_handle = gzip.open(run_file_path, 'r')
        else:
            run_file_handle = open(run_file_path, 'r')

        first_line = run_file_handle.readline()
        assert first_line.startswith('#')
        try:
            filter_run = json.loads(first_line[1:])
        except:
            sys.exit('failed to get JSON out of: %r' % first_line[1:])

        ### many CCR runs, including some from organizers, have task_id
        ### set to SSF :-(, so we must detect this.
        ## read to first non-comment line
        second_line = None
        while not second_line:
            second_line = run_file_handle.readline()
            if second_line.strip().startswith('#'):
                second_line = None

        if 'NULL' in second_line or filter_run['task_id'] != 'kba-ssf-2013':
            log('ignoring non-SSF run: %s' % run_file_name)
            continue

        ## Open the run file again now that we verified it is SSF
        run_file_path = os.path.join(args.run_dir, run_file_name)
        if run_file_path.endswith('.gz'):
            run_file_handle = gzip.open(run_file_path, 'r')
        else:
            run_file_handle = open(run_file_path, 'r')

        log('processing: %s' % run_file_name)
        log(json.dumps(filter_run, indent=4, sort_keys=True))

        yield run_file_name, run_file_handle
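## Illustrative only: a minimal sketch of how ssf_runs() and the mode-specific
## scorers above fit together, assuming the SSF (JSON) load_annotation()
## variant has already been called and a per-mode `positives` count dict has
## been computed elsewhere.  The function name is hypothetical and is not
## part of this module's actual driver code.
def score_all_ssf_runs_sketch(args, annotation, positives):
    results = dict()
    for run_file_name, run_file_handle in ssf_runs(args):
        ## the four SSF evaluation stages feed each other in this order
        CM, DOCS_TPs = score_confusion_matrix_DOCS(
            run_file_handle, annotation, positives,
            cutoff_step_size=args.cutoff_step,
            unannotated_is_TN=args.unan_is_true)
        CM, OVERLAP_TPs = score_confusion_matrix_OVERLAP(
            CM, DOCS_TPs, annotation, positives,
            cutoff_step_size=args.cutoff_step)
        CM, FILL_TPs = score_confusion_matrix_FILL(
            CM, OVERLAP_TPs, annotation, positives,
            cutoff_step_size=args.cutoff_step)
        CM, DATE_HOUR_TPs = score_confusion_matrix_DATE_HOUR(
            CM, FILL_TPs, annotation, positives,
            cutoff_step_size=args.cutoff_step)
        results[run_file_name] = CM
    return results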
def score_confusion_matrix_OVERLAP(CM, DOCS_TPs, annotation, positives,
                                   cutoff_step_size=50,
                                   unannotated_is_TN=False, debug=False):
    '''
    construct OVERLAP_TPs by excluding from DOCS_TPs those assertions
    that do not overlap any string identified by an assessor
    '''
    cutoffs = range(0, 999, cutoff_step_size)
    OVERLAP_TPs = dict()
    log('considering %d unique OVERLAP assertions' % len(DOCS_TPs))
    for rec in DOCS_TPs:
        (stream_id, target_id, conf, rating, contains_mention, date_hour,
         slot_type, runs_equiv_id, start_byte, end_byte) = rec

        if positives[OVERLAP].get(target_id, 0) == 0:
            log('ignoring assertion on entity for which no OVERLAP positives '
                'are known: %s' % target_id)
            continue

        start_byte = int(start_byte)
        end_byte = int(end_byte)

        for true_equiv_id, equiv_class in annotation[stream_id][target_id][slot_type].items():
            offsets = equiv_class['stream_ids'][stream_id][1]
            overlaps = False
            for offset in offsets:
                assert isinstance(offset[0], int)
                assert isinstance(offset[1], int)
                ## we could/should be much stricter here, 10x is a big window
                true_len = offset[1] - offset[0]
                runs_len = end_byte - start_byte
                if start_byte <= offset[1] and end_byte >= offset[0] \
                        and runs_len < 10 * true_len:
                    overlaps = True
                    break

            #log('(%d, %d) compared to offsets %r\n' % (start_byte, end_byte, offsets))

            if not overlaps:
                increment_CM(False, conf=conf, cutoffs=cutoffs, CM=CM,
                             mode=OVERLAP, target_id=target_id,
                             unannotated_is_TN=unannotated_is_TN)
                continue

            #log('found one!! system equiv_id (%r) --> assessors equiv_id (%r)'
            #    % (runs_equiv_id, true_equiv_id))
            rec = list(rec)
            rec[7] = (runs_equiv_id, true_equiv_id)
            rec = tuple(rec)

            assertion_key = (stream_id, target_id, slot_type, start_byte, end_byte)
            if assertion_key in OVERLAP_TPs:
                other_row = OVERLAP_TPs[assertion_key]
                if other_row[4] > conf:
                    log('ignoring a duplicate row with lower conf: %d > %d'
                        % (other_row[4], conf))
                    continue

            OVERLAP_TPs[assertion_key] = rec
            increment_CM(True, conf=conf, cutoffs=cutoffs, CM=CM,
                         mode=OVERLAP, target_id=target_id,
                         unannotated_is_TN=unannotated_is_TN)

    correct_FN(CM, OVERLAP, positives)

    if OVERLAP_TPs:
        assert CM[OVERLAP]

    OVERLAP_TPs = OVERLAP_TPs.values()

    return CM, OVERLAP_TPs
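## correct_FN() is called by the mode-specific scorers above but is not
## defined in this section.  The sketch below mirrors the inline FN
## correction in score_confusion_matrix_DATE_HOUR: at each cutoff, FN is the
## number of known positives that the run did not assert as TP.  The real
## helper may differ in detail.
def correct_FN_sketch(CM, mode, positives):
    for target_id in CM[mode]:
        for cutoff in CM[mode][target_id]:
            CM[mode][target_id][cutoff]['FN'] = \
                positives[mode].get(target_id, 0) \
                - CM[mode][target_id][cutoff]['TP']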
def load_annotation(path_to_annotation_file, thresh, min_len_clean_visible,
                    reject, require_positives=False, any_up=False,
                    restricted_entity_list=None):
    '''Loads the annotation file into a dict

    path_to_annotation_file: string filesystem path to the annotation file

    :param thresh: ratings at or above this value are treated as positive

    :param min_len_clean_visible: minimum length of the clean_visible,
    which is in the 12th column of the expanded truth data file

    :param reject: callable that returns boolean given a target_id

    :param require_positives: if set to True, reject any target entity
    for which no true positives exist.

    :param restricted_entity_list: a list of target_id strings that are
    the only ones allowed in the annotation.
    '''
    assert -1 <= thresh <= 2, thresh

    annotation_file = csv.reader(open(path_to_annotation_file, 'r'),
                                 delimiter='\t')

    annotation = dict()
    for row in annotation_file:
        ## Skip comments
        if row[0][0] == "#":
            continue

        stream_id = row[2]
        target_id = row[3]
        rating = int(row[5])
        assert -1 <= rating <= 2, rating

        if len(row) == 12:
            ## only the later versions of the truth data carried this
            ## twelfth column for excluding documents with insufficient
            ## clean_visible to be judged.  We use a default cutoff of
            ## 100 bytes, which means removing these counts below:
            # (stream_id, target_id) pairs: 34921 above, and 15767 below 100 bytes of clean_visible
            # (assessor_id, stream_id, target_id) pairs: 47446 above, and 19948 below 100 bytes of clean_visible
            len_clean_visible = int(row[11])
            if len_clean_visible < min_len_clean_visible:
                log('excluding stream_id=%s for len(clean_visible)=%d'
                    % (stream_id, len_clean_visible))
                continue

        if reject(target_id):
            log('excluding truth data for %s' % target_id)
            continue

        if restricted_entity_list and target_id not in restricted_entity_list:
            log('not in restricted_entity_list: %s' % target_id)
            continue

        ## Add the stream_id and target_id to a hashed dictionary
        ## 0 means that it is not vital, 1 means that it is vital
        if (stream_id, target_id) in annotation:
            ## if rating is below threshold, then some assessor viewed
            ## it as not good enough, so be conservative and downgrade
            if not any_up and rating < thresh:
                ## default any_up=False means that if *any* assessor
                ## voted *against* the assertion, then *exclude* it
                annotation[(stream_id, target_id)] = False
            elif any_up and rating >= thresh:
                ## any_up means that if *any* assessor voted *for* the
                ## assertion, then *include* it
                annotation[(stream_id, target_id)] = True
        else:
            ## store bool values in the annotation index
            annotation[(stream_id, target_id)] = rating >= thresh

    has_true = Counter()
    for (stream_id, target_id), is_true in annotation.items():
        if is_true:
            has_true[target_id] += 1

    if require_positives:
        for stream_id, target_id in annotation.keys():
            if has_true[target_id] < require_positives:
                log('rejecting %s for too few true positives: %d < %d = require_positives'
                    % (target_id, has_true[target_id], require_positives))
                annotation.pop((stream_id, target_id))

    log('%d target_ids have at least one true positive' % len(has_true))

    num_true = sum(map(int, annotation.values()))
    log('loaded annotation to create a dict of %d (stream_id, target_id) '
        'pairs with %d True' % (len(annotation), num_true))
    if num_true == 0:
        sys.exit('found no true positives given the filters')

    return annotation
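## Illustrative usage only: the CCR load_annotation() above returns a dict
## mapping (stream_id, target_id) -> bool, where True means the pair was
## judged at or above the rating threshold.  The truth-data file name below
## is a hypothetical placeholder, not a path used by this module.
def load_annotation_example():
    '''sketch only: load the truth data and report how many pairs are True'''
    annotation = load_annotation('trec-kba-ccr-judgments.tsv',
                                 thresh=2, min_len_clean_visible=100,
                                 reject=lambda target_id: False)
    num_true = sum(1 for is_pos in annotation.values() if is_pos)
    log('%d of %d judged (stream_id, target_id) pairs are True'
        % (num_true, len(annotation)))
    return annotation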