def evaluate(system, gs, eval_class, **kwargs):
    """Evaluate system output(s) against the gold standard with eval_class.

    eval_class (a subclass of Evaluate or CombinedEvaluation, e.g.
    EvaluatePHI or EvaluateCardiacRisk) is called with two annotation-id
    indexed dicts of StandoffAnnotation objects: one for a system and one
    for the gold standard.  This function mostly just handles formatting
    arguments for the eval_class.

    Args:
        system: list containing either one file, or one or more
            directories.  In file mode only system[0] is used.
        gs: path to the gold standard; a file or a directory.
        eval_class: the evaluation class to instantiate.
        **kwargs: forwarded to eval_class().  A 'verbose' key, if present,
            is removed first and passed to print_report() instead.

    Returns:
        The single evaluation object when exactly one evaluation was run,
        otherwise the list of evaluation objects (one per system id).

    Raises:
        AssertionError: if eval_class is not an accepted class or any of
            the given paths does not exist.
        Exception: if the arguments are neither file/file nor
            directories/directory.
    """
    assert issubclass(eval_class, Evaluate) or \
        issubclass(eval_class, CombinedEvaluation), \
        "Must pass in EvaluatePHI or EvaluateCardiacRisk classes to evaluate()"

    gold_sa = {}
    evaluations = []

    # 'verbose' is not a keyword accepted by the eval classes' __init__(),
    # so strip it from kwargs before they are forwarded.
    verbose = kwargs.pop('verbose', False)

    assert os.path.exists(gs), "{} does not exist!".format(gs)

    for s in system:
        assert os.path.exists(s), "{} does not exist!".format(s)

    # Handle if two files were passed on the command line
    if os.path.isfile(system[0]) and os.path.isfile(gs):
        gold_doc = StandoffAnnotation(gs)
        system_doc = StandoffAnnotation(system[0])
        e = eval_class({system_doc.id: system_doc},
                       {gold_doc.id: gold_doc}, **kwargs)
        e.print_docs()
        evaluations.append(e)

    # Handle the case where 'gs' is a directory and 'system' is a
    # list of directories.  For individual evaluation (one system output
    # against the gold standard) this is a little overkill, but this
    # lets us run multiple systems against the gold standard and get numbers
    # for each system output.  Useful for annotator agreement and final
    # system evaluations.  Error checking to ensure consistent files in each
    # directory will be handled by the evaluation class.
    elif all(os.path.isdir(s) for s in system) and os.path.isdir(gs):
        # Get a dict of gold standoff annotation indexed by id.
        # os.path.join() keeps this working whether or not 'gs' was given
        # with a trailing path separator.
        for fn in os.listdir(gs):
            sa = StandoffAnnotation(os.path.join(gs, fn))
            gold_sa[sa.id] = sa

        for system_sa in get_document_dict_by_system_id(system).values():
            e = eval_class(system_sa, gold_sa, **kwargs)
            e.print_report(verbose=verbose)
            evaluations.append(e)

    else:
        # Mixed files/directories: report the usage error instead of
        # silently returning an empty result.
        raise Exception("Must pass file.xml file.xml or [directory/]+ "
                        "directory/ on command line!")

    return evaluations[0] if len(evaluations) == 1 else evaluations
def get_document_dict_by_system_id(system_dirs):
    """Return every StandoffAnnotation in system_dirs, indexed by ids.

    Takes a list of directories and returns all of the StandoffAnnotations
    found in them as a system id, annotation id indexed dictionary:
    documents[sys_id][annotation_id] -> StandoffAnnotation.

    System id (or StandoffAnnotation.sys_id) is whatever values trail the
    XXX-YY file id.  For example:

        301-01foo.xml
        patient id:   301
        document id:  01
        system id:    foo

    In the case where there is nothing trailing the document id, the sys_id
    is the empty string ('').

    Args:
        system_dirs: list of directory paths; every entry in each directory
            is loaded as a StandoffAnnotation.

    Returns:
        A defaultdict of defaultdicts.  Note that looking up a missing
        annotation id in an inner dict inserts and returns 0 (the inner
        default factory is int).
    """
    documents = defaultdict(lambda: defaultdict(int))

    for d in system_dirs:
        for fn in os.listdir(d):
            # os.path.join() builds a valid path whether or not the
            # directory was given with a trailing path separator.
            sa = StandoffAnnotation(os.path.join(d, fn))
            documents[sa.sys_id][sa.id] = sa

    return documents