for docname in base_names: try: with Annotations(path_join(directory, docname), read_only=True) as ann_obj: tb_count = len([a for a in ann_obj.get_entities()]) rel_count = (len([a for a in ann_obj.get_relations()]) + len([a for a in ann_obj.get_equivs()])) event_count = len([a for a in ann_obj.get_events()]) if options_get_validation(directory) == 'none': docstats.append([tb_count, rel_count, event_count]) else: # verify and include verification issue count try: from projectconfig import ProjectConfiguration projectconf = ProjectConfiguration(directory) from verify_annotations import verify_annotation issues = verify_annotation(ann_obj, projectconf) issue_count = len(issues) except: # TODO: error reporting issue_count = -1 docstats.append( [tb_count, rel_count, event_count, issue_count]) except Exception, e: log_info('Received "%s" when trying to generate stats' % e) # Pass exceptions silently, just marking stats missing docstats.append([-1] * len(stat_types)) # Cache the statistics try:
def get_statistics(directory, base_names, use_cache=True):
    """Return per-document annotation statistics for a collection.

    Args:
        directory: collection directory containing the documents.
        base_names: document base names (without suffix) to gather stats for.
        use_cache: if True, reuse a pickled stats cache when it is newer
            than every file in the directory and the configuration.

    Returns:
        (stat_types, docstats) where stat_types is a list of
        (column_name, column_type) headers and docstats is a list (one
        entry per document) of counts: [entities, relations, events] plus
        an issue count when validation is enabled. A document whose
        annotations could not be read contributes a row of -1s.
    """
    # Check if we have a cache of the costly statistics generation.
    # Also, only use it if no file is newer than the cache itself.
    cache_file_path = get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as e:
        # errno 2 == ENOENT: no cache yet; treat as infinitely old
        if e.errno == 2:
            cache_mtime = -1
        else:
            raise

    try:
        if (not isfile(cache_file_path)
                # Has config.py been changed?
                or getmtime(get_config_py_path()) > cache_mtime
                # Any file has changed in the dir since the cache was
                # generated
                or any(True for f in listdir(directory)
                       if (getmtime(path_join(directory, f)) > cache_mtime
                           # Ignore hidden files
                           and not f.startswith('.')))
                # The configuration is newer than the cache
                or getmtime(get_config_path(directory)) > cache_mtime):
            generate = True
            docstats = []
        else:
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating'
                        % cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating'
                    % cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt (truncated) data, re-generate
                generate = True
    except OSError:
        # A file vanished (or similar) while we were checking mtimes;
        # fall back to regenerating from scratch.
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating'
        )
        generate = True

    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"),
                  ("Events", "int")]

    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))

    if generate:
        # Generate the document statistics from scratch
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    # Count without materializing intermediate lists
                    tb_count = sum(1 for _ in ann_obj.get_entities())
                    rel_count = (sum(1 for _ in ann_obj.get_relations())
                                 + sum(1 for _ in ann_obj.get_equivs()))
                    event_count = sum(1 for _ in ann_obj.get_events())

                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except Exception:
                            # TODO: error reporting; -1 marks "could not
                            # verify" in the stats table.
                            issue_count = -1
                        docstats.append(
                            [tb_count, rel_count, event_count, issue_count])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        # Cache the statistics
        try:
            with open(cache_file_path, 'wb') as cache_file:
                pickle_dump(docstats, cache_file)
        except IOError as e:
            Messager.warning(
                "Could not write statistics cache file to directory %s: %s" %
                (directory, e))

    return stat_types, docstats
def _enrich_json_with_data(j_dic, ann_obj):
    # Populate the client-facing JSON dict j_dic with the annotations held
    # in ann_obj: events, relations, triggers/entities, equivs, attributes,
    # normalizations, comments, verification issues and source-file info.
    # Mutates j_dic in place; returns None.
    #
    # TODO: figure out if there's a reason for all the str()
    # invocations here; remove if not.

    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append(
            [str(event_ann.id), str(event_ann.trigger), event_ann.args])

    for rel_ann in ann_obj.get_relations():
        j_dic['relations'].append([
            str(rel_ann.id), str(rel_ann.type),
            [(rel_ann.arg1l, rel_ann.arg1), (rel_ann.arg2l, rel_ann.arg2)]
        ])

    for tb_ann in ann_obj.get_textbounds():
        #j_tb = [str(tb_ann.id), tb_ann.type, tb_ann.start, tb_ann.end]
        j_tb = [str(tb_ann.id), tb_ann.type, tb_ann.spans]

        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        # NOTE(review): trigger_ids holds event_ann.trigger values while the
        # membership test uses str(tb_ann.id) — presumably both are already
        # strings; confirm against the annotation classes.
        if str(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
            # special case for BioNLP ST 2013 format: send triggers
            # also as entities for those triggers that are referenced
            # from annotations other than events (#926).
            if BIONLP_ST_2013_COMPATIBILITY:
                if tb_ann.id in ann_obj.externally_referenced_triggers:
                    # The 'entities' key may not exist yet; create on demand.
                    try:
                        j_dic['entities'].append(j_tb)
                    except KeyError:
                        j_dic['entities'] = [
                            j_tb,
                        ]
        else:
            # Not a trigger: expose the textbound as a plain entity
            # (creating the 'entities' key on demand).
            try:
                j_dic['entities'].append(j_tb)
            except KeyError:
                j_dic['entities'] = [
                    j_tb,
                ]

    for eq_ann in ann_obj.get_equivs():
        j_dic['equivs'].append(
            (['*', eq_ann.type] + [e for e in eq_ann.entities]))

    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append([
            str(att_ann.id), str(att_ann.type), str(att_ann.target),
            att_ann.value
        ])

    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append([
            str(norm_ann.id), str(norm_ann.type), str(norm_ann.target),
            str(norm_ann.refdb), str(norm_ann.refid), str(norm_ann.reftext)
        ])

    for com_ann in ann_obj.get_oneline_comments():
        comment = [
            str(com_ann.target), str(com_ann.type), com_ann.tail.strip()
        ]
        # 'comments' key may be absent; create on first use.
        try:
            j_dic['comments'].append(comment)
        except KeyError:
            j_dic['comments'] = [
                comment,
            ]

    # Report any annotation lines that failed to parse, as a single error.
    if ann_obj.failed_lines:
        error_msg = 'Unable to parse the following line(s):\n%s' % (
            '\n'.join([
                (
                    '%s: %s' % (
                        # The line number is off by one
                        str(line_num + 1),
                        str(ann_obj[line_num]))).strip()
                for line_num in ann_obj.failed_lines
            ]))
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)

    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime

    # Run annotation verification when validation is enabled for the
    # document's directory; failures are reported but never fatal.
    try:
        # XXX avoid digging the directory from the ann_obj
        import os
        docdir = os.path.dirname(ann_obj._document)
        if options_get_validation(docdir) in (
                'all',
                'full',
        ):
            from verify_annotations import verify_annotation
            projectconf = ProjectConfiguration(docdir)
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)

    # Surface verification issues to the client as comments.
    for i in issues:
        issue = (str(i.ann_id), i.type, i.description)
        try:
            j_dic['comments'].append(issue)
        except BaseException:
            j_dic['comments'] = [
                issue,
            ]

    # Attach the source files for the annotations and text
    from os.path import splitext
    from annotation import TEXT_FILE_SUFFIX
    ann_files = [splitext(p)[1][1:] for p in ann_obj._input_files]
    ann_files.append(TEXT_FILE_SUFFIX)
    ann_files = sorted([p for p in set(ann_files)])
    j_dic['source_files'] = ann_files
def get_annotator_config(directory):
    """Return the automatic-annotation service configuration for *directory*.

    TODO: "annotator" is a very confusing term for a web service
    that does automatic annotation in the context of a tool
    where most annotators are expected to be human. Rethink.
    """
    project_config = ProjectConfiguration(directory)
    return project_config.get_annotator_config()
def get_normalization_config(directory):
    """Return the normalization configuration for *directory*."""
    project_config = ProjectConfiguration(directory)
    return project_config.get_normalization_config()
def get_disambiguator_config(directory):
    """Return the disambiguator configuration for *directory*."""
    project_config = ProjectConfiguration(directory)
    return project_config.get_disambiguator_config()
def get_search_config(directory):
    """Return the search configuration for *directory*."""
    project_config = ProjectConfiguration(directory)
    return project_config.get_search_config()