                    'Stats cache %s was corrupted; regenerating'
                    % cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError as e:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating')
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"), ("Events", "int")]

    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))

    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])
def _enrich_json_with_data(j_dic, ann_obj):
    # TODO: figure out if there's a reason for all the unicode()
    # invocations here; remove if not.

    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append([
            unicode(event_ann.id),
            unicode(event_ann.trigger),
            event_ann.args
        ])

    for rel_ann in ann_obj.get_relations():
        j_dic['relations'].append([
            unicode(rel_ann.id),
            unicode(rel_ann.type),
            [(rel_ann.arg1l, rel_ann.arg1), (rel_ann.arg2l, rel_ann.arg2)]
        ])

    for tb_ann in ann_obj.get_textbounds():
        #j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.start, tb_ann.end]
        j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.spans]

        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        if unicode(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
            # special case for BioNLP ST 2013 format: send triggers
            # also as entities for those triggers that are referenced
            # from annotations other than events (#926).
            if BIONLP_ST_2013_COMPATIBILITY:
                if tb_ann.id in ann_obj.externally_referenced_triggers:
                    try:
                        j_dic['entities'].append(j_tb)
                    except KeyError:
                        j_dic['entities'] = [
                            j_tb,
                        ]
        else:
            try:
                j_dic['entities'].append(j_tb)
            except KeyError:
                j_dic['entities'] = [
                    j_tb,
                ]

    for eq_ann in ann_obj.get_equivs():
        j_dic['equivs'].append(
            (['*', eq_ann.type] + [e for e in eq_ann.entities]))

    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append([
            unicode(att_ann.id),
            unicode(att_ann.type),
            unicode(att_ann.target),
            att_ann.value
        ])

    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append([
            unicode(norm_ann.id),
            unicode(norm_ann.type),
            unicode(norm_ann.target),
            unicode(norm_ann.refdb),
            unicode(norm_ann.refid),
            unicode(norm_ann.reftext)
        ])

    for com_ann in ann_obj.get_oneline_comments():
        comment = [
            unicode(com_ann.target),
            unicode(com_ann.type),
            com_ann.tail.strip()
        ]
        try:
            j_dic['comments'].append(comment)
        except KeyError:
            j_dic['comments'] = [
                comment,
            ]

    if ann_obj.failed_lines:
        error_msg = 'Unable to parse the following line(s):\n%s' % (
            '\n'.join([
                ('%s: %s' % (
                    # The line number is off by one
                    unicode(line_num + 1),
                    unicode(ann_obj[line_num]))).strip()
                for line_num in ann_obj.failed_lines
            ]))
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)

    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime

    try:
        # XXX avoid digging the directory from the ann_obj
        import os
        docdir = os.path.dirname(ann_obj._document)
        if options_get_validation(docdir) in (
                'all',
                'full',
        ):
            from verify_annotations import verify_annotation
            projectconf = ProjectConfiguration(docdir)
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)
def _enrich_json_with_data(j_dic, ann_obj):
    # TODO: figure out if there's a reason for all the unicode()
    # invocations here; remove if not.

    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append(
            [unicode(event_ann.id), unicode(event_ann.trigger), event_ann.args]
        )

    for rel_ann in ann_obj.get_relations():
        j_dic['relations'].append(
            [unicode(rel_ann.id), unicode(rel_ann.type),
             [(rel_ann.arg1l, rel_ann.arg1), (rel_ann.arg2l, rel_ann.arg2)]]
        )

    for tb_ann in ann_obj.get_textbounds():
        #j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.start, tb_ann.end]
        j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.spans]

        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        if unicode(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
            # special case for BioNLP ST 2013 format: send triggers
            # also as entities for those triggers that are referenced
            # from annotations other than events (#926).
            if BIONLP_ST_2013_COMPATIBILITY:
                if tb_ann.id in ann_obj.externally_referenced_triggers:
                    try:
                        j_dic['entities'].append(j_tb)
                    except KeyError:
                        j_dic['entities'] = [j_tb, ]
        else:
            try:
                j_dic['entities'].append(j_tb)
            except KeyError:
                j_dic['entities'] = [j_tb, ]

    for eq_ann in ann_obj.get_equivs():
        j_dic['equivs'].append(
            (['*', eq_ann.type] + [e for e in eq_ann.entities])
        )

    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append(
            [unicode(att_ann.id), unicode(att_ann.type),
             unicode(att_ann.target), att_ann.value]
        )

    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append(
            [unicode(norm_ann.id), unicode(norm_ann.type),
             unicode(norm_ann.target), unicode(norm_ann.refdb),
             unicode(norm_ann.refid), unicode(norm_ann.reftext)]
        )

    for com_ann in ann_obj.get_oneline_comments():
        comment = [unicode(com_ann.target), unicode(com_ann.type),
                   com_ann.tail.strip()]
        try:
            j_dic['comments'].append(comment)
        except KeyError:
            j_dic['comments'] = [comment, ]

    if ann_obj.failed_lines:
        error_msg = 'Unable to parse the following line(s):\n%s' % (
            '\n'.join([('%s: %s' % (
                # The line number is off by one
                unicode(line_num + 1),
                unicode(ann_obj[line_num])
            )).strip() for line_num in ann_obj.failed_lines])
        )
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)

    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime

    try:
        # XXX avoid digging the directory from the ann_obj
        import os
        docdir = os.path.dirname(ann_obj._document)
        if options_get_validation(docdir) in ('all', 'full', ):
            from verify_annotations import verify_annotation
            projectconf = ProjectConfiguration(docdir)
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)
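# A minimal usage sketch for _enrich_json_with_data(), added for illustration;
# it is not part of the original module. The function appends to
# j_dic['events'], ['relations'], ['triggers'], ['equivs'], ['attributes'] and
# ['normalizations'] without a KeyError fallback (only 'entities' and
# 'comments' are created on demand), so the caller is assumed to pre-populate
# those keys. The Annotations import and the .ann file path are assumptions.
def _example_enrich(ann_path):
    from annotation import Annotations  # assumed to match the snippets above
    j_dic = {
        'events': [], 'relations': [], 'triggers': [], 'equivs': [],
        'attributes': [], 'normalizations': [], 'comments': [],
    }
    with Annotations(ann_path, read_only=True) as ann_obj:
        _enrich_json_with_data(j_dic, ann_obj)
    return j_dic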
def get_statistics(directory, base_names, use_cache=True):
    # Check if we have a cache of the costly statistics generation
    # Also, only use it if no file is newer than the cache itself
    cache_file_path = get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as e:
        if e.errno == 2:
            cache_mtime = -1
        else:
            raise

    try:
        if (not isfile(cache_file_path)
                # Has config.py been changed?
                or getmtime(get_config_py_path()) > cache_mtime
                # Any file has changed in the dir since the cache was generated
                or any(True for f in listdir(directory)
                       if (getmtime(path_join(directory, f)) > cache_mtime
                           # Ignore hidden files
                           and not f.startswith('.')))
                # The configuration is newer than the cache
                or getmtime(get_config_path(directory)) > cache_mtime):
            generate = True
            docstats = []
        else:
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating'
                        % cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating'
                    % cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError as e:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating')
        generate = True

    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("实体", "int"), ("关系", "int"), ("事件", "int")]

    if options_get_validation(directory) != 'none':
        stat_types.append(("观点", "int"))
    stat_types.append(("修改者", "string"))

    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                # Collect the entities, relations, events and modifying user here.
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])

                    try:
                        user = get_session().get('user')
                    except KeyError:
                        user = None
                    if user is None:
                        user = '******'

                    if options_get_validation(directory) == 'none':
                        docstats.append(
                            [tb_count, rel_count, event_count, user])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except BaseException:
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append([
                            tb_count, rel_count, event_count, issue_count,
                            user
                        ])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        try:
            user = get_session().get('user')
        except KeyError:
            user = None
        if user is None:
            user = '******'

        # Cache the statistics
        try:
            with open(cache_file_path, 'wb') as cache_file:
                pickle_dump(docstats, cache_file)
        except IOError as e:
            Messager.warning(
                "Could not write statistics cache file to directory %s: %s"
                % (directory, e))

    return stat_types, docstats
    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])

                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except:
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append([tb_count, rel_count, event_count,
                                         issue_count])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
def get_statistics(directory, base_names, use_cache=True):
    # Check if we have a cache of the costly statistics generation
    # Also, only use it if no file is newer than the cache itself
    cache_file_path = get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as e:
        if e.errno == 2:
            cache_mtime = -1
        else:
            raise

    try:
        if (not isfile(cache_file_path)
                # Has config.py been changed?
                or getmtime(get_config_py_path()) > cache_mtime
                # Any file has changed in the dir since the cache was generated
                or any(True for f in listdir(directory)
                       if (getmtime(path_join(directory, f)) > cache_mtime
                           # Ignore hidden files
                           and not f.startswith('.')))
                # The configuration is newer than the cache
                or getmtime(get_config_path(directory)) > cache_mtime):
            generate = True
            docstats = []
        else:
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating'
                        % cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating'
                    % cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError as e:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating')
        generate = True

    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"), ("Events", "int")]

    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))

    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])

                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except BaseException:
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append(
                            [tb_count, rel_count, event_count, issue_count])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        # Cache the statistics
        try:
            with open(cache_file_path, 'wb') as cache_file:
                pickle_dump(docstats, cache_file)
        except IOError as e:
            Messager.warning(
                "Could not write statistics cache file to directory %s: %s"
                % (directory, e))

    return stat_types, docstats
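# A minimal usage sketch for get_statistics(), added for illustration; it is
# not part of the original module. The directory and document base names below
# are placeholder assumptions, and the call relies on the surrounding module
# imports (getmtime, listdir, pickle_load, Messager, etc.) being in place.
def _print_stats_example():
    directory = '/path/to/collection'   # hypothetical collection directory
    base_names = ['doc-01', 'doc-02']   # hypothetical document base names
    stat_types, docstats = get_statistics(directory, base_names,
                                          use_cache=False)
    # stat_types is a list of (column name, column type) pairs; docstats has
    # one row per document, in base_names order.
    print('document\t' + '\t'.join(name for name, _type in stat_types))
    for docname, row in zip(base_names, docstats):
        print(docname + '\t' + '\t'.join(str(v) for v in row))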