def getAnnObject2(collection, document):
    """Newest version of the getAnnObject method.

    Build (or load from the pickle cache under WORK_DIR/application/) a
    SimpleAnnotations object for *document* in *collection*, attach folia
    extra info and, when the session config enables it, validation issues,
    then re-cache and return the object.
    """
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(collection)
    except Exception:
        # unknown collection (or document module unavailable): treat the
        # collection name itself as a directory path
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    # flatten the collection/document path into a single cache file name
    full_name = (collection + document).replace("/", "")
    cache_path = app_path + full_name
    if isfile(cache_path):
        # reuse the previously pickled annotation object
        with open(cache_path, 'rb') as temp:
            ann = pickle_load(temp)
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            # TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    # Validation:
    try:
        import os
        import simplejson as json
        import session
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        # validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:
                from verify_annotations import verify_annotation
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        # config has no "validationOn" entry
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        # report instead of swallowing silently (matches getAnnObject)
        Messager.error('Error: validation failed: %s' % e)
    ann.issues = issues
    # refresh the cache with the enriched object
    with open(cache_path, 'wb') as temp:
        pickle_dump(ann, temp)
    return ann
def getAnnObject(collection, document):
    """Load a SimpleAnnotations object for *document* in *collection*.

    A pickled copy is kept under WORK_DIR/application/ and reused on later
    calls; folia extra info and (when enabled in the session config)
    validation issues are attached before the object is re-cached.
    """
    try:
        real_dir = real_directory(collection)
    except Exception:
        # unknown collection: treat the collection name itself as a path
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    # flatten the collection/document path into a single cache file name
    full_name = (collection + document).replace("/", "")
    cache_path = app_path + full_name
    if os.path.isfile(cache_path):
        # reuse the previously pickled annotation object
        with open(cache_path, 'rb') as temp:
            ann = pickle_load(temp)
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            # TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    # Validation:
    try:
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        # validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        # config has no "validationOn" entry
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: validation failed: %s' % e)
    ann.issues = issues
    # refresh the cache with the enriched object
    with open(cache_path, 'wb') as temp:
        pickle_dump(ann, temp)
    return ann
with Annotations(path_join(directory, docname), read_only=True) as ann_obj: tb_count = len([a for a in ann_obj.get_entities()]) rel_count = (len([a for a in ann_obj.get_relations()]) + len([a for a in ann_obj.get_equivs()])) event_count = len([a for a in ann_obj.get_events()]) if options_get_validation(directory) == 'none': docstats.append([tb_count, rel_count, event_count]) else: # verify and include verification issue count try: from projectconfig import ProjectConfiguration projectconf = ProjectConfiguration(directory) from verify_annotations import verify_annotation issues = verify_annotation(ann_obj, projectconf) issue_count = len(issues) except: # TODO: error reporting issue_count = -1 docstats.append( [tb_count, rel_count, event_count, issue_count]) except Exception, e: log_info('Received "%s" when trying to generate stats' % e) # Pass exceptions silently, just marking stats missing docstats.append([-1] * len(stat_types)) # Cache the statistics try: with open(cache_file_path.decode('utf-8').encode('utf-8'), 'wb') as cache_file:
def _enrich_json_with_data(j_dic, ann_obj):
    # Populate j_dic (a response dict with pre-initialised list slots such
    # as 'events', 'relations', 'triggers') with JSON-serialisable
    # renderings of the annotations held by ann_obj.
    # TODO: figure out if there's a reason for all the unicode()
    # invocations here; remove if not.

    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append([
            unicode(event_ann.id),
            unicode(event_ann.trigger), event_ann.args
        ])

    for rel_ann in ann_obj.get_relations():
        # relation arguments are serialised as (label, target-id) pairs
        j_dic['relations'].append([
            unicode(rel_ann.id),
            unicode(rel_ann.type),
            [(rel_ann.arg1l, rel_ann.arg1), (rel_ann.arg2l, rel_ann.arg2)]
        ])

    for tb_ann in ann_obj.get_textbounds():
        #j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.start, tb_ann.end]
        j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.spans]

        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        if unicode(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
            # special case for BioNLP ST 2013 format: send triggers
            # also as entities for those triggers that are referenced
            # from annotations other than events (#926).
            if BIONLP_ST_2013_COMPATIBILITY:
                if tb_ann.id in ann_obj.externally_referenced_triggers:
                    try:
                        j_dic['entities'].append(j_tb)
                    except KeyError:
                        # 'entities' slot absent: create it on demand
                        j_dic['entities'] = [
                            j_tb,
                        ]
        else:
            try:
                j_dic['entities'].append(j_tb)
            except KeyError:
                # 'entities' slot absent: create it on demand
                j_dic['entities'] = [
                    j_tb,
                ]

    for eq_ann in ann_obj.get_equivs():
        # equivs get a '*' placeholder in the id position
        j_dic['equivs'].append(
            (['*', eq_ann.type] + [e for e in eq_ann.entities]))

    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append([
            unicode(att_ann.id),
            unicode(att_ann.type),
            unicode(att_ann.target), att_ann.value
        ])

    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append([
            unicode(norm_ann.id),
            unicode(norm_ann.type),
            unicode(norm_ann.target),
            unicode(norm_ann.refdb),
            unicode(norm_ann.refid),
            unicode(norm_ann.reftext)
        ])

    for com_ann in ann_obj.get_oneline_comments():
        comment = [
            unicode(com_ann.target),
            unicode(com_ann.type),
            com_ann.tail.strip()
        ]
        try:
            j_dic['comments'].append(comment)
        except KeyError:
            # 'comments' slot absent: create it on demand
            j_dic['comments'] = [
                comment,
            ]

    if ann_obj.failed_lines:
        # surface unparseable .ann lines to the user via the messager
        error_msg = 'Unable to parse the following line(s):\n%s' % (
            '\n'.join([
                (
                    '%s: %s' % (
                        # The line number is off by one
                        unicode(line_num + 1),
                        unicode(ann_obj[line_num]))).strip()
                for line_num in ann_obj.failed_lines
            ]))
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)

    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime

    try:
        # XXX avoid digging the directory from the ann_obj
        import os
        docdir = os.path.dirname(ann_obj._document)
        if options_get_validation(docdir) in (
                'all',
                'full',
        ):
            from verify_annotations import verify_annotation
            projectconf = ProjectConfiguration(docdir)
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    except Exception, e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)
    # NOTE(review): 'issues' is not consumed in the visible code --
    # presumably used further on in this function; confirm.
def _enrich_json_with_data(j_dic, ann_obj):
    # Populate j_dic (a response dict with pre-initialised list slots such
    # as 'events', 'relations', 'triggers') with JSON-serialisable
    # renderings of the annotations held by ann_obj.
    # TODO: figure out if there's a reason for all the unicode()
    # invocations here; remove if not.

    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append(
            [unicode(event_ann.id), unicode(event_ann.trigger), event_ann.args]
        )

    for rel_ann in ann_obj.get_relations():
        # relation arguments are serialised as (label, target-id) pairs
        j_dic['relations'].append(
            [unicode(rel_ann.id), unicode(rel_ann.type),
             [(rel_ann.arg1l, rel_ann.arg1), (rel_ann.arg2l, rel_ann.arg2)]]
        )

    for tb_ann in ann_obj.get_textbounds():
        #j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.start, tb_ann.end]
        j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.spans]

        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        if unicode(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
            # special case for BioNLP ST 2013 format: send triggers
            # also as entities for those triggers that are referenced
            # from annotations other than events (#926).
            if BIONLP_ST_2013_COMPATIBILITY:
                if tb_ann.id in ann_obj.externally_referenced_triggers:
                    try:
                        j_dic['entities'].append(j_tb)
                    except KeyError:
                        # 'entities' slot absent: create it on demand
                        j_dic['entities'] = [j_tb, ]
        else:
            try:
                j_dic['entities'].append(j_tb)
            except KeyError:
                # 'entities' slot absent: create it on demand
                j_dic['entities'] = [j_tb, ]

    for eq_ann in ann_obj.get_equivs():
        # equivs get a '*' placeholder in the id position
        j_dic['equivs'].append(
            (['*', eq_ann.type] + [e for e in eq_ann.entities])
        )

    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append(
            [unicode(att_ann.id), unicode(att_ann.type),
             unicode(att_ann.target), att_ann.value]
        )

    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append(
            [unicode(norm_ann.id), unicode(norm_ann.type),
             unicode(norm_ann.target), unicode(norm_ann.refdb),
             unicode(norm_ann.refid), unicode(norm_ann.reftext)]
        )

    for com_ann in ann_obj.get_oneline_comments():
        comment = [unicode(com_ann.target), unicode(com_ann.type),
                   com_ann.tail.strip()]
        try:
            j_dic['comments'].append(comment)
        except KeyError:
            # 'comments' slot absent: create it on demand
            j_dic['comments'] = [comment, ]

    if ann_obj.failed_lines:
        # surface unparseable .ann lines to the user via the messager
        error_msg = 'Unable to parse the following line(s):\n%s' % (
            '\n'.join(
                [('%s: %s' % (
                    # The line number is off by one
                    unicode(line_num + 1),
                    unicode(ann_obj[line_num])
                )).strip() for line_num in ann_obj.failed_lines])
        )
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)

    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime

    try:
        # XXX avoid digging the directory from the ann_obj
        import os
        docdir = os.path.dirname(ann_obj._document)
        if options_get_validation(docdir) in ('all', 'full', ):
            from verify_annotations import verify_annotation
            projectconf = ProjectConfiguration(docdir)
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    except Exception, e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)
    # NOTE(review): 'issues' is not consumed in the visible code --
    # presumably used further on in this function; confirm.
def get_statistics(directory, base_names, use_cache=True):
    """Return (stat_types, docstats) for the documents in *directory*.

    stat_types is a list of (column name, type) headers; docstats holds one
    row per entry in *base_names* with entity/relation/event counts (plus a
    verification-issue count when validation is enabled; -1 marks failures).
    Results are cached in a pickle file and regenerated whenever any file in
    the directory, the directory configuration, or config.py is newer than
    the cache, or when *use_cache* is False.
    """
    # Check if we have a cache of the costly statistics generation
    # Also, only use it if no file is newer than the cache itself
    cache_file_path = get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as e:
        if e.errno == 2:
            # ENOENT: no cache yet; force regeneration below
            cache_mtime = -1
        else:
            raise

    try:
        if (not isfile(cache_file_path)
                # Has config.py been changed?
                or getmtime(get_config_py_path()) > cache_mtime
                # Any file has changed in the dir since the cache was generated
                or any(True for f in listdir(directory)
                       if (getmtime(path_join(directory, f)) > cache_mtime
                           # Ignore hidden files
                           and not f.startswith('.')))
                # The configuration is newer than the cache
                or getmtime(get_config_path(directory)) > cache_mtime):
            generate = True
            docstats = []
        else:
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating'
                        % cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating'
                    % cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating'
        )
        generate = True

    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"),
                  ("Events", "int")]
    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))

    if generate:
        # Generate the document statistics from scratch
        from .annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    # sum() avoids materialising throwaway lists
                    tb_count = sum(1 for _ in ann_obj.get_entities())
                    rel_count = (sum(1 for _ in ann_obj.get_relations())
                                 + sum(1 for _ in ann_obj.get_equivs()))
                    event_count = sum(1 for _ in ann_obj.get_events())

                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except Exception:
                            # narrowed from BaseException so that e.g.
                            # KeyboardInterrupt still propagates
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append(
                            [tb_count, rel_count, event_count, issue_count])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        # Cache the statistics
        try:
            with open(cache_file_path, 'wb') as cache_file:
                pickle_dump(docstats, cache_file)
        except IOError as e:
            Messager.warning(
                "Could not write statistics cache file to directory %s: %s"
                % (directory, e))

    return stat_types, docstats
read_only=True) as ann_obj: tb_count = len([a for a in ann_obj.get_entities()]) rel_count = (len([a for a in ann_obj.get_relations()]) + len([a for a in ann_obj.get_equivs()])) event_count = len([a for a in ann_obj.get_events()]) if options_get_validation(directory) == 'none': docstats.append([tb_count, rel_count, event_count]) else: # verify and include verification issue count try: from projectconfig import ProjectConfiguration projectconf = ProjectConfiguration(directory) from verify_annotations import verify_annotation issues = verify_annotation(ann_obj, projectconf) issue_count = len(issues) except: # TODO: error reporting issue_count = -1 docstats.append([tb_count, rel_count, event_count, issue_count]) except Exception, e: log_info('Received "%s" when trying to generate stats' % e) # Pass exceptions silently, just marking stats missing docstats.append([-1] * len(stat_types)) # Cache the statistics try: with open(cache_file_path, 'wb') as cache_file: pickle_dump(docstats, cache_file) except IOError, e:
def _enrich_json_with_data(j_dic, ann_obj):
    # Populate j_dic (a response dict with pre-initialised list slots such
    # as 'events', 'relations', 'entities') with JSON-serialisable
    # renderings of the annotations held by ann_obj.
    # TODO: figure out if there's a reason for all the unicode()
    # invocations here; remove if not.

    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append(
            [unicode(event_ann.id), unicode(event_ann.trigger), event_ann.args]
        )

    for rel_ann in ann_obj.get_relations():
        # relation arguments are serialised as two bare target ids here
        j_dic['relations'].append(
            [unicode(rel_ann.id), unicode(rel_ann.type),
             rel_ann.arg1, rel_ann.arg2]
        )

    for tb_ann in ann_obj.get_textbounds():
        # textbounds are serialised with a single start/end offset pair
        j_tb = [unicode(tb_ann.id), unicode(tb_ann.type),
                tb_ann.start, tb_ann.end]

        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        if unicode(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
        else:
            j_dic['entities'].append(j_tb)

    for eq_ann in ann_obj.get_equivs():
        # equivs get a '*' placeholder in the id position
        j_dic['equivs'].append(
            (['*', eq_ann.type] + [e for e in eq_ann.entities])
        )

    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append(
            [unicode(att_ann.id), unicode(att_ann.type),
             unicode(att_ann.target), att_ann.value]
        )

    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append(
            [unicode(norm_ann.id), unicode(norm_ann.type),
             unicode(norm_ann.target), unicode(norm_ann.refdb),
             unicode(norm_ann.refid), unicode(norm_ann.reftext)]
        )

    for com_ann in ann_obj.get_oneline_comments():
        j_dic['comments'].append(
            [unicode(com_ann.target), unicode(com_ann.type),
             com_ann.tail.strip()]
        )

    if ann_obj.failed_lines:
        # surface unparseable .ann lines to the user via the messager
        error_msg = 'Unable to parse the following line(s):\n%s' % (
            '\n'.join(
                [('%s: %s' % (
                    # The line number is off by one
                    unicode(line_num + 1),
                    unicode(ann_obj[line_num])
                )).strip() for line_num in ann_obj.failed_lines])
        )
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)

    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime

    try:
        if PERFORM_VERIFICATION:
            # XXX avoid digging the directory from the ann_obj
            import os
            docdir = os.path.dirname(ann_obj._document)
            projectconf = ProjectConfiguration(docdir)
            from verify_annotations import verify_annotation
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    except Exception, e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)
    # NOTE(review): 'issues' is not consumed in the visible code --
    # presumably used further on in this function; confirm.
def _enrich_json_with_data(j_dic, ann_obj):
    # Populate j_dic (a response dict with pre-initialised list slots such
    # as 'events', 'relations', 'entities') with JSON-serialisable
    # renderings of the annotations held by ann_obj.
    # TODO: figure out if there's a reason for all the unicode()
    # invocations here; remove if not.

    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append([
            unicode(event_ann.id),
            unicode(event_ann.trigger), event_ann.args
        ])

    for rel_ann in ann_obj.get_relations():
        # relation arguments are serialised as two bare target ids here
        j_dic['relations'].append([
            unicode(rel_ann.id),
            unicode(rel_ann.type), rel_ann.arg1, rel_ann.arg2
        ])

    for tb_ann in ann_obj.get_textbounds():
        # textbounds are serialised with a single start/end offset pair
        j_tb = [
            unicode(tb_ann.id),
            unicode(tb_ann.type), tb_ann.start, tb_ann.end
        ]

        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        if unicode(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
        else:
            j_dic['entities'].append(j_tb)

    for eq_ann in ann_obj.get_equivs():
        # equivs get a '*' placeholder in the id position
        j_dic['equivs'].append(
            (['*', eq_ann.type] + [e for e in eq_ann.entities]))

    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append([
            unicode(att_ann.id),
            unicode(att_ann.type),
            unicode(att_ann.target), att_ann.value
        ])

    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append([
            unicode(norm_ann.id),
            unicode(norm_ann.type),
            unicode(norm_ann.target),
            unicode(norm_ann.refdb),
            unicode(norm_ann.refid),
            unicode(norm_ann.reftext)
        ])

    for com_ann in ann_obj.get_oneline_comments():
        j_dic['comments'].append([
            unicode(com_ann.target),
            unicode(com_ann.type),
            com_ann.tail.strip()
        ])

    if ann_obj.failed_lines:
        # surface unparseable .ann lines to the user via the messager
        error_msg = 'Unable to parse the following line(s):\n%s' % (
            '\n'.join([
                (
                    '%s: %s' % (
                        # The line number is off by one
                        unicode(line_num + 1),
                        unicode(ann_obj[line_num]))).strip()
                for line_num in ann_obj.failed_lines
            ]))
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)

    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime

    try:
        if PERFORM_VERIFICATION:
            # XXX avoid digging the directory from the ann_obj
            import os
            docdir = os.path.dirname(ann_obj._document)
            projectconf = ProjectConfiguration(docdir)
            from verify_annotations import verify_annotation
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    except Exception, e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)
    # NOTE(review): 'issues' is not consumed in the visible code --
    # presumably used further on in this function; confirm.
def get_statistics(directory, base_names, use_cache=True):
    """Return (stat_types, docstats) for the documents in *directory*.

    stat_types is a list of (column name, type) headers; docstats holds one
    row per entry in *base_names* with entity/relation/event counts (plus a
    verification-issue count when validation is enabled; -1 marks failures).
    Results are cached in a pickle file and regenerated whenever any file in
    the directory, the directory configuration, or config.py is newer than
    the cache, or when *use_cache* is False.
    """
    # Check if we have a cache of the costly statistics generation
    # Also, only use it if no file is newer than the cache itself
    cache_file_path = get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as e:
        if e.errno == 2:
            # ENOENT: no cache yet; force regeneration below
            cache_mtime = -1
        else:
            raise

    try:
        if (not isfile(cache_file_path)
                # Has config.py been changed?
                or getmtime(get_config_py_path()) > cache_mtime
                # Any file has changed in the dir since the cache was generated
                or any(True for f in listdir(directory)
                       if (getmtime(path_join(directory, f)) > cache_mtime
                           # Ignore hidden files
                           and not f.startswith('.')))
                # The configuration is newer than the cache
                or getmtime(get_config_path(directory)) > cache_mtime):
            generate = True
            docstats = []
        else:
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating'
                        % cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating'
                    % cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating')
        generate = True

    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"),
                  ("Events", "int")]
    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))

    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    # sum() avoids materialising throwaway lists
                    tb_count = sum(1 for _ in ann_obj.get_entities())
                    rel_count = (sum(1 for _ in ann_obj.get_relations())
                                 + sum(1 for _ in ann_obj.get_equivs()))
                    event_count = sum(1 for _ in ann_obj.get_events())

                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except Exception:
                            # narrowed from BaseException so that e.g.
                            # KeyboardInterrupt still propagates
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append(
                            [tb_count, rel_count, event_count, issue_count])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        # Cache the statistics
        try:
            with open(cache_file_path, 'wb') as cache_file:
                pickle_dump(docstats, cache_file)
        except IOError as e:
            Messager.warning(
                "Could not write statistics cache file to directory %s: %s"
                % (directory, e))

    return stat_types, docstats