def __create_span(ann_obj, mods, type, start, end, txt_file_path,
                  projectconf, attributes):
    # TODO: Rip this out!
    start = int(start)
    end = int(end)

    # Before we add a new trigger, does it already exist?
    found = None
    for tb_ann in ann_obj.get_textbounds():
        try:
            if (tb_ann.start == start and tb_ann.end == end
                    and tb_ann.type == type):
                found = tb_ann
                break
        except AttributeError:
            # Not a trigger then
            pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T')  # XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()[start:end]

        # TODO: Data tail should be optional
        if '\n' not in text:
            ann = TextBoundAnnotationWithText(start, end, new_id, type, text)
            ann_obj.add_annotation(ann)
            mods.addition(ann)
        else:
            ann = None
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E')  # XXX: Cons
            event = EventAnnotation(ann.id, [], str(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
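

# open_textfile is a helper assumed by __create_span; it opens document
# text with a fixed encoding so that the integer character offsets above
# index the decoded text consistently. A minimal sketch, assuming UTF-8:
import codecs


def open_textfile(filename, mode='r'):
    # Strict UTF-8: offsets are defined over decoded characters.
    return codecs.open(filename, mode, encoding='utf-8', errors='strict')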


def __create_span(ann_obj, mods, type, offsets, txt_file_path, projectconf,
                  attributes):
    # For event types, reuse trigger if a matching one exists.
    found = None
    if projectconf.is_event_type(type):
        for tb_ann in ann_obj.get_textbounds():
            try:
                if (_offsets_equal(tb_ann.spans, offsets)
                        and tb_ann.type == type):
                    found = tb_ann
                    break
            except AttributeError:
                # Not a trigger then
                pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T')  # XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()
            text_span = _text_for_offsets(text, offsets)

        # The below code resolves cases where there are newlines in the
        # offsets by creating discontinuous annotations for each span
        # separated by newlines. For most cases it preserves the offsets.
        seg_offsets = []
        for o_start, o_end in offsets:
            pos = o_start
            for text_seg in text_span.split('\n'):
                if not text_seg and o_start != o_end:
                    # Double new-line, skip ahead
                    pos += 1
                    continue
                start = pos
                end = start + len(text_seg)

                # For the next iteration the position is after the newline.
                pos = end + 1

                # Adjust the offsets to compensate for any potential leading
                # and trailing whitespace.
                start += len(text_seg) - len(text_seg.lstrip())
                end -= len(text_seg) - len(text_seg.rstrip())

                # If there is any segment left, add it to the offsets.
                if start != end:
                    seg_offsets.append((start, end))

        # if we're dealing with a null-span
        if not seg_offsets:
            seg_offsets = offsets

        ann_text = DISCONT_SEP.join(text[start:end]
                                    for start, end in seg_offsets)
        ann = TextBoundAnnotationWithText(seg_offsets, new_id, type, ann_text)
        ann_obj.add_annotation(ann)
        mods.addition(ann)
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E')  # XXX: Cons
            event = EventAnnotation(
                ann.id, [], str(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
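

# _offsets_equal and _text_for_offsets are helpers assumed by __create_span;
# minimal sketches under plausible assumptions follow. The exact originals
# may differ (e.g. in error handling), and the DISCONT_SEP value (a single
# space) is an assumption as well.

DISCONT_SEP = ' '  # assumed separator between fragments of discontinuous spans


def _offsets_equal(o1, o2):
    # Treat two offset lists as equal if they cover exactly the same
    # characters, however the (start, end) pairs happen to be split.
    def covered(offsets):
        chars = set()
        for start, end in offsets:
            chars.update(range(int(start), int(end)))
        return chars
    return covered(o1) == covered(o2)


def _text_for_offsets(text, offsets):
    # Catenate the document text of each span in order, joined by the
    # discontinuous-span separator.
    return DISCONT_SEP.join(text[int(start):int(end)]
                            for start, end in sorted(offsets))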


def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))

    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    with TextAnnotations(path_join(real_directory(collection),
                                   document)) as ann_obj:
        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            # missing if you roll your own Python, for once we should not
            # fail early since tagging is currently an edge case and we
            # can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                # The body carries the document path and the document text,
                # separated by the '#*^$#' marker.
                # Note: Trout slapping for anyone sending Unicode objects here
                data = str(path_join(real_directory(collection), document)) \
                    + "#*^$#" + ann_obj.get_document_text()
                data = data.encode('utf-8')
                req_headers['Content-length'] = len(data)
                conn.request(
                    'POST',
                    # As per: http://bugs.python.org/issue11898
                    # Force the url to be an ascii string
                    str(service_url),
                    data,
                    headers=req_headers)
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)

            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))

            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()
        cidmap = {}

        for cid, ann in ((i, a) for i, a in json_resp.items()
                         if _is_textbound(a)):
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(offsets), (
                'Tagger response has different numbers of offsets and texts')

            text = texts[0]

            _id = ann_obj.get_new_id('T')
            cidmap[cid] = _id

            tb = TextBoundAnnotationWithText(
                offsets, _id, _type, text, " " + ' '.join(texts[1:]))
            mods.addition(tb)
            ann_obj.add_annotation(tb)

        for norm in (a for a in json_resp.values()
                     if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError:
                raise  # TODO

            _id = ann_obj.get_new_id('N')
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')
            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
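

# For reference, the shape of tagger response the loops above consume: a
# JSON object mapping tagger-side ids to annotation objects. Exactly which
# objects _is_textbound() and _is_normalization() accept is not shown here;
# the ids and values below are purely illustrative.
_EXAMPLE_TAGGER_RESPONSE = {
    'T1': {                       # a textbound annotation
        'type': 'Protein',
        'offsets': [[0, 3]],      # list of (start, end) pairs
        'texts': ['p53'],         # one text per offset pair
    },
    'N1': {                       # a normalization annotation
        'type': 'Reference',
        'target': 'T1',           # tagger-side id, remapped through cidmap
        'refdb': 'UniProt',
        'refid': 'P04637',
    },
}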


            texts = ann_data['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(offsets), (
                'Tagger response has different numbers of offsets and texts')
            # Note: We do not support discontinuous spans at this point
            assert len(offsets) < 2, ('Tagger response has multiple offsets '
                                      '(discontinuous spans not supported)')

            start, end = offsets[0]
            text = texts[0]

            _id = ann_obj.get_new_id('T')

            tb = TextBoundAnnotationWithText(start, end, _id, _type, text)
            mods.addition(tb)
            ann_obj.add_annotation(tb)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp


if __name__ == '__main__':
    # Silly test, but helps
    tag('/BioNLP-ST_2011_ID_devel', 'PMC1874608-01-INTRODUCTION', 'random')
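

# A hedged test aid: a stub tagger service speaking the protocol tag()
# expects (plain-text POST body in, JSON annotations out). The host, port
# and the single annotation returned are all made up for illustration;
# point the tagger's configured service URL at it to exercise tag() locally.
from http.server import BaseHTTPRequestHandler, HTTPServer
from json import dumps


class _StubTaggerHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # Consume the request body (document path, marker, document text).
        self.rfile.read(int(self.headers['Content-Length']))
        body = dumps({
            'T1': {
                'type': 'Entity',
                'offsets': [[0, 4]],
                'texts': ['stub'],
            },
        }).encode('utf-8')
        self.send_response(200)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)


if __name__ == '__main__':
    # Hypothetical port; match it to the tagger's configured service URL.
    HTTPServer(('localhost', 47111), _StubTaggerHandler).serve_forever()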


def __create_span(ann_obj, mods, type, offsets, txt_file_path, projectconf,
                  attributes):
    # For event types, reuse trigger if a matching one exists.
    found = None
    if projectconf.is_event_type(type):
        for tb_ann in ann_obj.get_textbounds():
            try:
                if (_offsets_equal(tb_ann.spans, offsets)
                        and tb_ann.type == type):
                    found = tb_ann
                    break
            except AttributeError:
                # Not a trigger then
                pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T')  # XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            # TODO discont: use offsets instead (note need for int conversion)
            text = _text_for_offsets(txt_file.read(), offsets)

        # The below code resolves cases where there are newlines in the
        # offsets by creating discontinuous annotations for each span
        # separated by newlines. For most cases it preserves the offsets.
        seg_offsets = []
        for o_start, o_end in offsets:
            pos = o_start
            for text_seg in text.split('\n'):
                if not text_seg and o_start != o_end:
                    # Double new-line, skip ahead
                    pos += 1
                    continue
                end = pos + len(text_seg)
                seg_offsets.append((pos, end))
                # Our current position is after the newline
                pos = end + 1

        ann = TextBoundAnnotationWithText(
            seg_offsets, new_id, type,
            # Replace any newlines with the discontinuous separator
            MUL_NL_REGEX.sub(DISCONT_SEP, text))
        ann_obj.add_annotation(ann)
        mods.addition(ann)
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E')  # XXX: Cons
            event = EventAnnotation(ann.id, [], str(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
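

# MUL_NL_REGEX is assumed to match the newline runs that the call above
# replaces with DISCONT_SEP; a minimal sketch, with the pattern an assumption:
import re

MUL_NL_REGEX = re.compile(r'\n+')  # one or more consecutive newlines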


            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(offsets), (
                'Tagger response has different numbers of offsets and texts')

            text = texts[0]

            _id = ann_obj.get_new_id('T')
            cidmap[cid] = _id

            tb = TextBoundAnnotationWithText(
                offsets, _id, _type, text, " " + ' '.join(texts[1:]))
            mods.addition(tb)
            ann_obj.add_annotation(tb)

        for norm in (a for a in json_resp.values()
                     if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError:
                raise  # TODO

            _id = ann_obj.get_new_id('N')


def make_annotation(doc, accu):
    # Span the accumulated tokens: from the start offset of the first token
    # (accu[0][3]) to the end offset of the last (accu[-1][4]).
    spans = [(accu[0][3], accu[-1][4])]
    label = accu[0][2] or "Entity"
    text = doc.get_document_text()[spans[0][0]:spans[0][1]]
    ann = TextBoundAnnotationWithText(spans, doc.get_new_id('T'), label, text)
    doc.add_annotation(ann)
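

# A hedged usage sketch: accu is assumed to hold token tuples of the form
# (token, tag, label, start, end), so that accu[0][3] and accu[-1][4] are
# the first start and last end offsets. The document path and the import
# path for TextAnnotations are hypothetical.
if __name__ == '__main__':
    from annotation import TextAnnotations  # import path is an assumption
    with TextAnnotations('/path/to/document') as doc:  # hypothetical path
        accu = [('p53', 'NN', 'Protein', 0, 3),
                ('protein', 'NN', 'Protein', 4, 11)]
        make_annotation(doc, accu)  # one span over characters 0..11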