def _text_for_offsets(text, offsets):
    """
    Return the text covered by a list of (start, end) integer offsets.

    Each offset pair selects the slice text[start:end]; the slices are
    catenated with DISCONT_SEP so that the result can be used as the
    text of a TextBoundAnnotation(WithText).

    If slicing or joining fails for any reason, the failure is reported
    through Messager.error and ProtocolArgumentError is raised.
    """
    try:
        # Collect one fragment per offset pair, in the order given.
        fragments = [text[begin:finish] for begin, finish in offsets]
        return DISCONT_SEP.join(fragments)
    except Exception:
        # Anything going wrong here is treated as a bad argument from
        # the caller: report it, then signal a protocol-level error.
        Messager.error('_text_for_offsets: failed to get text for given offsets (%s)' % str(offsets))
        raise ProtocolArgumentError
def _text_for_offsets(text, offsets):
    """Build the annotation text for a list of (start, end) offsets.

    The slice text[start:end] is taken for every offset pair and the
    slices are joined with DISCONT_SEP, giving text suitable for a
    TextBoundAnnotation(WithText).

    Any failure while slicing or joining is reported via Messager.error
    and then surfaces to the caller as ProtocolArgumentError.
    """
    try:
        pieces = []
        for first, last in offsets:
            pieces.append(text[first:last])
        return DISCONT_SEP.join(pieces)
    except Exception:
        # Report the offending offsets before raising the protocol error.
        failure = ('_text_for_offsets: failed to get text for given offsets (%s)'
                   % str(offsets))
        Messager.error(failure)
        raise ProtocolArgumentError
def __create_span(ann_obj, mods, type, offsets, txt_file_path, projectconf,
                  attributes):
    """
    Create (or reuse) a text-bound annotation for the given offsets and,
    unless the type is a physical entity type, an event referring to it.

    Arguments:
    ann_obj -- annotation container; this function calls its
        get_textbounds(), get_new_id() and add_annotation() methods.
    mods -- modification tracker; every annotation added to ann_obj here
        is also passed to mods.addition().
    type -- the annotation type (the name shadows the builtin, but is
        kept so that the signature stays unchanged for callers).
    offsets -- list of (start, end) integer offsets into the text; more
        than one pair describes a discontinuous span.
    txt_file_path -- path of the text file that the offsets refer to.
    projectconf -- project configuration, queried through
        is_event_type() and is_physical_entity_type().
    attributes -- accepted for interface compatibility; not used here.

    Returns a pair (ann, event). ann is the text-bound annotation that
    was created, or the existing one that was reused for an event type.
    event is the newly created EventAnnotation, or None when the type is
    a physical entity type.

    Errors raised by _text_for_offsets for unusable offsets propagate to
    the caller.
    """
    # For event types, reuse trigger if a matching one exists.
    found = None
    if projectconf.is_event_type(type):
        for tb_ann in ann_obj.get_textbounds():
            try:
                if (_offsets_equal(tb_ann.spans, offsets)
                        and tb_ann.type == type):
                    found = tb_ann
                    break
            except AttributeError:
                # Not a trigger then
                pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T')  # XXX: Cons
        # Get the text
        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()
            # Called for its validation effect only: it reports and
            # raises if the offsets cannot be applied to the text. The
            # per-fragment text is sliced separately below.
            _text_for_offsets(text, offsets)

        # The below code resolves cases where there are newlines in the
        # offsets by creating discontinuous annotations for each span
        # separated by newlines. For most cases it preserves the offsets.
        seg_offsets = []
        for o_start, o_end in offsets:
            pos = o_start
            # Split only the text of the current fragment. Splitting the
            # catenated text of all fragments would make every fragment
            # of a discontinuous span walk the whole joined text from its
            # own start, yielding wrong and overlapping offsets.
            for text_seg in text[o_start:o_end].split('\n'):
                if not text_seg and o_start != o_end:
                    # Double new-line, skip ahead
                    pos += 1
                    continue
                start = pos
                end = start + len(text_seg)

                # For the next iteration the position is after the newline.
                pos = end + 1

                # Adjust the offsets to compensate for any potential
                # leading and trailing whitespace.
                start += len(text_seg) - len(text_seg.lstrip())
                end -= len(text_seg) - len(text_seg.rstrip())

                # If there is any segment left, add it to the offsets.
                if start != end:
                    seg_offsets.append((start, end))

        # if we're dealing with a null-span
        if not seg_offsets:
            seg_offsets = offsets

        ann_text = DISCONT_SEP.join(text[start:end]
                                    for start, end in seg_offsets)
        ann = TextBoundAnnotationWithText(seg_offsets, new_id, type, ann_text)
        ann_obj.add_annotation(ann)
        mods.addition(ann)
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E')  # XXX: Cons
            event = EventAnnotation(
                ann.id, [], str(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action.
        # NOTE(review): both branches above always bind ann to a
        # non-None value, so this branch looks unreachable; it is kept
        # as a defensive fallback.
        event = None

    return ann, event