def factory(resource_type, html, casebook=None, ordinals=None):
    """
        Create a resource from bracket-annotated html, plus one annotation per bracket pair.

        Annotations are written inline as `[kind]...[/kind]` or `[kind some content]...[/kind]`.
        The brackets are stripped from the saved resource content, and each pair becomes a
        ContentAnnotationFactory object whose offsets are measured against the tag-stripped text.

        Parameters:
            resource_type: 'TextBlock' or 'LegalDocument'; selects the factory for the resource target.
            html: html containing annotation brackets.
            casebook: casebook to attach the resource to. When falsy, a new casebook with a
                section at ordinals [1] is created and the resource is placed at [1, 1].
            ordinals: ordinals (also used as display_ordinals) for the resource; replaced by
                [1, 1] when a new casebook is created.

        Returns:
            A (casebook, resource) tuple.

        Raises:
            KeyError: if resource_type is not one of the supported keys.
            Exception: if an opening annotation has no matching closing annotation.
    """
    # create casebook, resource, resource_target, and annotations
    if not casebook:
        casebook = CasebookFactory()
        SectionFactory(casebook=casebook, ordinals=[1], display_ordinals=[1])
        ordinals = [1, 1]
    target_factories = {
        'TextBlock': TextBlockFactory,
        'LegalDocument': LegalDocumentFactory,
    }
    resource_target = target_factories[resource_type](content=html)
    resource = ResourceFactory(
        casebook=casebook,
        ordinals=ordinals,
        display_ordinals=ordinals,
        resource_type=resource_type,
        resource=resource_target,
    )

    # retrieve the processed, cleansed html of the saved resource, rather than
    # the html passed in, so offsets are computed against what was stored
    processed_html = resource.resource.content

    # strip html tags, break apart the text, and get annotation brackets and offsets
    text = re.sub(r'<[^>]+?>', '', processed_html)
    _, annotation_offsets, annotation_strs = re_split_offsets(r'\[/?.+?\]', text)

    # resave the resource's content with the annotating brackets stripped
    resource.resource.content = re.sub(r'\[.*?\]', '', processed_html)
    resource.resource.save()

    # create each annotation. to support overlapping annotations, pop off each starting
    # annotation and then find the nearest ending annotation:
    pending = list(zip(annotation_strs, annotation_offsets))
    while pending:
        opening_str, start_offset = pending.pop(0)
        # "[kind content]" -> ("kind", "content"); "[kind]" -> ("kind", None)
        kind, annotation_content = (opening_str[1:-1].split(" ", 1) + [None])[:2]
        closing_str = f'[/{kind}]'
        closing_index = next(
            (i for i, (candidate, _) in enumerate(pending) if candidate == closing_str),
            None,
        )
        if closing_index is None:
            raise Exception(f"Closing annotation {closing_str} not found.")
        _, end_offset = pending.pop(closing_index)
        ContentAnnotationFactory(
            resource=resource,
            kind=kind,
            content=annotation_content,
            global_start_offset=start_offset,
            global_end_offset=end_offset,
        )
    return casebook, resource
def factory(resource_type, html, casebook=None, ordinals=None):
    """
        Create a resource from bracket-annotated html, plus one annotation per bracket pair.

        Annotations are written inline as `[kind]...[/kind]` or `[kind some content]...[/kind]`.
        The resource target is created with the brackets stripped from its content, and each
        pair becomes a ContentAnnotationFactory object whose offsets are measured against the
        tag-stripped text.

        Parameters:
            resource_type: 'Case' or 'TextBlock'; selects the factory for the resource target.
            html: html containing annotation brackets.
            casebook: casebook to attach the resource to. When falsy, a new casebook with a
                section at ordinals [1] is created and the resource is placed at [1, 1].
            ordinals: ordinals for the resource; replaced by [1, 1] when a new casebook
                is created.

        Returns:
            A (casebook, resource) tuple.

        Raises:
            KeyError: if resource_type is not one of the supported keys.
            Exception: if an opening annotation has no matching closing annotation.
    """
    # break apart provided html and get annotation brackets and offsets
    resource_content = re.sub(r'\[.*?\]', '', html)  # strip brackets
    text = re.sub(r'<[^>]+?>', '', html)  # strip html tags
    _, annotation_offsets, annotation_strs = re_split_offsets(r'\[/?.+?\]', text)

    # create casebook, resource, resource_target, and annotations
    if not casebook:
        casebook = CasebookFactory()
        SectionFactory(casebook=casebook, ordinals=[1])
        ordinals = [1, 1]
    target_factories = {
        'Case': CaseFactory,
        'TextBlock': TextBlockFactory,
    }
    resource_target = target_factories[resource_type](content=resource_content)
    resource = ResourceFactory(
        casebook=casebook,
        ordinals=ordinals,
        resource_type=resource_type,
        resource_id=resource_target.id,
    )

    # create each annotation. to support overlapping annotations, pop off each starting
    # annotation and then find the nearest ending annotation:
    pending = list(zip(annotation_strs, annotation_offsets))
    while pending:
        opening_str, start_offset = pending.pop(0)
        # "[kind content]" -> ("kind", "content"); "[kind]" -> ("kind", None)
        kind, annotation_content = (opening_str[1:-1].split(" ", 1) + [None])[:2]
        closing_str = f'[/{kind}]'
        closing_index = next(
            (i for i, (candidate, _) in enumerate(pending) if candidate == closing_str),
            None,
        )
        if closing_index is None:
            raise Exception(f"Closing annotation {closing_str} not found.")
        _, end_offset = pending.pop(closing_index)
        ContentAnnotationFactory(
            resource=resource,
            kind=kind,
            content=annotation_content,
            global_start_offset=start_offset,
            global_end_offset=end_offset,
        )
    return casebook, resource
def dump_annotated_text(case_or_textblock):
    """
        Rebuild the bracket-annotated html for a Case or TextBlock resource.

        The resource content is split on html tags; the tags and a `[kind]` / `[/kind]`
        marker pair for every annotation with a non-negative global_start_offset are then
        spliced back into the tag-free text at their offsets. Only the annotation kind is
        emitted; annotation content is not included in the brackets.

        Example:
        >>> annotations_factory, *_ = [getfixture(f) for f in ['annotations_factory']]
        >>> html = '<p>[replace]This[/replace] [highlight]is[/highlight] [elide]a[/elide] [note]case[/note].</p>'
        >>> casebook, case = annotations_factory('Case', html)
        >>> assert dump_annotated_text(case) == html
    """
    stored_html = case_or_textblock.resource.content
    plain_chunks, tag_offsets, tag_strs = re_split_offsets(r'<[^>]+?>', stored_html)

    # every (offset, string) pair that has to be spliced back into the plain text
    insertions = list(zip(tag_offsets, tag_strs))
    placed_annotations = case_or_textblock.annotations.filter(global_start_offset__gte=0)
    for placed in placed_annotations:
        insertions.append((placed.global_start_offset, '[%s]' % placed.kind))
        insertions.append((placed.global_end_offset, '[/%s]' % placed.kind))

    # splice from the highest offset down so earlier offsets stay valid
    insertions.sort(reverse=True)
    rendered = "".join(plain_chunks)
    for position, marker in insertions:
        head, tail = rendered[:position], rendered[position:]
        rendered = head + marker + tail
    return rendered