Python re_split_offsets Examples, main.utils.re_split_offsets Python Examples

Example #1

0

Show file

File: conftest.py Project: harvard-lil/h2o

    def factory(resource_type, html, casebook=None, ordinals=None):
        """
            Build a casebook resource from bracket-annotated html and create one
            annotation per ``[kind ...]`` / ``[/kind]`` marker pair.

            resource_type: 'TextBlock' or 'LegalDocument'; selects the factory used
                to create the resource target (any other value raises KeyError).
            html: html content containing annotation markers in square brackets.
            casebook: existing casebook to attach to. When falsy, a new casebook
                with one section is created and ``ordinals`` is set to [1, 1].
            ordinals: ordinals passed to ResourceFactory (ignored when a casebook
                is created here).

            Returns a ``(casebook, resource)`` tuple.
            Raises Exception if an opening marker has no matching closing marker.
        """
        # create casebook, resource, resource_target, and annotations
        if not casebook:
            casebook = CasebookFactory()
            SectionFactory(casebook=casebook,
                           ordinals=[1],
                           display_ordinals=[1])
            ordinals = [1, 1]
        target_factories = {
            'TextBlock': TextBlockFactory,
            'LegalDocument': LegalDocumentFactory,
        }
        resource_target = target_factories[resource_type](content=html)
        resource = ResourceFactory(casebook=casebook,
                                   ordinals=ordinals,
                                   display_ordinals=ordinals,
                                   resource_type=resource_type,
                                   resource=resource_target)

        # retrieve the processed, cleansed html of the saved resource
        processed_html = resource.resource.content

        # strip html tags, break apart the text, and get annotation brackets and offsets
        # (the first returned value, the text fragments, is not needed here)
        text = re.sub(r'<[^>]+?>', '', processed_html)
        _, annotation_offsets, annotation_strs = re_split_offsets(
            r'\[/?.+?\]', text)

        # resave the resource's content with the annotating brackets stripped
        # NOTE(review): this pattern also removes an empty '[]', which the marker
        # pattern above does not match the same way -- confirm inputs never contain it.
        resource.resource.content = re.sub(r'\[.*?\]', '', processed_html)
        resource.resource.save()

        # create each annotation. to support overlapping annotations, pop off each starting annotation and then find
        # the nearest ending annotation:
        annotations = list(zip(annotation_strs, annotation_offsets))
        while annotations:
            annotation_str, annotation_offset = annotations.pop(0)
            # "[kind some text]" -> kind="kind", content="some text"; "[kind]" -> content=None
            kind, content = (annotation_str[1:-1].split(" ", 1) + [None])[:2]
            closing_annotation_str = f'[/{kind}]'
            closing_index = next(
                (i for i, pair in enumerate(annotations)
                 if pair[0] == closing_annotation_str),
                None)
            if closing_index is None:
                raise Exception(
                    f"Closing annotation {closing_annotation_str} not found.")
            _, closing_annotation_offset = annotations.pop(closing_index)
            ContentAnnotationFactory(
                resource=resource,
                kind=kind,
                content=content,
                global_start_offset=annotation_offset,
                global_end_offset=closing_annotation_offset)

        return casebook, resource

Example #2

0

Show file

    def factory(resource_type, html, casebook=None, ordinals=None):
        """
            Create a resource of the given type ('Case' or 'TextBlock') from html
            that contains [kind ...] / [/kind] annotation markers, and create one
            annotation for each marker pair.

            When no casebook is supplied, a new casebook with a single section is
            created and the resource is placed at ordinals [1, 1].

            Returns (casebook, resource). Raises Exception when an opening marker
            has no matching closing marker.
        """
        # the saved content has the bracket markers removed; marker offsets are
        # computed against the text with html tags removed
        bracket_free = re.sub(r'\[.*?\]', '', html)
        tag_free = re.sub(r'<[^>]+?>', '', html)
        _, marker_offsets, marker_strs = re_split_offsets(r'\[/?.+?\]', tag_free)

        # set up the casebook (if needed), the resource target, and the resource
        if not casebook:
            casebook = CasebookFactory()
            SectionFactory(casebook=casebook, ordinals=[1])
            ordinals = [1, 1]
        target_factories = {
            'Case': CaseFactory,
            'TextBlock': TextBlockFactory,
        }
        resource_target = target_factories[resource_type](content=bracket_free)
        resource = ResourceFactory(
            casebook=casebook,
            ordinals=ordinals,
            resource_type=resource_type,
            resource_id=resource_target.id,
        )

        # take the first remaining marker as an opener and pair it with the
        # nearest later closer of the same kind, so annotations may overlap
        pending = list(zip(marker_strs, marker_offsets))
        while pending:
            opener, start_offset = pending.pop(0)
            pieces = opener[1:-1].split(" ", 1)
            kind = pieces[0]
            note = pieces[1] if len(pieces) > 1 else None
            closer = '[/%s]' % kind
            try:
                closer_position = next(
                    position for position, entry in enumerate(pending)
                    if entry[0] == closer)
            except StopIteration:
                raise Exception("Closing annotation %s not found." % closer)
            end_offset = pending.pop(closer_position)[1]
            ContentAnnotationFactory(
                resource=resource,
                kind=kind,
                content=note,
                global_start_offset=start_offset,
                global_end_offset=end_offset,
            )

        return casebook, resource

Example #3

0

Show file

File: test_helpers.py Project: holtchesley/h2o

def dump_annotated_text(case_or_textblock):
    """
        Rebuild the html of an annotated Case or TextBlock, with each annotation
        shown as a [kind] ... [/kind] bracket pair. Example:

        >>> annotations_factory, *_ = [getfixture(f) for f in ['annotations_factory']]
        >>> html = '<p>[replace]This[/replace] [highlight]is[/highlight] [elide]a[/elide] [note]case[/note].</p>'
        >>> casebook, case = annotations_factory('Case', html)
        >>> assert dump_annotated_text(case) == html
    """
    # split the stored content on html tags; each tag is paired with its offset
    fragments, tag_offsets, html_tags = re_split_offsets(r'<[^>]+?>', case_or_textblock.resource.content)
    insertions = list(zip(tag_offsets, html_tags))

    # add an opening and a closing bracket marker for every annotation with a
    # non-negative start offset
    for annotation in case_or_textblock.annotations.filter(global_start_offset__gte=0):
        insertions.append((annotation.global_start_offset, '[%s]' % annotation.kind))
        insertions.append((annotation.global_end_offset, '[/%s]' % annotation.kind))

    # insert from the highest offset down so earlier offsets remain valid
    insertions.sort(reverse=True)
    content = "".join(fragments)
    for position, snippet in insertions:
        content = content[:position] + snippet + content[position:]
    return content