Python parse Examples

Programming Language: Python

Namespace/Package Name: philologic.runtime.FragmentParser

Class/Type: parse

Examples at hotexamples.com: 6

Python parse - 6 examples found. These are the top rated real world Python examples of philologic.runtime.FragmentParser.parse extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

iter(3)

Example #1

Show file

File: ObjectFormatter.py Project: clovis/PhiloLogic4

def format_concordance(text_in_utf8, word_regex, byte_offsets=[]):
    removed_from_start = 0
    begin = BEGIN_MATCH.search(text_in_utf8)
    if begin:
        removed_from_start = len(begin.group(0))
        text_in_utf8 = text_in_utf8[begin.end(0):]
    start_cutoff = START_CUTOFF_MATCH.search(text_in_utf8)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text_in_utf8 = text_in_utf8[start_cutoff.end(0):]
    end = END_MATCH.search(text_in_utf8)
    if end:
        text_in_utf8 = text_in_utf8[:end.start(0)]
    if byte_offsets:
        byte_offsets = [b - removed_from_start for b in byte_offsets]
        new_text = b""
        last_offset = 0
        for b in byte_offsets:
            if b > 0 and b < len(text_in_utf8):
                new_text += text_in_utf8[last_offset:b] + b"<philoHighlight/>"
                last_offset = b
        text_in_utf8 = new_text + text_in_utf8[last_offset:]
    text = text_in_utf8.decode('utf8', 'ignore')
    xml = FragmentParserParse(text)
    allowed_tags = set(['philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'])
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        if el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        if "id" in el.attrib and el.tag != "l":  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        elif el.tag == "img":  # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            word_match = re.match(word_regex, el.tail)
            if word_match:
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in VALID_HTML_TAGS:
            el = xml_to_html_class(el)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    ## remove spaces around hyphens and apostrophes
    output = SPACE_MATCH.sub('\\1', output)
    output = convert_entities(output)
    output = STRIP_START_PUNCTUATION.sub("", output)
    return output

Example #2

Show file

File: ObjectFormatter.py Project: brown-ccv/PhiloLogic4

def format_concordance(text_in_utf8, word_regex, byte_offsets=[]):
    removed_from_start = 0
    begin = BEGIN_MATCH.search(text_in_utf8)
    if begin:
        removed_from_start = len(begin.group(0))
        text_in_utf8 = text_in_utf8[begin.end(0):]
    start_cutoff = START_CUTOFF_MATCH.search(text_in_utf8)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text_in_utf8 = text_in_utf8[start_cutoff.end(0):]
    end = END_MATCH.search(text_in_utf8)
    if end:
        text_in_utf8 = text_in_utf8[:end.start(0)]
    if byte_offsets:
        byte_offsets = [b - removed_from_start for b in byte_offsets]
        new_text = b""
        last_offset = 0
        for b in byte_offsets:
            if b > 0 and b < len(text_in_utf8):
                new_text += text_in_utf8[last_offset:b] + b"<philoHighlight/>"
                last_offset = b
        text_in_utf8 = new_text + text_in_utf8[last_offset:]
    text = text_in_utf8.decode('utf8', 'ignore')
    xml = FragmentParserParse(text)
    allowed_tags = set(['philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'])
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        if el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        if "id" in el.attrib and el.tag != "l":  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        elif el.tag == "img":  # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            word_match = re.match(word_regex, el.tail)
            if word_match:
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in VALID_HTML_TAGS:
            el = xml_to_html_class(el)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    ## remove spaces around hyphens and apostrophes
    output = SPACE_MATCH.sub('\\1', output)
    output = convert_entities(output)
    output = STRIP_START_PUNCTUATION.sub("", output)
    return output

Example #3

Show file

File: ObjectFormatter.py Project: brown-ccv/PhiloLogic4

def format_text_object(obj, text, config, request, word_regex, byte_offsets=None, note=False, images=True):
    """Format text objects"""
    philo_id = obj.philo_id
    if byte_offsets is not None:
        new_text = b""
        last_offset = 0
        for b in byte_offsets:
            new_text += text[last_offset:b] + b"<philoHighlight/>"
            last_offset = b
        text = new_text + text[last_offset:]
    current_obj_img = []
    current_graphic_img = []
    text = "<div>" + text.decode('utf8', 'ignore') + "</div>"
    xml = FragmentParserParse(text)
    c = obj.db.dbh.cursor()
    for el in xml.iter():
        try:
            if el.tag.startswith("DIV"):
                el.tag = el.tag.lower()
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "table":
                el.tag = "span"
                el.attrib["class"] = "xml-table"
            elif el.tag == "ref" or el.tag == "xref":
                if el.attrib["type"] == "note" or el.attrib["type"] == "footnote":
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request, script_name="/scripts/get_notes.py", target=target)
                    if "n" in el.attrib:
                        el.text = el.attrib["n"]
                    else:
                        el.text = "*"
                    if el.text == "":
                        el.text = "*"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    el.attrib["id"] = target.replace('#', '') + '-link-back'
                    # attributes for popover note
                    el.attrib['class'] = "note-ref"
                    el.attrib['tabindex'] = "0"
                    el.attrib['data-toggle'] = "popover"
                    el.attrib['data-container'] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
                elif el.attrib["type"] == "cross":
                    c.execute("SELECT philo_id FROM toms WHERE id=? LIMIT 1", (el.attrib["target"],))
                    try:
                        object_id = c.fetchone()[0]
                    except IndexError:
                        el.tag = "span"
                        continue
                    el.tag = "a"
                    el.attrib["href"] = 'navigate/%s' % '/'.join([i for i in object_id.split() if i != "0"])
                    el.attrib["class"] = "xml-ref-cross"
                    del el.attrib["target"]
                elif el.attrib["type"] == "search":
                    metadata, metadata_value = el.attrib["target"].split(':')
                    params = {metadata: metadata_value, "report": "bibliography"}
                    el.tag = "a"
                    el.attrib["href"] = make_absolute_query_link(config, [], **params)
                    del el.attrib["target"]
            elif el.tag == "note":
                # endnotes
                in_end_note = False
                for ancestor in el.iterancestors():
                    if ancestor.tag.startswith('div'):
                        if "type" in ancestor.attrib:
                            if ancestor.attrib["type"] == "notes":
                                in_end_note = True
                                break
                if note:  # in footnote
                    el.tag = "div"
                elif in_end_note:  # in end note
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    link_back = etree.Element("a")
                    c.execute('select parent from refs where target=? and parent like ?',
                              (el.attrib['id'], str(philo_id[0]) + " %"))
                    object_id = c.fetchone()[0]
                    link_back.attrib['href'] = 'navigate/%s%s' % ('/'.join([i for i in object_id.split() if i != "0"]),
                                                                  '#%s-link-back' % el.attrib['id'])
                    link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                    link_back.attrib['role'] = "button"
                    link_back.text = "Go back to text"
                    el.append(link_back)
                else:  ## inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"

                    for child in el:
                        child = note_content(child)
                    # insert an anchor before this element by scanning through the parent
                    parent = el.getparent()
                    for i, child in enumerate(parent):
                        if child == el:
                            attribs = {"class": "note",
                                       "tabindex": "0",
                                       "data-toggle": "popover",
                                       "data-container": "body",
                                       "data-placement": "right",
                                       "data-trigger": "focus"}
                            parent.insert(i, etree.Element("a", attrib=attribs))
                            new_anchor = parent[i]
                            new_anchor.text = "note"
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "img":
                el.attrib["onerror"] = "this.style.display='none'"
            elif el.tag == "pb" and "n" in el.attrib:
                el.tag = "span"
                el.attrib["class"] = "xml-pb-image"
                if config.page_images_url_root:
                    if "facs" in el.attrib or "id" in el.attrib:
                        if "facs" in el.attrib:
                            img = el.attrib["facs"]
                        else:
                            img = el.attrib["id"]
                        current_obj_img.append(img.split()[0])
                        el.append(etree.Element("a"))
                        img_split = img.split()
                        el[-1].attrib["href"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                        if len(img_split) == 2:
                            el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[1]) + config.page_image_extension
                        else:
                            el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                        el[-1].text = "[page " + el.attrib["n"] + "]"
                        if config.external_page_images:
                            el[-1].attrib["target"] = "_blank"
                        else:
                            el[-1].attrib['class'] = "page-image-link"
                            el[-1].attrib['data-gallery'] = ''
                else:
                    if el.attrib["n"]:
                        el.text = "--%s--" % el.attrib["n"]
                    else:
                        el.text = "--na--"
                grand_parent = el.getparent().getparent()
                if grand_parent.attrib["class"] == "xml-row":
                    # Move page outside of table row to avoid display issues
                    tail = etree.Element("span")
                    tail.text = el.tail
                    el.tail = ""
                    great_grand_parent = grand_parent.getparent()
                    grand_parent_index = great_grand_parent.index(grand_parent)
                    el_index = el.getparent().index(el)
                    great_grand_parent.insert(grand_parent_index+1, el)
                    parent.insert(el_index, tail)
            if el.tag == "graphic":
                if config.page_images_url_root:
                    imgs = el.attrib["facs"].split()
                    current_graphic_img.append(imgs[0])
                    el.attrib["src"] = os.path.join(config.page_images_url_root, imgs[0])
                    el.tag = "img"
                    el.attrib["class"] = "inline-img"
                    el.attrib['data-gallery'] = ''
                    el.attrib["inline-img"] = ""
                    if len(imgs) > 1:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[1])
                    else:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[0])
                    del el.attrib["url"]
            elif el.tag == "philoHighlight":
                word_match = re.match(word_regex, el.tail, re.U)
                if word_match:
                    el.text = el.tail[:word_match.end()]
                    el.tail = el.tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in VALID_HTML_TAGS:
                el = xml_to_html_class(el)
        except Exception as exception:
            import sys
            print(exception, file=sys.stderr)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output)

    if note:  ## Notes don't need to fetch images
        return (output, {})
    if not images:
        return (output, {})

    ## Page images
    output, images = page_images(config, output, current_obj_img, current_graphic_img, philo_id)
    return output, images

Example #4

Show file

File: ObjectFormatter.py Project: clovis/PhiloLogic4

def format_text_object(obj, text, config, request, word_regex, byte_offsets=None, note=False, images=True):
    """Format text objects"""
    philo_id = obj.philo_id
    if byte_offsets is not None:
        new_text = b""
        last_offset = 0
        for b in byte_offsets:
            new_text += text[last_offset:b] + b"<philoHighlight/>"
            last_offset = b
        text = new_text + text[last_offset:]
    current_obj_img = []
    current_graphic_img = []
    text = "<div>" + text.decode('utf8', 'ignore') + "</div>"
    xml = FragmentParserParse(text)
    c = obj.db.dbh.cursor()
    for el in xml.iter():
        try:
            if el.tag.startswith("DIV"):
                el.tag = el.tag.lower()
            if el.tag == "h1" or el.tag == "h2":
                el.tag = "b"
                el.attrib["class"] = "headword"
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            if el.tag == "page":
                el.tag = "pb"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "table":
                el.tag = "span"
                el.attrib["class"] = "xml-table"
            elif el.tag == "ref" or el.tag == "xref":
                if el.attrib["type"] == "note" or el.attrib["type"] == "footnote":
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request, script_name="/scripts/get_notes.py", target=target)
                    if "n" in el.attrib:
                        el.text = el.attrib["n"]
                    else:
                        el.text = "*"
                    if el.text == "":
                        el.text = "*"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    el.attrib["id"] = target.replace('#', '') + '-link-back'
                    # attributes for popover note
                    el.attrib['class'] = "note-ref"
                    el.attrib['tabindex'] = "0"
                    el.attrib['data-toggle'] = "popover"
                    el.attrib['data-container'] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
                elif el.attrib["type"] == "cross":
                    c.execute("SELECT philo_id FROM toms WHERE id=? LIMIT 1", (el.attrib["target"],))
                    try:
                        object_id = c.fetchone()[0]
                    except IndexError:
                        el.tag = "span"
                        continue
                    el.tag = "a"
                    el.attrib["href"] = 'navigate/%s' % '/'.join([i for i in object_id.split() if i != "0"])
                    el.attrib["class"] = "xml-ref-cross"
                    del el.attrib["target"]
                elif el.attrib["type"] == "search":
                    metadata, metadata_value = el.attrib["target"].split(':')
                    params = {metadata: metadata_value, "report": "bibliography"}
                    el.tag = "a"
                    el.attrib["href"] = make_absolute_query_link(config, [], **params)
                    del el.attrib["target"]
            elif el.tag == "note":
                # endnotes
                in_end_note = False
                for ancestor in el.iterancestors():
                    if ancestor.tag.startswith('div'):
                        if "type" in ancestor.attrib:
                            if ancestor.attrib["type"] == "notes":
                                in_end_note = True
                                break
                if note:  # in footnote
                    el.tag = "div"
                elif in_end_note:  # in end note
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    link_back = etree.Element("a")
                    c.execute('select parent from refs where target=? and parent like ?',
                              (el.attrib['id'], str(philo_id[0]) + " %"))
                    object_id = c.fetchone()[0]
                    link_back.attrib['href'] = 'navigate/%s%s' % ('/'.join([i for i in object_id.split() if i != "0"]),
                                                                  '#%s-link-back' % el.attrib['id'])
                    link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                    link_back.attrib['role'] = "button"
                    link_back.text = "Go back to text"
                    el.append(link_back)
                else:  ## inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"

                    for child in el:
                        child = note_content(child)
                    # insert an anchor before this element by scanning through the parent
                    parent = el.getparent()
                    for i, child in enumerate(parent):
                        if child == el:
                            attribs = {"class": "note",
                                       "tabindex": "0",
                                       "data-toggle": "popover",
                                       "data-container": "body",
                                       "data-placement": "right",
                                       "data-trigger": "focus"}
                            parent.insert(i, etree.Element("a", attrib=attribs))
                            new_anchor = parent[i]
                            new_anchor.text = "note"
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "img":
                el.attrib["onerror"] = "this.style.display='none'"
            elif el.tag == "pb" and "n" in el.attrib:
                el.tag = "span"
                el.attrib["class"] = "xml-pb-image"
                if config.page_images_url_root and "facs" in el.attrib or "id" in el.attrib:
                    if "facs" in el.attrib:
                        img = el.attrib["facs"]
                    else:
                        img = el.attrib["id"]
                    current_obj_img.append(img.split()[0])
                    el.append(etree.Element("a"))
                    img_split = img.split()
                    el[-1].attrib["href"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                    if len(img_split) == 2:
                        el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[1]) + config.page_image_extension
                    else:
                        el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                    el[-1].text = "[page " + el.attrib["n"] + "]"
                    if config.external_page_images:
                        el[-1].attrib["target"] = "_blank"
                    else:
                        el[-1].attrib['class'] = "page-image-link"
                        el[-1].attrib['data-gallery'] = ''
                else:
                    if el.attrib["n"]:
                        el.text = "--%s--" % el.attrib["n"]
                    else:
                        el.text = "--na--"
                grand_parent = el.getparent().getparent()
                if grand_parent.attrib["class"] == "xml-row":
                    # Move page outside of table row to avoid display issues
                    tail = etree.Element("span")
                    tail.text = el.tail
                    el.tail = ""
                    great_grand_parent = grand_parent.getparent()
                    grand_parent_index = great_grand_parent.index(grand_parent)
                    el_index = el.getparent().index(el)
                    great_grand_parent.insert(grand_parent_index+1, el)
                    parent.insert(el_index, tail)
            if el.tag == "graphic":
                if config.page_images_url_root:
                    imgs = el.attrib["facs"].split()
                    current_graphic_img.append(imgs[0])
                    el.attrib["src"] = os.path.join(config.page_images_url_root, imgs[0])
                    el.tag = "img"
                    el.attrib["class"] = "inline-img"
                    el.attrib['data-gallery'] = ''
                    el.attrib["inline-img"] = ""
                    if len(imgs) > 1:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[1])
                    else:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[0])
                    del el.attrib["url"]
            elif el.tag == "ptr":
                if "facs" in el.attrib and config.page_images_url_root:
                    el.tag = "a"
                    el.attrib["href"] = os.path.join(config.page_images_url_root, el.attrib["facs"])
                    el.text = el.attrib["rend"]
                    el.attrib["external-img"] = ""
                    el.attrib["class"] = "external-img"
                    el.attrib["large-img"] = el.attrib["href"]
                    del el.attrib["rend"]
                    del el.attrib["facs"]
                    el.attrib['data-gallery'] = ''
            elif el.tag == "philoHighlight":
                word_match = re.match(word_regex, el.tail, re.U)
                if word_match:
                    el.text = el.tail[:word_match.end()]
                    el.tail = el.tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in VALID_HTML_TAGS:
                el = xml_to_html_class(el)
        except Exception as exception:
            import sys
            print(exception, file=sys.stderr)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output)

    if note:  ## Notes don't need to fetch images
        return (output, {})
    if not images:
        return (output, {})

    ## Page images
    output, images = page_images(config, output, current_obj_img, current_graphic_img, philo_id)
    return output, images

Example #5

Show file

def format_concordance(text, word_regex, bytes=[]):
    word_regex = r"\w+"  # text is converted to unicode so we use the \w boundary to match
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]
    if bytes:
        bytes = [b - removed_from_start for b in bytes]
        new_text = ""
        last_offset = 0
        for b in bytes:
            if b > 0 and b < len(text):
                new_text += text[last_offset:b] + "<philoHighlight/>"
                last_offset = b
        text = new_text + text[last_offset:]
    xml = FragmentParserParse(text)
    allowed_tags = set([
        'philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i',
        'sc', 'scx', 'br'
    ])
    text = u''
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        if el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        if "id" in el.attrib and el.tag != "l":  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        elif el.tag == "img":  # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            word_match = re.match(word_regex, el.tail, re.U)
            if word_match:
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in valid_html_tags:
            el = xml_to_html_class(el)
    output = etree.tostring(xml)
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    ## remove spaces around hyphens and apostrophes
    output = convert_entities(output)
    output = strip_start_punctuation.sub("", output)
    return output

Example #6

Show file

def format_concordance(text, word_regex, bytes=[]):
    word_regex = r"\w+"  # text is converted to unicode so we use the \w boundary to match
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]
    if bytes:
        bytes = [b - removed_from_start for b in bytes]
        new_text = ""
        last_offset = 0
        for b in bytes:
            if b > 0 and b < len(text):
                new_text += text[last_offset:b] + "<philoHighlight/>"
                last_offset = b
        text = new_text + text[last_offset:]
    xml = FragmentParserParse(text)
    allowed_tags = set(['philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'])
    text = u''
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        if el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        if "id" in el.attrib and el.tag != "l":  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        elif el.tag == "img":  # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            word_match = re.match(word_regex, el.tail, re.U)
            if word_match:
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in valid_html_tags:
            el = xml_to_html_class(el)
    output = etree.tostring(xml)
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    ## remove spaces around hyphens and apostrophes
    output = space_match.sub('\\1', output)
    output = convert_entities(output)
    output = strip_start_punctuation.sub("", output)
    return output