Example #1
def format_strip(text, word_regex, byte_offsets=None):
    """Remove formatting from a raw text fragment for HTML rendering.

    Called from KWIC only.

    Args:
        text: fragment as UTF-8 encoded bytes. A leading span matched by
            BEGIN_MATCH and START_CUTOFF_MATCH and a trailing span matched
            by END_MATCH are trimmed off before parsing.
        word_regex: forwarded unchanged to clean_tags.
        byte_offsets: optional iterable of byte positions, relative to the
            untrimmed ``text``, at which a ``<philoHighlight/>`` marker is
            inserted. ``None`` disables marker insertion.

    Returns:
        The result of clean_tags on the parsed fragment, after the
        SPACE_MATCH substitution.
    """
    removed_from_start = 0
    begin = BEGIN_MATCH.search(text)
    if begin:
        # The slice below drops everything up to the END of the match, so the
        # offsets must shift by end(0); len(group(0)) is only the same number
        # when the match happens to start at position 0.
        removed_from_start = begin.end(0)
        text = text[begin.end(0):]
    start_cutoff = START_CUTOFF_MATCH.search(text)
    if start_cutoff:
        removed_from_start += start_cutoff.end(0)
        text = text[start_cutoff.end(0):]
    end = END_MATCH.search(text)
    if end:
        text = text[:end.start(0)]
    if byte_offsets is not None:
        text_length = len(text)
        pieces = []
        last_offset = 0
        for offset in byte_offsets:
            offset -= removed_from_start
            # Offsets that fell into a trimmed region (or on an edge) are skipped.
            if 0 < offset < text_length:
                pieces.append(text[last_offset:offset])
                pieces.append(b"<philoHighlight/>")
                last_offset = offset
        pieces.append(text[last_offset:])
        # Single join instead of repeated bytes concatenation.
        text = b"".join(pieces)
    xml = FragmentParserParse(text.decode('utf8', 'ignore'))
    output = clean_tags(xml, word_regex)
    ## remove spaces around hyphens and apostrophes
    output = SPACE_MATCH.sub('\\1', output)
    return output
Example #2
def format_strip(text, byte_offsets=None):
    """Remove formatting from a raw text fragment for HTML rendering.

    Called from: kwic.py

    Args:
        text: fragment as a string. A leading span matched by begin_match
            and start_cutoff_match and a trailing span matched by end_match
            are trimmed off before parsing.
        byte_offsets: optional iterable of positions, relative to the
            untrimmed ``text``, at which a ``<philoHighlight/>`` marker is
            inserted. ``None`` disables marker insertion.

    Returns:
        The result of clean_tags on the parsed fragment, after the
        space_match substitution.
    """
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        # The slice below drops everything up to the END of the match, so the
        # offsets must shift by end(0); len(group(0)) is only the same number
        # when the match happens to start at position 0.
        removed_from_start = begin.end(0)
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += start_cutoff.end(0)
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]
    if byte_offsets is not None:
        text_length = len(text)
        pieces = []
        last_offset = 0
        for offset in byte_offsets:
            offset -= removed_from_start
            # Offsets that fell into a trimmed region (or on an edge) are skipped.
            if 0 < offset < text_length:
                pieces.append(text[last_offset:offset])
                pieces.append("<philoHighlight/>")
                last_offset = offset
        pieces.append(text[last_offset:])
        # Single join instead of repeated string concatenation.
        text = "".join(pieces)
    xml = FragmentParserParse(text)
    output = clean_tags(xml)
    ## remove spaces around hyphens and apostrophes
    output = space_match.sub('\\1', output)
    return output
Example #3
def format_concordance(text_in_utf8, word_regex, byte_offsets=None):
    """Turn a raw concordance fragment into an HTML snippet with highlighted hits.

    Args:
        text_in_utf8: fragment as UTF-8 encoded bytes. A leading span matched
            by BEGIN_MATCH and START_CUTOFF_MATCH and a trailing span matched
            by END_MATCH are trimmed off before parsing.
        word_regex: pattern applied with ``re.match`` to the text following
            each highlight marker; the matched prefix becomes the content of
            the highlight span.
        byte_offsets: optional iterable of byte positions, relative to the
            untrimmed fragment, at which hits start. A falsy value (the
            default) means no highlighting.

    Returns:
        The serialised fragment as a string, with the wrapping
        ``philologic-fragment`` div removed, entities converted and leading
        punctuation stripped.
    """
    removed_from_start = 0
    begin = BEGIN_MATCH.search(text_in_utf8)
    if begin:
        # Shift by the end of the match: that is exactly how much the slice
        # below removes, even if the match does not start at position 0.
        removed_from_start = begin.end(0)
        text_in_utf8 = text_in_utf8[begin.end(0):]
    start_cutoff = START_CUTOFF_MATCH.search(text_in_utf8)
    if start_cutoff:
        removed_from_start += start_cutoff.end(0)
        text_in_utf8 = text_in_utf8[start_cutoff.end(0):]
    end = END_MATCH.search(text_in_utf8)
    if end:
        text_in_utf8 = text_in_utf8[:end.start(0)]
    if byte_offsets:
        text_length = len(text_in_utf8)
        pieces = []
        last_offset = 0
        for offset in byte_offsets:
            offset -= removed_from_start
            # Offsets that fell into a trimmed region (or on an edge) are skipped.
            if 0 < offset < text_length:
                pieces.append(text_in_utf8[last_offset:offset])
                pieces.append(b"<philoHighlight/>")
                last_offset = offset
        pieces.append(text_in_utf8[last_offset:])
        text_in_utf8 = b"".join(pieces)
    text = text_in_utf8.decode('utf8', 'ignore')
    xml = FragmentParserParse(text)
    allowed_tags = {'philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'}
    # img elements are detached only after the walk: removing a node while
    # xml.iter() is positioned on it would disturb the traversal.
    images_to_drop = []
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        # title / q / img are not in allowed_tags, so they have to be tested
        # before the catch-all below; otherwise these branches never run.
        if el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        elif el.tag == "img":  # Remove img elements from parent in concordances
            images_to_drop.append(el)
            continue
        elif el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        if "id" in el.attrib and el.tag != "l":  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        if el.tag == "philoHighlight":
            # A marker at the very end of its parent has no tail (None).
            tail = el.tail or ""
            word_match = re.match(word_regex, tail)
            if word_match:
                el.text = tail[:word_match.end()]
                el.tail = tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in VALID_HTML_TAGS:
            el = xml_to_html_class(el)
    for img in images_to_drop:
        parent = img.getparent()
        if parent is None:
            continue
        if img.tail:
            # remove() discards the tail along with the element, so move the
            # trailing text onto the preceding node first.
            previous = img.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or "") + img.tail
            else:
                parent.text = (parent.text or "") + img.tail
        parent.remove(img)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    ## remove spaces around hyphens and apostrophes
    output = SPACE_MATCH.sub('\\1', output)
    output = convert_entities(output)
    output = STRIP_START_PUNCTUATION.sub("", output)
    return output
Example #4
def format_text_object(obj, text, config, request, word_regex, byte_offsets=None, note=False, images=True):
    """Convert the raw markup of a text object into HTML-oriented markup.

    The fragment is parsed with FragmentParserParse, every element of the
    resulting tree is retagged / decorated in place, and the tree is
    serialised back to a string.

    Args:
        obj: object exposing ``philo_id`` and ``db.dbh`` (a handle whose
            cursor is used for cross-reference and end-note lookups).
        text: fragment as bytes; decoded as UTF-8, ignoring undecodable bytes.
        config: configuration object; ``page_images_url_root``,
            ``page_image_extension`` and ``external_page_images`` are read.
        request: forwarded to make_absolute_query_link when building note links.
        word_regex: pattern matched against the text following each highlight
            marker to delimit the highlighted word.
        byte_offsets: optional byte positions in ``text`` at which
            ``<philoHighlight/>`` markers are inserted.
        note: when true, ``note`` elements become plain ``div`` elements and
            page images are not fetched.
        images: when false, page images are not fetched.

    Returns:
        ``(output, images)`` as produced by page_images, or ``(output, {})``
        when ``note`` is true or ``images`` is false.
    """
    philo_id = obj.philo_id
    if byte_offsets is not None:
        # Splice a highlight marker into the raw bytes at every offset.
        new_text = b""
        last_offset = 0
        for b in byte_offsets:
            new_text += text[last_offset:b] + b"<philoHighlight/>"
            last_offset = b
        text = new_text + text[last_offset:]
    # NOTE(review): these two lists are never filled in the code visible here;
    # they reach page_images() empty -- confirm whether that is intended.
    current_obj_img = []
    current_graphic_img = []
    # Wrap the decoded fragment in a single root element before parsing.
    text = "<div>" + text.decode('utf8', 'ignore') + "</div>"
    xml = FragmentParserParse(text)
    c = obj.db.dbh.cursor()
    # NOTE(review): the loop body is indented one level deeper than the `for`,
    # and the `except Exception` near the end of the loop has no visible
    # `try:` -- a `try:` line appears to be missing right below.
    for el in xml.iter():
            if el.tag.startswith("DIV"):
                el.tag = el.tag.lower()
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "table":
                el.tag = "span"
                el.attrib["class"] = "xml-table"
            elif el.tag == "ref" or el.tag == "xref":
                # el.attrib["type"] is read without a membership check, so a
                # ref without a "type" attribute raises KeyError here.
                if el.attrib["type"] == "note" or el.attrib["type"] == "footnote":
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request, script_name="/scripts/get_notes.py", target=target)
                    # NOTE(review): as written the "n" label is immediately
                    # overwritten by "*"; an `else:` seems to be missing.
                    if "n" in el.attrib:
                        el.text = el.attrib["n"]
                        el.text = "*"
                    if el.text == "":
                        el.text = "*"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    el.attrib["id"] = target.replace('#', '') + '-link-back'
                    # attributes for popover note
                    el.attrib['class'] = "note-ref"
                    el.attrib['tabindex'] = "0"
                    el.attrib['data-toggle'] = "popover"
                    el.attrib['data-container'] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
                elif el.attrib["type"] == "cross":
                    # Resolve the cross-reference target to a philo_id.
                    # NOTE(review): the indentation jump and the dangling
                    # `except` below suggest a missing `try:` (and possibly a
                    # statement leaving the branch after the fallback).
                    c.execute("SELECT philo_id FROM toms WHERE id=? LIMIT 1", (el.attrib["target"],))
                        object_id = c.fetchone()[0]
                    except IndexError:
                        el.tag = "span"
                    el.tag = "a"
                    # Zero components of the philo_id are left out of the path.
                    el.attrib["href"] = 'navigate/%s' % '/'.join([i for i in object_id.split() if i != "0"])
                    el.attrib["class"] = "xml-ref-cross"
                    del el.attrib["target"]
                elif el.attrib["type"] == "search":
                    # The target is "<metadata field>:<value>"; link to a
                    # bibliography report filtered on that pair.
                    metadata, metadata_value = el.attrib["target"].split(':')
                    params = {metadata: metadata_value, "report": "bibliography"}
                    el.tag = "a"
                    el.attrib["href"] = make_absolute_query_link(config, [], **params)
                    del el.attrib["target"]
            elif el.tag == "note":
                # endnotes
                # A note is an end note when any ancestor div has type="notes".
                in_end_note = False
                for ancestor in el.iterancestors():
                    if ancestor.tag.startswith('div'):
                        if "type" in ancestor.attrib:
                            if ancestor.attrib["type"] == "notes":
                                in_end_note = True
                if note:  # in footnote
                    el.tag = "div"
                elif in_end_note:  # in end note
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    # Build a link back to the place in the text citing this note.
                    # NOTE(review): link_back is never attached to the tree in
                    # the code visible here -- confirm.
                    link_back = etree.Element("a")
                    c.execute('select parent from refs where target=? and parent like ?',
                              (el.attrib['id'], str(philo_id[0]) + " %"))
                    object_id = c.fetchone()[0]
                    link_back.attrib['href'] = 'navigate/%s%s' % ('/'.join([i for i in object_id.split() if i != "0"]),
                                                                  '#%s-link-back' % el.attrib['id'])
                    link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                    link_back.attrib['role'] = "button"
                    link_back.text = "Go back to text"
                else:  ## inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"

                    # The return value only rebinds the loop variable;
                    # presumably note_content works on the child in place -- verify.
                    for child in el:
                        child = note_content(child)
                    # insert an anchor before this element by scanning through the parent
                    parent = el.getparent()
                    for i, child in enumerate(parent):
                        if child == el:
                            attribs = {"class": "note",
                                       "tabindex": "0",
                                       "data-toggle": "popover",
                                       "data-container": "body",
                                       "data-placement": "right",
                                       "data-trigger": "focus"}
                            parent.insert(i, etree.Element("a", attrib=attribs))
                            new_anchor = parent[i]
                            new_anchor.text = "note"
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "img":
                el.attrib["onerror"] = "this.style.display='none'"
            elif el.tag == "pb" and "n" in el.attrib:
                # Page break: becomes a span that carries the page-image link.
                el.tag = "span"
                el.attrib["class"] = "xml-pb-image"
                if config.page_images_url_root:
                    if "facs" in el.attrib or "id" in el.attrib:
                        # NOTE(review): several two-line pairs in this branch
                        # assign the same target twice in a row, which looks
                        # like missing `else:` lines. el[-1] is also used
                        # although no child is appended in the visible code.
                        if "facs" in el.attrib:
                            img = el.attrib["facs"]
                            img = el.attrib["id"]
                        # The attribute may hold up to two whitespace-separated
                        # image names; the second one is used for "large-img".
                        img_split = img.split()
                        el[-1].attrib["href"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                        if len(img_split) == 2:
                            el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[1]) + config.page_image_extension
                            el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                        el[-1].text = "[page " + el.attrib["n"] + "]"
                        if config.external_page_images:
                            el[-1].attrib["target"] = "_blank"
                            el[-1].attrib['class'] = "page-image-link"
                            el[-1].attrib['data-gallery'] = ''
                    if el.attrib["n"]:
                        el.text = "--%s--" % el.attrib["n"]
                        el.text = "--na--"
                # grand_parent.attrib["class"] is read without a membership
                # check, so a grandparent without a class raises KeyError.
                grand_parent = el.getparent().getparent()
                if grand_parent.attrib["class"] == "xml-row":
                    # Move page outside of table row to avoid display issues
                    tail = etree.Element("span")
                    tail.text = el.tail
                    el.tail = ""
                    great_grand_parent = grand_parent.getparent()
                    grand_parent_index = great_grand_parent.index(grand_parent)
                    el_index = el.getparent().index(el)
                    great_grand_parent.insert(grand_parent_index+1, el)
                    # NOTE(review): `parent` is not assigned in this branch; the
                    # only visible assignment is in the inline-note branch.
                    parent.insert(el_index, tail)
            if el.tag == "graphic":
                if config.page_images_url_root:
                    # "facs" may hold up to two whitespace-separated image names.
                    imgs = el.attrib["facs"].split()
                    el.attrib["src"] = os.path.join(config.page_images_url_root, imgs[0])
                    el.tag = "img"
                    el.attrib["class"] = "inline-img"
                    el.attrib['data-gallery'] = ''
                    el.attrib["inline-img"] = ""
                    # NOTE(review): both assignments run as written, so the
                    # second always wins; an `else:` seems to be missing.
                    if len(imgs) > 1:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[1])
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[0])
                    # Raises KeyError when the element has no "url" attribute.
                    del el.attrib["url"]
            elif el.tag == "philoHighlight":
                # Move the word following the marker inside the highlight span.
                # el.tail is passed to re.match as is; a None tail would raise.
                word_match = re.match(word_regex, el.tail, re.U)
                if word_match:
                    el.text = el.tail[:word_match.end()]
                    el.tail = el.tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in VALID_HTML_TAGS:
                el = xml_to_html_class(el)
        # Any failure while processing an element is reported on stderr only.
        except Exception as exception:
            import sys
            print(exception, file=sys.stderr)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output)

    if note:  ## Notes don't need to fetch images
        return (output, {})
    if not images:
        return (output, {})

    ## Page images
    output, images = page_images(config, output, current_obj_img, current_graphic_img, philo_id)
    return output, images
Example #5
def format_concordance(text, word_regex, bytes=None):
    """Turn a raw concordance fragment into an HTML snippet with highlighted hits.

    Args:
        text: fragment as a string. A leading span matched by begin_match and
            start_cutoff_match and a trailing span matched by end_match are
            trimmed off before parsing.
        word_regex: accepted for interface compatibility; it is replaced by
            ``\\w+`` inside the function.
        bytes: optional iterable of positions, relative to the untrimmed
            fragment, at which hits start. A falsy value (the default) means
            no highlighting.

    Returns:
        The serialised fragment with the wrapping ``philologic-fragment`` div
        removed, entities converted and leading punctuation stripped.
    """
    word_regex = r"\w+"  # text is converted to unicode so we use the \w boundary to match
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        # Shift by the end of the match: that is exactly how much the slice
        # below removes, even if the match does not start at position 0.
        removed_from_start = begin.end(0)
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += start_cutoff.end(0)
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]
    if bytes:
        text_length = len(text)
        pieces = []
        last_offset = 0
        for offset in bytes:
            offset -= removed_from_start
            # Offsets that fell into a trimmed region (or on an edge) are skipped.
            if 0 < offset < text_length:
                pieces.append(text[last_offset:offset])
                pieces.append("<philoHighlight/>")
                last_offset = offset
        pieces.append(text[last_offset:])
        text = "".join(pieces)
    xml = FragmentParserParse(text)
    allowed_tags = set([
        'philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i',
        'sc', 'scx', 'br'
    ])
    # img elements are detached only after the walk: removing a node while
    # xml.iter() is positioned on it would disturb the traversal.
    images_to_drop = []
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        # title / q / img are not in allowed_tags, so they have to be tested
        # before the catch-all below; otherwise these branches never run.
        if el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        elif el.tag == "img":  # Remove img elements from parent in concordances
            images_to_drop.append(el)
            continue
        elif el.tag not in allowed_tags:
            el.tag = 'span'
        if "id" in el.attrib and el.tag != "l":  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        if el.tag == "philoHighlight":
            # A marker at the very end of its parent has no tail (None).
            tail = el.tail or ""
            word_match = re.match(word_regex, tail, re.U)
            if word_match:
                el.text = tail[:word_match.end()]
                el.tail = tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in valid_html_tags:
            el = xml_to_html_class(el)
    for img in images_to_drop:
        parent = img.getparent()
        if parent is None:
            continue
        if img.tail:
            # remove() discards the tail along with the element, so move the
            # trailing text onto the preceding node first.
            previous = img.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or "") + img.tail
            else:
                parent.text = (parent.text or "") + img.tail
        parent.remove(img)
    # NOTE(review): under Python 3 etree.tostring() returns bytes by default,
    # which the str patterns below cannot process -- confirm the target
    # interpreter before reusing this function.
    output = etree.tostring(xml)
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    # NOTE(review): a "remove spaces around hyphens and apostrophes" step was
    # announced here by a comment, but no substitution is performed in this
    # function -- confirm whether one is expected.
    output = convert_entities(output)
    output = strip_start_punctuation.sub("", output)
    return output
Example #6
        census = TagCensus()

        if not quiet:
                print(census, file=sys.stderr)
            except UnicodeEncodeError:
                print(str(census).encode("utf-8"), file=sys.stderr)

        if total:
            total += census
            total = census

        ## First round of cleanups
        xml = FragmentParserParse(text)
        file_contents = etree.tostring(xml).decode("utf8", "ignore")
        file_contents = convert_remaining_entities(file_contents, quiet)

        # Tag replacements
            parser = etree.XMLParser(huge_tree=True,
            tree = etree.fromstring(file_contents, parser=parser)
            for el in tree.iter():
                ## Tags are defined as el.tag, so to change tag, you do: el.tag = "some_other_tag"
                ## Attributes are contained in el.attrib where each attribute is a key. To change the type attribute you do: el.attrib['type'] = "some_other_type"
                if el.tag in xml_tag_mapping:  ## Check if the tag should be replaced according to the xml mapping dict
                    el.tag = xml_tag_mapping[el.tag]
            file_contents = etree.tostring(tree,