Beispiel #1
0
def format_concordance(text, word_regex, bytes=[]):
    """Turn a raw concordance fragment into an HTML snippet with highlights.

    The fragment is trimmed with the module-level ``begin_match``,
    ``start_cutoff_match`` and ``end_match`` patterns, a ``<philoHighlight/>``
    marker is inserted at every hit offset, and the result is parsed with
    ``f.FragmentParser``. Every element is then rewritten into an HTML
    element and the tree is serialized.

    Parameters:
        text: the raw fragment to format.
        word_regex: pattern matched (with ``re.U``) against the text that
            follows each highlight marker; the matched prefix becomes the
            highlighted word.
        bytes: offsets of the hits inside the *untrimmed* ``text``. The
            default list is never mutated in place.

    Returns:
        The serialized fragment with the leading
        ``<div class="philologic-fragment">`` and trailing ``</div>``
        removed and ``space_match`` applied.
    """
    # Trim the start of the fragment. The number of characters dropped is
    # exactly ``match.end(0)``, so that is what the hit offsets are shifted by.
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        removed_from_start = begin.end(0)
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += start_cutoff.end(0)
        text = text[start_cutoff.end(0):]

    # Trim the end of the fragment.
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]

    if bytes:
        pieces = []
        last_offset = 0
        for offset in bytes:
            offset -= removed_from_start
            # Offsets that fall outside the trimmed text are ignored.
            if 0 < offset < len(text):
                pieces.append(text[last_offset:offset])
                pieces.append("<philoHighlight/>")
                last_offset = offset
        pieces.append(text[last_offset:])
        text = "".join(pieces)

    xml = f.FragmentParser.parse(text)
    allowed_tags = {'philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker',
                    'stage', 'i', 'sc', 'scx', 'br'}
    for el in xml.iter():
        # "title" and "q" are not in allowed_tags, so they must be tested
        # before the generic fallback or they never receive their class.
        if el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        elif el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        # Kill ids in order to avoid the risk of duplicate ids in the HTML.
        if "id" in el.attrib:
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        if el.tag == "philoHighlight":
            # The marker is an empty element: the word to highlight is the
            # start of its tail. The tail is None when nothing follows it.
            tail = el.tail or ""
            word_match = re.match(word_regex, tail, re.U)
            if word_match:
                el.text = tail[:word_match.end()]
                el.tail = tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in valid_html_tags:
            xml_to_html_class(el)

    output = etree.tostring(xml)
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    # Replace every space_match hit by its first captured group (per the
    # original comment: remove spaces around hyphens and apostrophes).
    output = space_match.sub('\\1', output)
    return output
Beispiel #2
0
def format_text_object(obj, text, config, q, word_regex, bytes=[], note=False):
    """Convert the raw text of a text object into displayable HTML.

    ``<philoHighlight/>`` markers are inserted at the given offsets, the text
    is wrapped in a ``<div>`` and parsed with ``f.FragmentParser``, and each
    element is rewritten into an HTML element. Conversion is best-effort:
    an element whose rewrite raises is left in whatever state it reached and
    processing continues with the next element.

    Parameters:
        obj: object exposing ``philo_id``.
        text: raw markup of the object.
        config: configuration object; ``page_images_url_root`` is read from
            it and it is forwarded to the link builder and ``page_images``.
        q: query object forwarded to ``f.link.make_absolute_query_link``.
        word_regex: pattern matched (with ``re.U``) against the text that
            follows each highlight marker.
        bytes: offsets in ``text`` where highlight markers are inserted.
            The default list is never mutated in place.
        note: when true, the page-image lookup is skipped.

    Returns:
        A 2-tuple ``(output, img_obj)``. ``output`` is the UTF-8 encoded
        markup; ``img_obj`` is ``{}`` when ``note`` is true, otherwise the
        second value returned by ``page_images``.
    """
    philo_id = obj.philo_id
    if bytes:
        pieces = []
        last_offset = 0
        for offset in bytes:
            pieces.append(text[last_offset:offset])
            pieces.append("<philoHighlight/>")
            last_offset = offset
        pieces.append(text[last_offset:])
        text = "".join(pieces)
    current_obj_img = []
    text = "<div>" + text + "</div>"
    xml = f.FragmentParser.parse(text)
    for el in xml.iter():
        try:
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "ptr" or el.tag == "ref":
                target = el.attrib["target"]
                link = f.link.make_absolute_query_link(config, q, script_name="/scripts/get_notes.py", target=target)
                el.attrib["data-ref"] = link
                el.attrib["id"] = target.replace('#', '') + '-link-back'
                del el.attrib["target"]
                el.attrib['class'] = "note-ref"
                el.attrib['tabindex'] = "0"
                el.attrib['data-toggle'] = "popover"
                el.attrib['data-container'] = "body"
                el.attrib["data-placement"] = "right"
                el.attrib["data-trigger"] = "focus"
                el.attrib["data-html"] = "true"
                el.attrib["data-animation"] = "true"
                el.text = "note"
                el.tag = "span"
            elif el.tag == "note":
                # The fragment is wrapped in a <div> above, so a note is
                # expected to always have a parent element.
                parent = el.getparent()
                # A parent without a "type" attribute is not a notes
                # section, so the note is treated as inline.
                if parent.attrib.get("type") != "notes":  # inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"
                    for child in el:
                        note_content(child)
                    # Insert the popover anchor directly before the note.
                    attribs = {"class": "note", "tabindex": "0", "data-toggle": "popover", "data-container": "body",
                               "data-placement": "right", "data-trigger": "focus"}
                    new_anchor = etree.Element("a", attrib=attribs)
                    new_anchor.text = "note"
                    parent.insert(parent.index(el), new_anchor)
                else:  # endnotes
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    note_id = '#' + el.attrib['id']
                    link_back = etree.Element("a")
                    link_back.attrib['note-link-back'] = f.link.make_absolute_query_link(config, q, script_name="/scripts/get_note_link_back.py",
                                                                               doc_id=str(philo_id[0]), note_id=note_id)
                    link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                    link_back.attrib['role'] = "button"
                    link_back.text = "Go back to text"
                    el.append(link_back)
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "pb" and "n" in el.attrib:
                # The image name comes from "fac" when present, else "id".
                if "fac" in el.attrib:
                    img = el.attrib["fac"]
                elif "id" in el.attrib:
                    img = el.attrib["id"]
                else:
                    img = None
                if img is not None:
                    current_obj_img.append(img)
                    el.tag = "p"
                    page_link = etree.Element("a")
                    page_link.attrib["href"] = config.page_images_url_root + '/' + img
                    page_link.text = "[page " + el.attrib["n"] + "]"
                    page_link.attrib['class'] = "page-image-link"
                    page_link.attrib['data-gallery'] = ''
                    el.append(page_link)
            elif el.tag == "figure":
                if len(el) and el[0].tag == "graphic":
                    graphic = el[0]
                    img_url = graphic.attrib["url"].replace(":", "_")
                    # The URL is expected to start with the volume number;
                    # without one the figure falls through to the generic
                    # conversion below.
                    volume_match = re.match(r"\d+", img_url)
                    if volume_match:
                        url_prefix = config.page_images_url_root + '/V' + volume_match.group() + "/plate_"
                        el.tag = "span"
                        el.attrib["href"] = url_prefix + img_url + ".jpeg"
                        graphic.tag = "img"
                        graphic.attrib["src"] = url_prefix + img_url + ".sm.jpeg"
                        graphic.attrib["class"] = "inline-img"
                        el.attrib["class"] = "inline-img-container"
                        del graphic.attrib["url"]
                        clear_float = etree.Element("span")
                        clear_float.attrib['style'] = 'clear:both;'
                        graphic.append(clear_float)
            elif el.tag == "philoHighlight":
                # The marker is an empty element: the word to highlight is
                # the start of its tail, which is None when nothing follows.
                tail = el.tail or ""
                word_match = re.match(word_regex, tail, re.U)
                if word_match:
                    el.text = tail[:word_match.end()]
                    el.tail = tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in valid_html_tags:
                xml_to_html_class(el)
        except Exception:
            # Best-effort conversion: a malformed element (missing attribute,
            # unexpected structure) must not abort the whole rendering.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            pass
    output = etree.tostring(xml)
    # Drop the optional space before a run of - ' ; . that is followed by a
    # space, keeping the last character of the run and a single space.
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output.decode('utf-8', 'ignore')).encode('utf-8')

    if note:  # Notes don't need to fetch images
        return (output, {})

    # Page images
    output, img_obj = page_images(config, output, current_obj_img, philo_id)

    return output, img_obj
Beispiel #3
0
def format_text_object(text, config, q, word_regex, bytes=[]):
    """Convert the raw text of a text object into displayable HTML.

    ``<philoHighlight/>`` markers are inserted at the given offsets, the text
    is wrapped in a ``<div>`` and parsed with ``f.FragmentParser``, and each
    element is rewritten into an HTML element. Conversion is best-effort:
    an element whose rewrite raises is left in whatever state it reached and
    processing continues with the next element.

    Parameters:
        text: raw markup of the object.
        config: configuration object forwarded to the link builder.
        q: query object forwarded to ``f.link.make_absolute_query_link``.
        word_regex: pattern matched (with ``re.U``) against the text that
            follows each highlight marker.
        bytes: offsets in ``text`` where highlight markers are inserted.
            The default list is never mutated in place.

    Returns:
        The UTF-8 encoded markup after ``convert_entities`` has been applied.
    """
    if bytes:
        pieces = []
        last_offset = 0
        for offset in bytes:
            pieces.append(text[last_offset:offset])
            pieces.append("<philoHighlight/>")
            last_offset = offset
        pieces.append(text[last_offset:])
        text = "".join(pieces)
    text = "<div>" + text + "</div>"
    xml = f.FragmentParser.parse(text)
    for el in xml.iter():
        try:
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
                el.append(etree.Element("br"))
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "ptr" or el.tag == "ref":
                target = el.attrib["target"]
                link = f.link.make_absolute_query_link(config, q, script_name="/scripts/get_notes.py", target=target)
                el.attrib["data-ref"] = link
                del el.attrib["target"]
                el.attrib['class'] = "note-ref"
                el.attrib['tabindex'] = "0"
                el.attrib['data-toggle'] = "popover"
                el.attrib['data-container'] = "body"
                el.attrib["data-placement"] = "right"
                el.attrib["data-trigger"] = "focus"
                el.attrib["data-html"] = "true"
                el.attrib["data-animation"] = "true"
                el.text = "note"
                el.tag = "span"
            # A parent without a "type" attribute is not a notes section, so
            # the note is treated as inline. Notes inside a type="notes"
            # parent fall through to the generic conversion below.
            elif el.tag == "note" and el.getparent().attrib.get("type") != "notes":
                el.tag = 'span'
                el.attrib['class'] = "note-content"
                for child in el:
                    note_content(child)
                # Insert the popover anchor directly before the note.
                parent = el.getparent()
                attribs = {"class": "note", "tabindex": "0", "data-toggle": "popover", "data-container": "body",
                           "data-placement": "right", "data-trigger": "focus"}
                new_anchor = etree.Element("a", attrib=attribs)
                new_anchor.text = "note"
                parent.insert(parent.index(el), new_anchor)
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "pb" and "fac" in el.attrib and "n" in el.attrib:
                el.tag = "p"
                page_link = etree.Element("a")
                page_link.attrib["href"] = 'http://artflx.uchicago.edu/images/encyclopedie/' + el.attrib["fac"]
                page_link.text = "[page " + el.attrib["n"] + "]"
                page_link.attrib['class'] = "page-image-link"
                page_link.attrib['data-gallery'] = ''
                el.append(page_link)
            elif el.tag == "figure":
                if len(el) and el[0].tag == "graphic":
                    graphic = el[0]
                    img_url = graphic.attrib["url"].replace(":", "_")
                    # The URL is expected to start with the volume number;
                    # without one the figure falls through to the generic
                    # conversion below.
                    volume_match = re.match(r"\d+", img_url)
                    if volume_match:
                        url_prefix = "http://artflx.uchicago.edu/images/encyclopedie/V" + volume_match.group() + "/plate_"
                        el.tag = "a"
                        el.attrib["href"] = url_prefix + img_url + ".jpeg"
                        graphic.tag = "img"
                        graphic.attrib["src"] = url_prefix + img_url + ".sm.jpeg"
                        graphic.attrib["class"] = "plate_img"
                        el.attrib["class"] = "plate-image-link"
                        el.attrib['data-gallery'] = ''
                        del graphic.attrib["url"]
                        el.append(etree.Element("br"))
            elif el.tag == "philoHighlight":
                # The marker is an empty element: the word to highlight is
                # the start of its tail, which is None when nothing follows.
                tail = el.tail or ""
                word_match = re.match(word_regex, tail, re.U)
                if word_match:
                    el.text = tail[:word_match.end()]
                    el.tail = tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in valid_html_tags:
                xml_to_html_class(el)
        except Exception:
            # Best-effort conversion: a malformed element (missing attribute,
            # unexpected structure) must not abort the whole rendering.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            pass
    output = etree.tostring(xml)
    # Drop the optional space before a run of - ' ; . that is followed by a
    # space, keeping the last character of the run and a single space.
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    return convert_entities(output.decode('utf-8', 'ignore')).encode('utf-8')