def format_concordance(text_in_utf8, word_regex, byte_offsets=None):
    """Build the HTML snippet shown for one concordance hit.

    The raw fragment is trimmed with the module-level ``BEGIN_MATCH``,
    ``START_CUTOFF_MATCH`` and ``END_MATCH`` patterns, a ``<philoHighlight/>``
    marker is spliced in at every usable offset, and the result is decoded
    and parsed with ``FragmentParserParse``.  Each element of the parsed tree
    is then rewritten into an HTML-friendly tag before serialisation.

    Args:
        text_in_utf8: The fragment as UTF-8 encoded ``bytes``.
        word_regex: Pattern applied with ``re.match`` to the text that follows
            each highlight marker; the matched prefix becomes the highlighted
            word.
        byte_offsets: Optional positions (in the untrimmed ``text_in_utf8``)
            at which highlight markers are inserted.  Offsets that fall
            outside the trimmed text are ignored.

    Returns:
        The serialised fragment as ``str``, with the enclosing
        ``<div class="philologic-fragment">`` wrapper (if present) removed.
    """
    removed_from_start = 0
    begin = BEGIN_MATCH.search(text_in_utf8)
    if begin:
        # Everything up to the end of the match is dropped, so that is the
        # amount by which the offsets have to be shifted.
        removed_from_start = begin.end(0)
        text_in_utf8 = text_in_utf8[begin.end(0):]
    start_cutoff = START_CUTOFF_MATCH.search(text_in_utf8)
    if start_cutoff:
        removed_from_start += start_cutoff.end(0)
        text_in_utf8 = text_in_utf8[start_cutoff.end(0):]
    end = END_MATCH.search(text_in_utf8)
    if end:
        text_in_utf8 = text_in_utf8[:end.start(0)]

    if byte_offsets:
        text_length = len(text_in_utf8)
        pieces = []
        last_offset = 0
        for offset in byte_offsets:
            offset -= removed_from_start
            if 0 < offset < text_length:
                pieces.append(text_in_utf8[last_offset:offset])
                pieces.append(b"<philoHighlight/>")
                last_offset = offset
        pieces.append(text_in_utf8[last_offset:])
        text_in_utf8 = b"".join(pieces)

    text = text_in_utf8.decode('utf8', 'ignore')
    xml = FragmentParserParse(text)
    allowed_tags = {'philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'}
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        if el.tag not in allowed_tags:
            el.tag = 'span'
        # NOTE(review): "title" and "q" are not in allowed_tags, so they were
        # already turned into plain spans above and these two branches never
        # run.  Kept as-is to avoid changing the rendered markup.
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        # Kill ids in order to avoid the risk of having duplicate ids in the HTML.
        if "id" in el.attrib and el.tag != "l":
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        # NOTE(review): likewise unreachable - "img" has become "span" by now.
        elif el.tag == "img":
            # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            # The tail is None when the marker is directly followed by another
            # tag or by the end of the fragment; treat that as "no word".
            tail = el.tail or ""
            word_match = re.match(word_regex, tail)
            if word_match:
                el.text = tail[:word_match.end()]
                el.tail = tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in VALID_HTML_TAGS:
            xml_to_html_class(el)

    output = etree.tostring(xml).decode('utf8', 'ignore')
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    # Remove spaces around hyphens and apostrophes.
    output = SPACE_MATCH.sub('\\1', output)
    output = convert_entities(output)
    output = STRIP_START_PUNCTUATION.sub("", output)
    return output
def format_concordance(text_in_utf8, word_regex, byte_offsets=None):
    """Build the HTML snippet shown for one concordance hit.

    The raw fragment is trimmed with the module-level ``BEGIN_MATCH``,
    ``START_CUTOFF_MATCH`` and ``END_MATCH`` patterns, a ``<philoHighlight/>``
    marker is spliced in at every usable offset, and the result is decoded
    and parsed with ``FragmentParserParse``.  Each element of the parsed tree
    is then rewritten into an HTML-friendly tag (``ab``/``ln`` become ``l``)
    before serialisation.

    Args:
        text_in_utf8: The fragment as UTF-8 encoded ``bytes``.
        word_regex: Pattern applied with ``re.match`` to the text that follows
            each highlight marker; the matched prefix becomes the highlighted
            word.
        byte_offsets: Optional positions (in the untrimmed ``text_in_utf8``)
            at which highlight markers are inserted.  Offsets that fall
            outside the trimmed text are ignored.

    Returns:
        The serialised fragment as ``str``, with the enclosing
        ``<div class="philologic-fragment">`` wrapper (if present) removed.
    """
    removed_from_start = 0
    begin = BEGIN_MATCH.search(text_in_utf8)
    if begin:
        # Everything up to the end of the match is dropped, so that is the
        # amount by which the offsets have to be shifted.
        removed_from_start = begin.end(0)
        text_in_utf8 = text_in_utf8[begin.end(0):]
    start_cutoff = START_CUTOFF_MATCH.search(text_in_utf8)
    if start_cutoff:
        removed_from_start += start_cutoff.end(0)
        text_in_utf8 = text_in_utf8[start_cutoff.end(0):]
    end = END_MATCH.search(text_in_utf8)
    if end:
        text_in_utf8 = text_in_utf8[:end.start(0)]

    if byte_offsets:
        text_length = len(text_in_utf8)
        pieces = []
        last_offset = 0
        for offset in byte_offsets:
            offset -= removed_from_start
            if 0 < offset < text_length:
                pieces.append(text_in_utf8[last_offset:offset])
                pieces.append(b"<philoHighlight/>")
                last_offset = offset
        pieces.append(text_in_utf8[last_offset:])
        text_in_utf8 = b"".join(pieces)

    text = text_in_utf8.decode('utf8', 'ignore')
    xml = FragmentParserParse(text)
    allowed_tags = {'philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'}
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        if el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        # NOTE(review): "title" and "q" are not in allowed_tags, so they were
        # already turned into plain spans above and these two branches never
        # run.  Kept as-is to avoid changing the rendered markup.
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        # Kill ids in order to avoid the risk of having duplicate ids in the HTML.
        if "id" in el.attrib and el.tag != "l":
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        # NOTE(review): likewise unreachable - "img" has become "span" by now.
        elif el.tag == "img":
            # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            # The tail is None when the marker is directly followed by another
            # tag or by the end of the fragment; treat that as "no word".
            tail = el.tail or ""
            word_match = re.match(word_regex, tail)
            if word_match:
                el.text = tail[:word_match.end()]
                el.tail = tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in VALID_HTML_TAGS:
            xml_to_html_class(el)

    output = etree.tostring(xml).decode('utf8', 'ignore')
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    # Remove spaces around hyphens and apostrophes.
    output = SPACE_MATCH.sub('\\1', output)
    output = convert_entities(output)
    output = STRIP_START_PUNCTUATION.sub("", output)
    return output
def format_text_object(obj, text, config, request, word_regex, byte_offsets=None, note=False, images=True):
    """Convert the raw text of a text object into displayable HTML.

    Highlight markers are spliced into ``text`` at ``byte_offsets``, the
    result is wrapped in a ``<div>`` and parsed with ``FragmentParserParse``,
    and every element of the tree is rewritten in place (references become
    links or popover anchors, page breaks become image links or page-number
    text, and so on).  Errors raised while rewriting a single element are
    reported on stderr and that element is left as it is, so one malformed
    element does not prevent the rest of the text from being rendered.

    Args:
        obj: Text object; ``obj.philo_id`` and ``obj.db.dbh.cursor()`` are used.
        text: The object's text as UTF-8 encoded ``bytes``.
        config: Configuration object; ``page_images_url_root``,
            ``page_image_extension`` and ``external_page_images`` are read.
        request: Passed through to ``make_absolute_query_link`` for note links.
        word_regex: Pattern applied with ``re.match`` to the text following
            each highlight marker to find the word to highlight.
        byte_offsets: Optional positions in ``text`` where highlight markers
            are inserted.
        note: True when formatting the content of a note.
        images: When false, page images are not collected.

    Returns:
        ``(html, images)``.  ``images`` is ``{}`` when ``note`` is true or
        ``images`` is false; otherwise both values come from ``page_images``.
    """
    import sys  # only needed for the best-effort error report in the loop

    philo_id = obj.philo_id
    if byte_offsets is not None:
        pieces = []
        last_offset = 0
        for offset in byte_offsets:
            pieces.append(text[last_offset:offset])
            pieces.append(b"<philoHighlight/>")
            last_offset = offset
        pieces.append(text[last_offset:])
        text = b"".join(pieces)
    current_obj_img = []
    current_graphic_img = []
    text = "<div>" + text.decode('utf8', 'ignore') + "</div>"
    xml = FragmentParserParse(text)
    c = obj.db.dbh.cursor()
    for el in xml.iter():
        try:
            if el.tag.startswith("DIV"):
                el.tag = el.tag.lower()
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "table":
                el.tag = "span"
                el.attrib["class"] = "xml-table"
            elif el.tag == "ref" or el.tag == "xref":
                # A reference without a type falls through untouched to the
                # generic conversion at the bottom of the loop.
                ref_type = el.attrib.get("type")
                if ref_type == "note" or ref_type == "footnote":
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request, script_name="/scripts/get_notes.py", target=target)
                    # Use the note number as label, "*" when missing or empty.
                    el.text = el.attrib.get("n") or "*"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    el.attrib["id"] = target.replace('#', '') + '-link-back'
                    # attributes for popover note
                    el.attrib['class'] = "note-ref"
                    el.attrib['tabindex'] = "0"
                    el.attrib['data-toggle'] = "popover"
                    el.attrib['data-container'] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
                elif ref_type == "cross":
                    c.execute("SELECT philo_id FROM toms WHERE id=? LIMIT 1", (el.attrib["target"],))
                    # fetchone() yields None (not an empty row) when the
                    # target is unknown; degrade to a plain span then.
                    row = c.fetchone()
                    if row is None:
                        el.tag = "span"
                        continue
                    object_id = row[0]
                    el.tag = "a"
                    el.attrib["href"] = 'navigate/%s' % '/'.join([i for i in object_id.split() if i != "0"])
                    el.attrib["class"] = "xml-ref-cross"
                    del el.attrib["target"]
                elif ref_type == "search":
                    metadata, metadata_value = el.attrib["target"].split(':')
                    params = {metadata: metadata_value, "report": "bibliography"}
                    el.tag = "a"
                    el.attrib["href"] = make_absolute_query_link(config, [], **params)
                    del el.attrib["target"]
            elif el.tag == "note":
                # End notes live inside a div whose type is "notes".
                in_end_note = False
                for ancestor in el.iterancestors():
                    if ancestor.tag.startswith('div') and ancestor.attrib.get("type") == "notes":
                        in_end_note = True
                        break
                if note:  # in footnote
                    el.tag = "div"
                elif in_end_note:  # in end note
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    c.execute('select parent from refs where target=? and parent like ?',
                              (el.attrib['id'], str(philo_id[0]) + " %"))
                    row = c.fetchone()
                    # Without a referencing object there is nothing to link back to.
                    if row is not None:
                        object_id = row[0]
                        link_back = etree.Element("a")
                        link_back.attrib['href'] = 'navigate/%s%s' % (
                            '/'.join([i for i in object_id.split() if i != "0"]),
                            '#%s-link-back' % el.attrib['id'])
                        link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                        link_back.attrib['role'] = "button"
                        link_back.text = "Go back to text"
                        el.append(link_back)
                else:  # inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"
                    for child in el:
                        note_content(child)
                    # Insert the popover anchor right before the note itself.
                    note_parent = el.getparent()
                    attribs = {"class": "note", "tabindex": "0", "data-toggle": "popover", "data-container": "body",
                               "data-placement": "right", "data-trigger": "focus"}
                    new_anchor = etree.Element("a", attrib=attribs)
                    new_anchor.text = "note"
                    note_parent.insert(note_parent.index(el), new_anchor)
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "img":
                el.attrib["onerror"] = "this.style.display='none'"
            elif el.tag == "pb" and "n" in el.attrib:
                el.tag = "span"
                el.attrib["class"] = "xml-pb-image"
                if config.page_images_url_root and ("facs" in el.attrib or "id" in el.attrib):
                    if "facs" in el.attrib:
                        img = el.attrib["facs"]
                    else:
                        img = el.attrib["id"]
                    img_split = img.split()
                    current_obj_img.append(img_split[0])
                    page_link = etree.Element("a")
                    el.append(page_link)
                    page_link.attrib["href"] = os.path.join(config.page_images_url_root,
                                                            img_split[0]) + config.page_image_extension
                    # A second file name, when given, is the large version.
                    large_img = img_split[1] if len(img_split) == 2 else img_split[0]
                    page_link.attrib["large-img"] = os.path.join(config.page_images_url_root,
                                                                 large_img) + config.page_image_extension
                    page_link.text = "[page " + el.attrib["n"] + "]"
                    if config.external_page_images:
                        page_link.attrib["target"] = "_blank"
                    else:
                        page_link.attrib['class'] = "page-image-link"
                        page_link.attrib['data-gallery'] = ''
                else:
                    # No image to link to: show the page number as text.
                    if el.attrib["n"]:
                        el.text = "--%s--" % el.attrib["n"]
                    else:
                        el.text = "--na--"
                parent = el.getparent()
                grand_parent = parent.getparent() if parent is not None else None
                if grand_parent is not None and grand_parent.attrib.get("class") == "xml-row":
                    # Move page outside of table row to avoid display issues;
                    # the text that followed it stays behind in a new span.
                    tail = etree.Element("span")
                    tail.text = el.tail
                    el.tail = ""
                    great_grand_parent = grand_parent.getparent()
                    grand_parent_index = great_grand_parent.index(grand_parent)
                    el_index = parent.index(el)
                    great_grand_parent.insert(grand_parent_index + 1, el)
                    parent.insert(el_index, tail)
            if el.tag == "graphic":
                if config.page_images_url_root:
                    imgs = el.attrib["facs"].split()
                    current_graphic_img.append(imgs[0])
                    el.attrib["src"] = os.path.join(config.page_images_url_root, imgs[0])
                    el.tag = "img"
                    el.attrib["class"] = "inline-img"
                    el.attrib['data-gallery'] = ''
                    el.attrib["inline-img"] = ""
                    if len(imgs) > 1:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[1])
                    else:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[0])
                    # Not every graphic carries a url attribute.
                    el.attrib.pop("url", None)
            elif el.tag == "philoHighlight":
                # The tail is None when the marker is directly followed by
                # another tag; treat that as "no word".
                tail_text = el.tail or ""
                word_match = re.match(word_regex, tail_text, re.U)
                if word_match:
                    el.text = tail_text[:word_match.end()]
                    el.tail = tail_text[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in VALID_HTML_TAGS:
                xml_to_html_class(el)
        except Exception as exception:
            # Best effort: report the problem and carry on with the next element.
            print(exception, file=sys.stderr)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    # Remove spaces around hyphens and apostrophes.
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output)

    # Notes don't need to fetch images, and callers may opt out of them.
    if note or not images:
        return (output, {})

    # Page images
    output, images = page_images(config, output, current_obj_img, current_graphic_img, philo_id)
    return output, images
def format_text_object(obj, text, config, request, word_regex, byte_offsets=None, note=False, images=True):
    """Convert the raw text of a text object into displayable HTML.

    Highlight markers are spliced into ``text`` at ``byte_offsets``, the
    result is wrapped in a ``<div>`` and parsed with ``FragmentParserParse``,
    and every element of the tree is rewritten in place (references become
    links or popover anchors, page breaks become image links or page-number
    text, and so on).  Errors raised while rewriting a single element are
    reported on stderr and that element is left as it is, so one malformed
    element does not prevent the rest of the text from being rendered.

    Args:
        obj: Text object; ``obj.philo_id`` and ``obj.db.dbh.cursor()`` are used.
        text: The object's text as UTF-8 encoded ``bytes``.
        config: Configuration object; ``page_images_url_root``,
            ``page_image_extension`` and ``external_page_images`` are read.
        request: Passed through to ``make_absolute_query_link`` for note links.
        word_regex: Pattern applied with ``re.match`` to the text following
            each highlight marker to find the word to highlight.
        byte_offsets: Optional positions in ``text`` where highlight markers
            are inserted.
        note: True when formatting the content of a note.
        images: When false, page images are not collected.

    Returns:
        ``(html, images)``.  ``images`` is ``{}`` when ``note`` is true or
        ``images`` is false; otherwise both values come from ``page_images``.
    """
    import sys  # only needed for the best-effort error report in the loop

    philo_id = obj.philo_id
    if byte_offsets is not None:
        pieces = []
        last_offset = 0
        for offset in byte_offsets:
            pieces.append(text[last_offset:offset])
            pieces.append(b"<philoHighlight/>")
            last_offset = offset
        pieces.append(text[last_offset:])
        text = b"".join(pieces)
    current_obj_img = []
    current_graphic_img = []
    text = "<div>" + text.decode('utf8', 'ignore') + "</div>"
    xml = FragmentParserParse(text)
    c = obj.db.dbh.cursor()
    for el in xml.iter():
        try:
            if el.tag.startswith("DIV"):
                el.tag = el.tag.lower()
            if el.tag == "h1" or el.tag == "h2":
                el.tag = "b"
                el.attrib["class"] = "headword"
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            # NOTE(review): a "page" element is renamed to "pb" here, but
            # because the "pb" handling below is part of this same if/elif
            # chain it is not applied to the renamed element - confirm
            # whether that is intended.
            if el.tag == "page":
                el.tag = "pb"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "table":
                el.tag = "span"
                el.attrib["class"] = "xml-table"
            elif el.tag == "ref" or el.tag == "xref":
                # A reference without a type falls through untouched to the
                # generic conversion at the bottom of the loop.
                ref_type = el.attrib.get("type")
                if ref_type == "note" or ref_type == "footnote":
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request, script_name="/scripts/get_notes.py", target=target)
                    # Use the note number as label, "*" when missing or empty.
                    el.text = el.attrib.get("n") or "*"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    el.attrib["id"] = target.replace('#', '') + '-link-back'
                    # attributes for popover note
                    el.attrib['class'] = "note-ref"
                    el.attrib['tabindex'] = "0"
                    el.attrib['data-toggle'] = "popover"
                    el.attrib['data-container'] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
                elif ref_type == "cross":
                    c.execute("SELECT philo_id FROM toms WHERE id=? LIMIT 1", (el.attrib["target"],))
                    # fetchone() yields None (not an empty row) when the
                    # target is unknown; degrade to a plain span then.
                    row = c.fetchone()
                    if row is None:
                        el.tag = "span"
                        continue
                    object_id = row[0]
                    el.tag = "a"
                    el.attrib["href"] = 'navigate/%s' % '/'.join([i for i in object_id.split() if i != "0"])
                    el.attrib["class"] = "xml-ref-cross"
                    del el.attrib["target"]
                elif ref_type == "search":
                    metadata, metadata_value = el.attrib["target"].split(':')
                    params = {metadata: metadata_value, "report": "bibliography"}
                    el.tag = "a"
                    el.attrib["href"] = make_absolute_query_link(config, [], **params)
                    del el.attrib["target"]
            elif el.tag == "note":
                # End notes live inside a div whose type is "notes".
                in_end_note = False
                for ancestor in el.iterancestors():
                    if ancestor.tag.startswith('div') and ancestor.attrib.get("type") == "notes":
                        in_end_note = True
                        break
                if note:  # in footnote
                    el.tag = "div"
                elif in_end_note:  # in end note
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    c.execute('select parent from refs where target=? and parent like ?',
                              (el.attrib['id'], str(philo_id[0]) + " %"))
                    row = c.fetchone()
                    # Without a referencing object there is nothing to link back to.
                    if row is not None:
                        object_id = row[0]
                        link_back = etree.Element("a")
                        link_back.attrib['href'] = 'navigate/%s%s' % (
                            '/'.join([i for i in object_id.split() if i != "0"]),
                            '#%s-link-back' % el.attrib['id'])
                        link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                        link_back.attrib['role'] = "button"
                        link_back.text = "Go back to text"
                        el.append(link_back)
                else:  # inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"
                    for child in el:
                        note_content(child)
                    # Insert the popover anchor right before the note itself.
                    note_parent = el.getparent()
                    attribs = {"class": "note", "tabindex": "0", "data-toggle": "popover", "data-container": "body",
                               "data-placement": "right", "data-trigger": "focus"}
                    new_anchor = etree.Element("a", attrib=attribs)
                    new_anchor.text = "note"
                    note_parent.insert(note_parent.index(el), new_anchor)
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "img":
                el.attrib["onerror"] = "this.style.display='none'"
            elif el.tag == "pb" and "n" in el.attrib:
                el.tag = "span"
                el.attrib["class"] = "xml-pb-image"
                # Parenthesised on purpose: without an image URL root there is
                # nothing to link to, whatever attributes the element carries.
                if config.page_images_url_root and ("facs" in el.attrib or "id" in el.attrib):
                    if "facs" in el.attrib:
                        img = el.attrib["facs"]
                    else:
                        img = el.attrib["id"]
                    img_split = img.split()
                    current_obj_img.append(img_split[0])
                    page_link = etree.Element("a")
                    el.append(page_link)
                    page_link.attrib["href"] = os.path.join(config.page_images_url_root,
                                                            img_split[0]) + config.page_image_extension
                    # A second file name, when given, is the large version.
                    large_img = img_split[1] if len(img_split) == 2 else img_split[0]
                    page_link.attrib["large-img"] = os.path.join(config.page_images_url_root,
                                                                 large_img) + config.page_image_extension
                    page_link.text = "[page " + el.attrib["n"] + "]"
                    if config.external_page_images:
                        page_link.attrib["target"] = "_blank"
                    else:
                        page_link.attrib['class'] = "page-image-link"
                        page_link.attrib['data-gallery'] = ''
                else:
                    # No image to link to: show the page number as text.
                    if el.attrib["n"]:
                        el.text = "--%s--" % el.attrib["n"]
                    else:
                        el.text = "--na--"
                parent = el.getparent()
                grand_parent = parent.getparent() if parent is not None else None
                if grand_parent is not None and grand_parent.attrib.get("class") == "xml-row":
                    # Move page outside of table row to avoid display issues;
                    # the text that followed it stays behind in a new span.
                    tail = etree.Element("span")
                    tail.text = el.tail
                    el.tail = ""
                    great_grand_parent = grand_parent.getparent()
                    grand_parent_index = great_grand_parent.index(grand_parent)
                    el_index = parent.index(el)
                    great_grand_parent.insert(grand_parent_index + 1, el)
                    parent.insert(el_index, tail)
            if el.tag == "graphic":
                if config.page_images_url_root:
                    imgs = el.attrib["facs"].split()
                    current_graphic_img.append(imgs[0])
                    el.attrib["src"] = os.path.join(config.page_images_url_root, imgs[0])
                    el.tag = "img"
                    el.attrib["class"] = "inline-img"
                    el.attrib['data-gallery'] = ''
                    el.attrib["inline-img"] = ""
                    if len(imgs) > 1:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[1])
                    else:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[0])
                    # Not every graphic carries a url attribute.
                    el.attrib.pop("url", None)
            elif el.tag == "ptr":
                if "facs" in el.attrib and config.page_images_url_root:
                    el.tag = "a"
                    el.attrib["href"] = os.path.join(config.page_images_url_root, el.attrib["facs"])
                    el.text = el.attrib["rend"]
                    el.attrib["external-img"] = ""
                    el.attrib["class"] = "external-img"
                    el.attrib["large-img"] = el.attrib["href"]
                    del el.attrib["rend"]
                    del el.attrib["facs"]
                    el.attrib['data-gallery'] = ''
            elif el.tag == "philoHighlight":
                # The tail is None when the marker is directly followed by
                # another tag; treat that as "no word".
                tail_text = el.tail or ""
                word_match = re.match(word_regex, tail_text, re.U)
                if word_match:
                    el.text = tail_text[:word_match.end()]
                    el.tail = tail_text[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in VALID_HTML_TAGS:
                xml_to_html_class(el)
        except Exception as exception:
            # Best effort: report the problem and carry on with the next element.
            print(exception, file=sys.stderr)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    # Remove spaces around hyphens and apostrophes.
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output)

    # Notes don't need to fetch images, and callers may opt out of them.
    if note or not images:
        return (output, {})

    # Page images
    output, images = page_images(config, output, current_obj_img, current_graphic_img, philo_id)
    return output, images
def format_concordance(text, word_regex, bytes=None):
    """Build the HTML snippet shown for one concordance hit.

    The fragment is trimmed with the module-level ``begin_match``,
    ``start_cutoff_match`` and ``end_match`` patterns, a ``<philoHighlight/>``
    marker is spliced in at every usable offset, and the result is parsed
    with ``FragmentParserParse``.  Each element of the parsed tree is then
    rewritten into an HTML-friendly tag before serialisation.  Only entity
    conversion and leading-punctuation stripping are applied to the
    serialised text; spaces around hyphens or apostrophes are left alone.

    Args:
        text: The fragment to format.
        word_regex: Accepted for interface compatibility but ignored; the
            highlighted word is always matched with ``\\w+``.
        bytes: Optional positions (in the untrimmed ``text``) at which
            highlight markers are inserted.  Positions that fall outside the
            trimmed text are ignored.  The name shadows the builtin but is
            part of the public signature.

    Returns:
        The serialised fragment, with the enclosing
        ``<div class="philologic-fragment">`` wrapper (if present) removed.
    """
    word_regex = r"\w+"  # text is converted to unicode so we use the \w boundary to match
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        # Everything up to the end of the match is dropped, so that is the
        # amount by which the offsets have to be shifted.
        removed_from_start = begin.end(0)
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += start_cutoff.end(0)
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]

    if bytes:
        text_length = len(text)
        pieces = []
        last_offset = 0
        for offset in bytes:
            offset -= removed_from_start
            if 0 < offset < text_length:
                pieces.append(text[last_offset:offset])
                pieces.append("<philoHighlight/>")
                last_offset = offset
        pieces.append(text[last_offset:])
        text = "".join(pieces)

    xml = FragmentParserParse(text)
    allowed_tags = {'philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'}
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        if el.tag not in allowed_tags:
            el.tag = 'span'
        # NOTE(review): "title" and "q" are not in allowed_tags, so they were
        # already turned into plain spans above and these two branches never
        # run.  Kept as-is to avoid changing the rendered markup.
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        # Kill ids in order to avoid the risk of having duplicate ids in the HTML.
        if "id" in el.attrib and el.tag != "l":
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        # NOTE(review): likewise unreachable - "img" has become "span" by now.
        elif el.tag == "img":
            # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            # The tail is None when the marker is directly followed by another
            # tag or by the end of the fragment; treat that as "no word".
            tail = el.tail or ""
            word_match = re.match(word_regex, tail, re.U)
            if word_match:
                el.text = tail[:word_match.end()]
                el.tail = tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in valid_html_tags:
            xml_to_html_class(el)

    # tostring() hands back encoded output; decode it so the text patterns
    # below operate on text.
    output = etree.tostring(xml).decode('utf8', 'ignore')
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    output = convert_entities(output)
    output = strip_start_punctuation.sub("", output)
    return output
def format_concordance(text, word_regex, bytes=None):
    """Build the HTML snippet shown for one concordance hit.

    The fragment is trimmed with the module-level ``begin_match``,
    ``start_cutoff_match`` and ``end_match`` patterns, a ``<philoHighlight/>``
    marker is spliced in at every usable offset, and the result is parsed
    with ``FragmentParserParse``.  Each element of the parsed tree is then
    rewritten into an HTML-friendly tag (``ab``/``ln`` become ``l``) before
    serialisation.

    Args:
        text: The fragment to format.
        word_regex: Accepted for interface compatibility but ignored; the
            highlighted word is always matched with ``\\w+``.
        bytes: Optional positions (in the untrimmed ``text``) at which
            highlight markers are inserted.  Positions that fall outside the
            trimmed text are ignored.  The name shadows the builtin but is
            part of the public signature.

    Returns:
        The serialised fragment, with the enclosing
        ``<div class="philologic-fragment">`` wrapper (if present) removed.
    """
    word_regex = r"\w+"  # text is converted to unicode so we use the \w boundary to match
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        # Everything up to the end of the match is dropped, so that is the
        # amount by which the offsets have to be shifted.
        removed_from_start = begin.end(0)
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += start_cutoff.end(0)
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]

    if bytes:
        text_length = len(text)
        pieces = []
        last_offset = 0
        for offset in bytes:
            offset -= removed_from_start
            if 0 < offset < text_length:
                pieces.append(text[last_offset:offset])
                pieces.append("<philoHighlight/>")
                last_offset = offset
        pieces.append(text[last_offset:])
        text = "".join(pieces)

    xml = FragmentParserParse(text)
    allowed_tags = {'philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'}
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        if el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        # NOTE(review): "title" and "q" are not in allowed_tags, so they were
        # already turned into plain spans above and these two branches never
        # run.  Kept as-is to avoid changing the rendered markup.
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        # Kill ids in order to avoid the risk of having duplicate ids in the HTML.
        if "id" in el.attrib and el.tag != "l":
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        # NOTE(review): likewise unreachable - "img" has become "span" by now.
        elif el.tag == "img":
            # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            # The tail is None when the marker is directly followed by another
            # tag or by the end of the fragment; treat that as "no word".
            tail = el.tail or ""
            word_match = re.match(word_regex, tail, re.U)
            if word_match:
                el.text = tail[:word_match.end()]
                el.tail = tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in valid_html_tags:
            xml_to_html_class(el)

    # tostring() hands back encoded output; decode it so the text patterns
    # below operate on text.
    output = etree.tostring(xml).decode('utf8', 'ignore')
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    # Remove spaces around hyphens and apostrophes.
    output = space_match.sub('\\1', output)
    output = convert_entities(output)
    output = strip_start_punctuation.sub("", output)
    return output