def format_strip(text, bytes=[], chars=40):
    """Trim a raw text fragment to clean boundaries and return it without markup.

    The fragment is cut down in three steps: a leading run that ends in ">"
    before any "<" is removed, then the leading run of characters that are
    neither a space nor "<", and finally a trailing "<..." that is never
    closed.

    Each offset in ``bytes`` is shifted by the number of characters removed
    from the start.  Offsets that still fall strictly inside the trimmed text
    get a ``<philoHighlight/>`` marker inserted at that position.  The result
    is parsed with ``FragmentParser`` and handed to ``clean_tags``.

    ``chars`` is accepted but not used by this function.  The ``bytes``
    default list is never mutated here, only rebound.
    """
    offset_shift = 0

    # Fragment starts in the middle of a tag: drop up to and including the ">".
    leading_tag = re.search(r'^[^<]*?>', text)
    if leading_tag:
        offset_shift = len(leading_tag.group(0))
        text = text[leading_tag.end(0):]

    # Drop the leading run of non-space, non-"<" characters.
    leading_word = re.search(r'^[^ <]+', text)
    if leading_word:
        offset_shift += len(leading_word.group(0))
        text = text[leading_word.end(0):]

    # Fragment ends in the middle of a tag: drop the unterminated "<...".
    trailing_tag = re.search(r'<[^>]*?\Z', text)
    if trailing_tag:
        text = text[:trailing_tag.start(0)]

    if bytes:
        shifted = [b - offset_shift for b in bytes]
        pieces = []
        cursor = 0
        for position in shifted:
            # Offsets that landed in a removed region are skipped.
            if 0 < position < len(text):
                pieces.append(text[cursor:position] + "<philoHighlight/>")
                cursor = position
        text = "".join(pieces) + text[cursor:]

    tree = FragmentParser.parse(text)
    stripped = clean_tags(tree)
    # Remove spaces around hyphens and apostrophes.
    return re.sub(r" ?([-'])+ ", '\\1', stripped)
def format_strip(text, bytes=[]):
    """Strip formatting from a text fragment for HTML rendering.

    According to the original note, this is called from kwic.py and
    frequency.py.

    The fragment is trimmed with the module-level patterns ``begin_match``,
    ``start_cutoff_match`` and ``end_match``.  Offsets in ``bytes`` are
    shifted by the number of characters trimmed from the start, and a
    ``<philoHighlight/>`` marker is inserted at every offset that still lies
    strictly inside the trimmed text.  The text is then parsed with
    ``FragmentParser``, reduced by ``clean_tags`` and post-processed with
    ``space_match``.

    The ``bytes`` default list is never mutated here, only rebound.
    """
    trimmed_prefix = 0

    head = begin_match.search(text)
    if head:
        trimmed_prefix = len(head.group(0))
        text = text[head.end(0):]

    partial = start_cutoff_match.search(text)
    if partial:
        trimmed_prefix += len(partial.group(0))
        text = text[partial.end(0):]

    tail = end_match.search(text)
    if tail:
        text = text[:tail.start(0)]

    if bytes:
        offsets = [b - trimmed_prefix for b in bytes]
        chunks = []
        previous = 0
        for offset in offsets:
            # Only offsets strictly inside the trimmed text get a marker.
            if offset > 0 and offset < len(text):
                chunks.append(text[previous:offset] + "<philoHighlight/>")
                previous = offset
        text = "".join(chunks) + text[previous:]

    parsed = FragmentParser.parse(text)
    cleaned = clean_tags(parsed)
    # Remove spaces around hyphens and apostrophes.
    return space_match.sub('\\1', cleaned)
def format_concordance(text, word_regex, bytes=[]):
    """Turn a raw text fragment into an HTML snippet for a concordance line.

    The fragment is trimmed with ``begin_match``, ``start_cutoff_match`` and
    ``end_match``.  Offsets in ``bytes`` are shifted by the number of
    characters trimmed from the start, and a ``<philoHighlight/>`` marker is
    inserted at every offset that still lies strictly inside the trimmed
    text.  After parsing with ``FragmentParser`` the tags are rewritten:

    * ``title`` and ``q`` become ``span`` with class ``xml-title`` / ``xml-q``;
    * any other tag outside ``allowed_tags`` becomes a plain ``span``;
    * ``ab`` and ``ln`` become ``l``;
    * ``sc`` and ``scx`` become ``span`` with class ``small-caps``;
    * each highlight marker becomes ``span`` with class ``highlight`` and
      takes over the part of its tail matched by ``word_regex``;
    * ``id`` attributes are removed;
    * tags still missing from ``valid_html_tags`` go through
      ``xml_to_html_class``.

    Returns the serialized markup after ``space_match``, ``convert_entities``
    and ``strip_start_punctuation`` have been applied.

    The ``bytes`` default list is never mutated here, only rebound.
    """
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]

    if bytes:
        shifted = [b - removed_from_start for b in bytes]
        pieces = []
        last_offset = 0
        for b in shifted:
            # Offsets that landed in a trimmed region are skipped.
            if 0 < b < len(text):
                pieces.append(text[last_offset:b] + "<philoHighlight/>")
                last_offset = b
        text = "".join(pieces) + text[last_offset:]

    xml = FragmentParser.parse(text)
    allowed_tags = set(['philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker',
                        'stage', 'i', 'sc', 'scx', 'br'])
    # NOTE(review): if xml.iter() also yields the wrapper div, it is renamed to
    # "span" below and the <div> stripping after serialization will not match
    # -- confirm against FragmentParser.
    for el in xml.iter():
        # "title" and "q" are not in allowed_tags, so they have to be tested
        # before the generic fallback; otherwise these branches can never run.
        if el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        elif el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        if "id" in el.attrib:
            # Kill ids to avoid the risk of duplicate ids in the HTML.
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        if el.tag == "philoHighlight":
            # A marker directly followed by another tag has no tail text
            # (tail is None); re.match would raise TypeError on it.
            tail = el.tail or ""
            word_match = re.match(word_regex, tail, re.U)
            if word_match:
                el.text = tail[:word_match.end()]
                el.tail = tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in valid_html_tags:
            el = xml_to_html_class(el)

    output = etree.tostring(xml)
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    # Remove spaces around hyphens and apostrophes.
    output = space_match.sub('\\1', output)
    output = convert_entities(output)
    output = strip_start_punctuation.sub("", output)
    return output
def format_concordance(text, bytes=[]):
    """Turn a raw text fragment into an HTML snippet for a concordance line.

    The fragment is trimmed with ``begin_match``, ``start_cutoff_match`` and
    ``end_match``.  Offsets in ``bytes`` are shifted by the number of
    characters trimmed from the start, and a ``<philoHighlight/>`` marker is
    inserted at every offset that still lies strictly inside the trimmed
    text.  After parsing with ``FragmentParser``:

    * tags outside ``allowed_tags`` become ``span``;
    * ``ab`` and ``ln`` become ``l``;
    * ``sc`` and ``scx`` become ``span`` with class ``small-caps``;
    * each highlight marker becomes ``span`` with class ``highlight`` and
      takes over the leading word characters of its tail;
    * ``id`` attributes are removed.

    Returns the serialized markup with the fragment wrapper stripped and
    ``space_match`` applied.

    The ``bytes`` default list is never mutated here, only rebound.
    """
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]

    if bytes:
        shifted = [b - removed_from_start for b in bytes]
        pieces = []
        last_offset = 0
        for b in shifted:
            # Offsets that landed in a trimmed region are skipped.
            if 0 < b < len(text):
                pieces.append(text[last_offset:b] + "<philoHighlight/>")
                last_offset = b
        text = "".join(pieces) + text[last_offset:]

    xml = FragmentParser.parse(text)
    allowed_tags = set(['philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker',
                        'stage', 'i', 'sc', 'scx', 'br'])
    # NOTE(review): if xml.iter() also yields the wrapper div, it is renamed to
    # "span" below and the <div> stripping after serialization will not match
    # -- confirm against FragmentParser.
    for el in xml.iter():
        if el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        if "id" in el.attrib:
            # Kill ids to avoid the risk of duplicate ids in the HTML.
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        if el.tag == "philoHighlight":
            # A marker directly followed by another tag has no tail text
            # (tail is None); re.match would raise TypeError on it.
            tail = el.tail or ""
            word_match = re.match(r"\w+", tail, re.U)
            if word_match:
                el.text = tail[:word_match.end()]
                el.tail = tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"

    output = etree.tostring(xml)
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    # Remove spaces around hyphens and apostrophes.
    output = space_match.sub('\\1', output)
    return output
def format(text, bytes=[]):
    """Convert a raw XML text chunk into HTML for display.

    A ``<philoHighlight/>`` marker is inserted at every offset in ``bytes``
    (offsets are used as given, without bounds checks).  The text is wrapped
    in a ``<div>``, parsed with ``FragmentParser`` and rewritten in place:

    * ``sc`` / ``scx`` become ``span`` with class ``small-caps``;
    * ``head`` becomes ``b`` with class ``headword`` followed by a ``br``;
    * ``pb`` with both ``fac`` and ``n`` becomes a ``p`` holding a link to the
      page image;
    * ``figure`` whose first child is ``graphic`` becomes a link around a
      thumbnail image;
    * each highlight marker becomes ``span`` with class ``highlight`` and
      takes over the leading word characters of its tail.

    Conversion is best effort: an element whose conversion fails is left in
    whatever state it had reached and processing continues with the next one.

    Returns the serialized markup, entity-converted and UTF-8 encoded.

    The ``bytes`` default list is never mutated here.
    """
    if bytes:
        pieces = []
        last_offset = 0
        for b in bytes:
            pieces.append(text[last_offset:b] + "<philoHighlight/>")
            last_offset = b
        text = "".join(pieces) + text[last_offset:]
    text = "<div>" + text + "</div>"
    xml = FragmentParser.parse(text)

    for el in xml.iter():
        try:
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
                el.append(etree.Element("br"))
            elif el.tag == "pb" and "fac" in el.attrib and "n" in el.attrib:
                el.tag = "p"
                el.append(etree.Element("a"))
                el[-1].attrib["href"] = 'http://artflx.uchicago.edu/images/encyclopedie/' + el.attrib["fac"]
                el[-1].text = "[page " + el.attrib["n"] + "]"
                el[-1].attrib['class'] = "page_image_link"
            elif el.tag == "figure":
                # A figure without children is left untouched.
                if len(el) and el[0].tag == "graphic":
                    img_url = el[0].attrib["url"].replace(":", "_")
                    # The plate directory is named after the leading digits of the URL.
                    volume = re.match(r"\d+", img_url).group()
                    url_prefix = "http://artflx.uchicago.edu/images/encyclopedie/V" + volume + "/plate_"
                    el.tag = "a"
                    el.attrib["href"] = url_prefix + img_url + ".jpeg"
                    el[0].tag = "img"
                    el[0].attrib["src"] = url_prefix + img_url + ".sm.jpeg"
                    el[0].attrib["class"] = "plate_img"
                    el.attrib["class"] = "plate_img_link"
                    del el[0].attrib["url"]
                    el.append(etree.Element("br"))
            elif el.tag == "philoHighlight":
                # The marker may have no tail (None) or a tail that does not
                # start with a word character; it must still become a valid
                # HTML span instead of staying a <philoHighlight> tag.
                tail = el.tail or ""
                word_match = re.match(r"\w+", tail, re.U)
                if word_match:
                    el.text = tail[:word_match.end()]
                    el.tail = tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
        except Exception:
            # Deliberate best effort: malformed markup (missing attributes,
            # URLs without a leading volume number, ...) must not abort the
            # rendering of the whole text.  Unlike a bare "except:", this
            # lets SystemExit and KeyboardInterrupt propagate.
            pass

    output = etree.tostring(xml)
    # Remove spaces around hyphens and apostrophes.
    output = re.sub(r" ?([-'])+ ", '\\1', output)
    return convert_entities(output.decode('utf-8', 'ignore')).encode('utf-8')
def format_concordance(text, bytes=[]):
    """Turn a raw text fragment into an HTML snippet for a concordance line.

    A leading run that ends in ">" before any "<" and a trailing "<..." that
    is never closed are removed.  Offsets in ``bytes`` are shifted by the
    number of characters removed from the start, and a ``<philoHighlight/>``
    marker is inserted at every offset that still lies strictly inside the
    trimmed text.  After parsing with ``FragmentParser``:

    * ``sc`` / ``scx`` become ``span`` with class ``small-caps``;
    * ``head``, ``pb`` and ``p`` become plain ``span``;
    * each highlight marker becomes ``span`` with class ``highlight`` and
      takes over the leading word characters of its tail.

    Returns the serialized markup with the fragment wrapper stripped.

    The ``bytes`` default list is never mutated here, only rebound.
    """
    removed_from_start = 0
    begin = re.search('^[^<]*?>', text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    end = re.search('<[^>]*?$', text)
    if end:
        text = text[:end.start(0)]

    if bytes:
        shifted = [b - removed_from_start for b in bytes]
        pieces = []
        last_offset = 0
        for b in shifted:
            # Offsets that landed in a removed region are skipped.
            if 0 < b < len(text):
                pieces.append(text[last_offset:b] + "<philoHighlight/>")
                last_offset = b
        text = "".join(pieces) + text[last_offset:]

    xml = FragmentParser.parse(text)
    for el in xml.iter():
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        elif el.tag in ("head", "pb", "p"):
            el.tag = "span"
        elif el.tag == "philoHighlight":
            # A marker directly followed by another tag has no tail text
            # (tail is None); re.match would raise TypeError on it.
            tail = el.tail or ""
            word_match = re.match(r"\w+", tail, re.U)
            if word_match:
                el.text = tail[:word_match.end()]
                el.tail = tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"

    output = etree.tostring(xml)
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    return output
def format_strip(text, bytes=[], chars=40, concordance_report=False):
    """Remove formatting from a text fragment for HTML rendering.

    According to the original note, this is called from kwic.py,
    relevance.py and frequency.py.

    The fragment is trimmed with the module-level patterns ``begin_match``,
    ``start_cutoff_match`` and ``end_match``.  Offsets in ``bytes`` are
    shifted by the number of characters trimmed from the start, and a
    ``<philoHighlight/>`` marker is inserted at every offset that still lies
    strictly inside the trimmed text.  The parsed fragment is reduced with
    ``clean_tags_for_concordance`` when ``concordance_report`` is true and
    with ``clean_tags`` otherwise, then post-processed with ``space_match``.

    ``chars`` is accepted but not used by this function.  The ``bytes``
    default list is never mutated here, only rebound.
    """
    trimmed_prefix = 0

    head = begin_match.search(text)
    if head:
        trimmed_prefix = len(head.group(0))
        text = text[head.end(0):]

    partial = start_cutoff_match.search(text)
    if partial:
        trimmed_prefix += len(partial.group(0))
        text = text[partial.end(0):]

    tail = end_match.search(text)
    if tail:
        text = text[:tail.start(0)]

    if bytes:
        offsets = [b - trimmed_prefix for b in bytes]
        chunks = []
        previous = 0
        for offset in offsets:
            # Only offsets strictly inside the trimmed text get a marker.
            if 0 < offset < len(text):
                chunks.append(text[previous:offset] + "<philoHighlight/>")
                previous = offset
        text = "".join(chunks) + text[previous:]

    parsed = FragmentParser.parse(text)
    if concordance_report:
        cleaned = clean_tags_for_concordance(parsed)
    else:
        cleaned = clean_tags(parsed)
    # Remove spaces around hyphens and apostrophes.
    return space_match.sub('\\1', cleaned)
def format(text, bytes=[]):
    """Convert a raw XML text chunk into HTML for display.

    A ``<philoHighlight/>`` marker is inserted at every offset in ``bytes``
    (offsets are used as given, without bounds checks).  The text is wrapped
    in a ``<div>``, parsed with ``FragmentParser`` and rewritten in place:

    * ``sc`` / ``scx`` become ``span`` with class ``small-caps``;
    * ``head`` becomes ``b`` with class ``headword`` followed by a ``br``;
    * ``list`` / ``item`` become ``ul`` / ``li``;
    * ``note`` becomes ``span`` with class ``note-content`` and each of its
      children is passed to ``xml_to_html_class``;
    * ``ab`` and ``ln`` become ``l``;
    * ``pb`` with both ``fac`` and ``n`` becomes a ``p`` holding a link to the
      page image;
    * ``figure`` whose first child is ``graphic`` becomes a link around a
      thumbnail image;
    * each highlight marker becomes ``span`` with class ``highlight`` and
      takes over the leading word characters of its tail.

    Conversion is best effort: an element whose conversion fails is left in
    whatever state it had reached and processing continues with the next one.

    Returns the serialized markup, entity-converted and UTF-8 encoded.

    The ``bytes`` default list is never mutated here.
    """
    if bytes:
        pieces = []
        last_offset = 0
        for b in bytes:
            pieces.append(text[last_offset:b] + "<philoHighlight/>")
            last_offset = b
        text = "".join(pieces) + text[last_offset:]
    text = "<div>" + text + "</div>"
    xml = FragmentParser.parse(text)

    for el in xml.iter():
        try:
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
                el.append(etree.Element("br"))
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "note":
                el.tag = 'span'
                el.attrib['class'] = "note-content"
                for child in el:
                    # The return value was only bound to the loop variable in
                    # the original, so it is not needed here.
                    xml_to_html_class(child)
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "pb" and "fac" in el.attrib and "n" in el.attrib:
                el.tag = "p"
                el.append(etree.Element("a"))
                el[-1].attrib["href"] = 'http://artflx.uchicago.edu/images/encyclopedie/' + el.attrib["fac"]
                el[-1].text = "[page " + el.attrib["n"] + "]"
                el[-1].attrib['class'] = "page_image_link"
            elif el.tag == "figure":
                # A figure without children is left untouched.
                if len(el) and el[0].tag == "graphic":
                    img_url = el[0].attrib["url"].replace(":", "_")
                    # The plate directory is named after the leading digits of the URL.
                    volume = re.match(r"\d+", img_url).group()
                    url_prefix = "http://artflx.uchicago.edu/images/encyclopedie/V" + volume + "/plate_"
                    el.tag = "a"
                    el.attrib["href"] = url_prefix + img_url + ".jpeg"
                    el[0].tag = "img"
                    el[0].attrib["src"] = url_prefix + img_url + ".sm.jpeg"
                    el[0].attrib["class"] = "plate_img"
                    el.attrib["class"] = "plate_img_link"
                    del el[0].attrib["url"]
                    el.append(etree.Element("br"))
            elif el.tag == "philoHighlight":
                # The marker may have no tail (None) or a tail that does not
                # start with a word character; it must still become a valid
                # HTML span instead of staying a <philoHighlight> tag.
                tail = el.tail or ""
                word_match = re.match(r"\w+", tail, re.U)
                if word_match:
                    el.text = tail[:word_match.end()]
                    el.tail = tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
        except Exception:
            # Deliberate best effort: malformed markup (missing attributes,
            # URLs without a leading volume number, ...) must not abort the
            # rendering of the whole text.  Unlike a bare "except:", this
            # lets SystemExit and KeyboardInterrupt propagate.
            pass

    output = etree.tostring(xml)
    # Collapse the spaces around runs of hyphens, apostrophes, semicolons and
    # periods down to a single trailing space.
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    return convert_entities(output.decode('utf-8', 'ignore')).encode('utf-8')
def format_text_object(obj, text, config, request, word_regex, bytes=[], note=False):
    """Convert the raw XML text of a text object into HTML for display.

    A ``<philoHighlight/>`` marker is inserted at every offset in ``bytes``
    (offsets are used as given, without bounds checks).  The text is wrapped
    in a ``<div>``, parsed with ``FragmentParser`` and rewritten in place:

    * ``sc`` / ``scx`` become ``span`` with class ``small-caps``;
    * ``head`` becomes ``b`` with class ``headword``;
    * ``list`` / ``item`` become ``ul`` / ``li``;
    * ``title`` and ``q`` become ``span`` with class ``xml-title`` / ``xml-q``;
    * ``ref`` of type ``note`` becomes a popover ``span`` whose ``data-ref``
      points at the get_notes.py script;
    * ``note`` inside a ``div`` of type ``notes`` becomes a ``div`` with a
      link back to the referring text; any other ``note`` becomes an inline
      ``span`` preceded by a popover anchor;
    * ``ab`` and ``ln`` become ``l``;
    * ``pb`` with ``n`` and either ``fac`` or ``id`` becomes a ``p`` holding a
      page-image link, and the image name is collected;
    * ``figure`` whose first child is ``graphic`` becomes an inline image
      container;
    * each highlight marker becomes ``span`` with class ``highlight`` and
      takes over the part of its tail matched by ``word_regex``;
    * tags still missing from ``valid_html_tags`` go through
      ``xml_to_html_class``.

    Conversion is best effort: if an element fails, the error is written to
    stderr and processing continues with the next element.

    Returns a 2-tuple.  When ``note`` is true it is ``(output, {})``;
    otherwise it is the pair produced by ``page_images``.

    The ``bytes`` default list is never mutated here.
    """
    import sys  # kept local, as in the original error handler

    philo_id = obj.philo_id
    if bytes:
        pieces = []
        last_offset = 0
        for b in bytes:
            pieces.append(text[last_offset:b] + "<philoHighlight/>")
            last_offset = b
        text = "".join(pieces) + text[last_offset:]
    current_obj_img = []
    text = "<div>" + text + "</div>"
    xml = FragmentParser.parse(text)
    c = obj.db.dbh.cursor()

    for el in xml.iter():
        try:
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "ref":
                # .get(): a ref without a type attribute is simply not a note
                # reference and should not raise.
                if el.attrib.get("type") == "note":
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request,
                                                    script_name="/scripts/get_notes.py",
                                                    target=target)
                    if "n" in el.attrib:
                        el.text = el.attrib["n"]
                    else:
                        el.text = "note"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    el.attrib["id"] = target.replace('#', '') + '-link-back'
                    del el.attrib["target"]
                    # Attributes for the popover note.
                    el.attrib['class'] = "note-ref"
                    el.attrib['tabindex'] = "0"
                    el.attrib['data-toggle'] = "popover"
                    el.attrib['data-container'] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
            elif el.tag == "note":
                # End notes live inside a <div type="notes">.  The wrapper
                # <div> added above has no type attribute, so a plain
                # attrib["type"] lookup would raise KeyError for every inline
                # note and abort its conversion.
                in_end_note = False
                for div in el.iterancestors(tag="div"):
                    if div.attrib.get("type") == "notes":
                        in_end_note = True
                        break
                if in_end_note:
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    c.execute('select parent, start_byte from refs where target=? and parent like ?',
                              (el.attrib['id'], str(philo_id[0]) + " %"))
                    row = c.fetchone()
                    # No matching reference: keep the note, omit the link.
                    if row is not None:
                        object_id = row[0]
                        link_back = etree.Element("a")
                        link_back.attrib['href'] = 'navigate/%s%s' % (
                            '/'.join(object_id.split()[:2]),
                            '#%s-link-back' % el.attrib['id'])
                        link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                        link_back.attrib['role'] = "button"
                        link_back.text = "Go back to text"
                        el.append(link_back)
                else:
                    # Inline notes.
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"
                    for child in el:
                        # The return value was only bound to the loop variable
                        # in the original, so it is not needed here.
                        note_content(child)
                    # Insert an anchor before this element by scanning
                    # through the parent.
                    parent = el.getparent()
                    for i, child in enumerate(parent):
                        if child == el:
                            attribs = {"class": "note",
                                       "tabindex": "0",
                                       "data-toggle": "popover",
                                       "data-container": "body",
                                       "data-placement": "right",
                                       "data-trigger": "focus"}
                            parent.insert(i, etree.Element("a", attrib=attribs))
                            new_anchor = parent[i]
                            new_anchor.text = "note"
                            # The element occurs once among its siblings.
                            break
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "pb" and "n" in el.attrib:
                if "fac" in el.attrib or "id" in el.attrib:
                    # "fac" takes precedence over "id" as the image name.
                    if "fac" in el.attrib:
                        img = el.attrib["fac"]
                    else:
                        img = el.attrib["id"]
                    current_obj_img.append(img)
                    el.tag = "p"
                    el.append(etree.Element("a"))
                    el[-1].attrib["href"] = config.page_images_url_root + '/' + img
                    el[-1].text = "[page " + el.attrib["n"] + "]"
                    el[-1].attrib['class'] = "page-image-link"
                    el[-1].attrib['data-gallery'] = ''
            elif el.tag == "figure":
                # A figure without children is left untouched.
                if len(el) and el[0].tag == "graphic":
                    img_url = el[0].attrib["url"].replace(":", "_")
                    # The plate directory is named after the leading digits of the URL.
                    volume = re.match(r"\d+", img_url).group()
                    url_prefix = config.page_images_url_root + '/V' + volume + "/plate_"
                    el.tag = "span"
                    el.attrib["href"] = url_prefix + img_url + ".jpeg"
                    el[0].tag = "img"
                    el[0].attrib["src"] = url_prefix + img_url + ".sm.jpeg"
                    el[0].attrib["class"] = "inline-img"
                    el.attrib["class"] = "inline-img-container"
                    del el[0].attrib["url"]
                    clear_float = etree.Element("span")
                    clear_float.attrib['style'] = 'clear:both;'
                    el[0].append(clear_float)
            elif el.tag == "philoHighlight":
                # A marker directly followed by another tag has no tail text
                # (tail is None); re.match would raise TypeError on it.
                tail = el.tail or ""
                word_match = re.match(word_regex, tail, re.U)
                if word_match:
                    el.text = tail[:word_match.end()]
                    el.tail = tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in valid_html_tags:
                el = xml_to_html_class(el)
        except Exception as e:
            # Best effort: report the problem and move on to the next element.
            sys.stderr.write("%s\n" % (e,))

    output = etree.tostring(xml)
    # Collapse the spaces around runs of hyphens, apostrophes, semicolons and
    # periods down to a single trailing space.
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output.decode('utf-8', 'ignore')).encode('utf-8')

    if note:
        # Notes don't need to fetch images.
        return (output, {})

    # Page images.
    output, img_obj = page_images(config, output, current_obj_img, philo_id)
    return output, img_obj