Exemple #1
0
 def parse_tei_header(self):
     """Parse the TEI header of each file in self.textdir.

     Returns:
         list: one metadata dict per file with a usable TEI header; files
         that cannot be decoded or lack a header are reported on stdout
         and skipped.
     """
     load_metadata = []
     metadata_xpaths = self.parser_config["doc_xpaths"]
     deleted_files = []
     for file in os.scandir(self.textdir):
         data = {"filename": file.name}
         header = ""
         with open(file.path) as text_file:
             try:
                 file_content = text_file.read()
             except UnicodeDecodeError:
                 # not decodable as text: drop from the load
                 deleted_files.append(file.name)
                 continue
         try:
             # locate the <teiHeader>...</teiHeader> span (case-insensitive)
             start_header_index = re.search(r'<teiheader', file_content, re.I).start()
             end_header_index = re.search(r'</teiheader', file_content, re.I).start()
         except AttributeError:  # tag not found
             deleted_files.append(file.name)
             continue
         header = file_content[start_header_index:end_header_index]
         header = convert_entities(header)
         if self.debug:
             print("parsing %s header..." % file.name)
         # recover=True lets the parser tolerate an imperfect header fragment
         parser = etree.XMLParser(recover=True)
         try:
             tree = etree.fromstring(header, parser)
             trimmed_metadata_xpaths = []
             for field in metadata_xpaths:
                 for xpath in metadata_xpaths[field]:
                     # xpaths ending in @attr are handled via findall + get
                     attr_pattern_match = re.search(r"@([^\/\[\]]+)$", xpath)
                     if attr_pattern_match:
                         xp_prefix = xpath[:attr_pattern_match.start(0)]
                         attr_name = attr_pattern_match.group(1)
                         found = False
                         for el in tree.findall(xp_prefix):
                             if el is not None and el.get(attr_name, ""):
                                 data[field] = el.get(attr_name, "")
                                 found = True
                                 break
                         if found:
                             # BUG FIX: stop after the first matching xpath so a
                             # later, lower-priority xpath cannot overwrite the
                             # value (matches the element-text branch below)
                             break
                     else:
                         el = tree.find(xpath)
                         if el is not None and el.text is not None:
                             data[field] = el.text
                             break
             # object-level xpaths handed to later parsing stages
             trimmed_metadata_xpaths = [
                 (metadata_type, xpath, field)
                 for metadata_type in ["div", "para", "sent", "word", "page"]
                 if metadata_type in metadata_xpaths for field in metadata_xpaths[metadata_type]
                 for xpath in metadata_xpaths[metadata_type][field]
             ]
             data = self.create_year_field(data)
             if self.debug:
                 print(pretty_print(data))
             data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
             load_metadata.append(data)
         except etree.XMLSyntaxError:
             # BUG FIX: original appended the undefined name `f` here,
             # raising NameError instead of recording the bad file
             deleted_files.append(file.name)
     if deleted_files:
         for f in deleted_files:
             print("%s has no valid TEI header or contains invalid data: removing from database load..." % f)
     return load_metadata
Exemple #2
0
 def parse_dc_header(self):
     """Parse Dublin Core header"""
     load_metadata = []
     for entry in os.scandir(self.textdir):
         record = {}
         header = ""
         # accumulate everything between the opening and closing header tags
         with open(entry.path) as fh:
             for line in fh:
                 opening = re.search(r"<teiheader>|<temphead>|<head>",
                                     line, re.IGNORECASE)
                 closing = re.search(r"</teiheader>|<\/?temphead>|</head>",
                                     line, re.IGNORECASE)
                 if opening:
                     header += line[opening.start():]
                     continue
                 if closing:
                     header += line[:closing.end()]
                     break
                 header += line
         # prefer <meta name="DC.x" content="y"> entries; fall back to <dc:x>y> tags
         found = re.findall(r'<meta name="DC\.([^"]+)" content="([^"]+)"',
                            header)
         if not found:
             found = re.findall(r"<dc:([^>]+)>([^>]+)>", header)
         for name, value in found:
             record[name.lower()] = convert_entities(value)
         record["filename"] = entry.name  # place at the end in case the value was in the header
         record = self.create_year_field(record)
         if self.debug:
             print(pretty_print(record))
         load_metadata.append(record)
     return load_metadata
def format_concordance(text_in_utf8, word_regex, byte_offsets=None):
    """Trim a raw UTF-8 byte fragment to its concordance span, insert
    highlight markers, and convert it into display HTML.

    Args:
        text_in_utf8 (bytes): raw text fragment from the database.
        word_regex: regex matching a word; used to wrap the highlighted term.
        byte_offsets (list[int] | None): byte positions, relative to the
            original fragment, where <philoHighlight/> markers are inserted.
            BUG FIX: default changed from a shared mutable [] to None
            (identical falsy behavior, avoids the mutable-default pitfall).

    Returns:
        str: cleaned-up HTML for the concordance display.
    """
    removed_from_start = 0
    begin = BEGIN_MATCH.search(text_in_utf8)
    if begin:
        removed_from_start = len(begin.group(0))
        text_in_utf8 = text_in_utf8[begin.end(0):]
    start_cutoff = START_CUTOFF_MATCH.search(text_in_utf8)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text_in_utf8 = text_in_utf8[start_cutoff.end(0):]
    end = END_MATCH.search(text_in_utf8)
    if end:
        text_in_utf8 = text_in_utf8[:end.start(0)]
    if byte_offsets:
        # shift offsets to account for the bytes trimmed from the start
        byte_offsets = [b - removed_from_start for b in byte_offsets]
        new_text = b""
        last_offset = 0
        for b in byte_offsets:
            if 0 < b < len(text_in_utf8):
                new_text += text_in_utf8[last_offset:b] + b"<philoHighlight/>"
                last_offset = b
        text_in_utf8 = new_text + text_in_utf8[last_offset:]
    text = text_in_utf8.decode('utf8', 'ignore')
    xml = FragmentParserParse(text)
    allowed_tags = set(['philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'])
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        if el.tag not in allowed_tags:
            el.tag = 'span'
        # NOTE(review): "title" and "q" are not in allowed_tags, so these two
        # elif branches are unreachable (they become plain spans above) — confirm intent
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        if "id" in el.attrib and el.tag != "l":  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        elif el.tag == "img":  # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            # wrap the word following the marker inside the highlight span
            word_match = re.match(word_regex, el.tail)
            if word_match:
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in VALID_HTML_TAGS:
            el = xml_to_html_class(el)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    ## remove spaces around hyphens and apostrophes
    output = SPACE_MATCH.sub('\\1', output)
    output = convert_entities(output)
    output = STRIP_START_PUNCTUATION.sub("", output)
    return output
Exemple #4
0
 def parse_dc_header(self):
     """Parse Dublin Core metadata out of each file's header section.

     Returns:
         list: one metadata dict per file returned by self.list_files().
     """
     load_metadata = []
     for filename in self.list_files():
         data = {}
         fn = self.textdir + filename
         header = ""
         with open(fn) as fh:
             # accumulate lines between the opening and closing header tags
             for line in fh:
                 start_scan = re.search(r"<teiheader>|<temphead>|<head>", line, re.IGNORECASE)
                 end_scan = re.search(r"</teiheader>|<\/?temphead>|</head>", line, re.IGNORECASE)
                 if start_scan:
                     header += line[start_scan.start():]
                 elif end_scan:
                     header += line[:end_scan.end()]
                     break
                 else:
                     header += line
         matches = re.findall(r'<meta name="DC\.([^"]+)" content="([^"]+)"', header)
         if not matches:
             matches = re.findall(r'<dc:([^>]+)>([^>]+)>', header)
         for metadata_name, metadata_value in matches:
             # BUG FIX: re.findall on a str already yields str in Python 3;
             # the old .decode('utf-8')/.encode('utf-8') round-trip raised
             # AttributeError (str has no decode) and was removed.
             metadata_value = convert_entities(metadata_value)
             metadata_name = metadata_name.lower()
             data[metadata_name] = metadata_value
         data["filename"] = filename  # place at the end in case the value was in the header
         data = self.create_year_field(data)
         if self.debug:
             print(pretty_print(data))
         load_metadata.append(data)
     return load_metadata
Exemple #5
0
def clean_tags(element):
    """Recursively strip all tags from an element, keeping its text content.

    A philoHighlight element is rewritten as a highlight <span>; every other
    element is replaced by its text, its children's cleaned text, and its tail.
    """
    text = u''
    for child in element:
        text += clean_tags(child)
    # BUG FIX: .text and .tail are None when absent in ElementTree/lxml,
    # which made the original crash with TypeError on concatenation.
    elem_text = element.text or u''
    elem_tail = element.tail or u''
    if element.tag == "philoHighlight":
        word_match = term_match.match(convert_entities(elem_tail))
        if word_match:
            return ('<span class="highlight">' + elem_text + text +
                    elem_tail[:word_match.end()] + "</span>" + elem_tail[word_match.end():])
        text = elem_text + text + elem_tail
        return '<span class="highlight">' + elem_text + text + "</span>" + elem_tail
    return elem_text + text + elem_tail
Exemple #6
0
def clean_tags(element):
    """Remove all tags"""
    # gather the cleaned text of all children first
    inner = u''
    for child in element:
        inner += clean_tags(child)
    if element.tag != "philoHighlight":
        # ordinary element: drop the tag, keep text + children + tail
        return element.text + inner + element.tail
    word_match = term_match.match(convert_entities(element.tail))
    if word_match:
        cut = word_match.end()
        return ('<span class="highlight">' + element.text + inner +
                element.tail[:cut] + "</span>" + element.tail[cut:])
    inner = element.text + inner + element.tail
    return '<span class="highlight">' + element.text + inner + "</span>" + element.tail
Exemple #7
0
 def parse_tei_header(self):
     """Parse header in TEI files.

     Scans every file in self.textdir, extracts the <teiHeader> span,
     and evaluates the configured document-level xpaths against it.

     Returns:
         list: one metadata dict per successfully parsed file, each with
         an "options" key carrying the object-level xpath table.

     Side effects:
         Appends undecodable/headerless/invalid files to
         self.deleted_files and prints progress to stdout.
     """
     load_metadata = []
     metadata_xpaths = self.parser_config["doc_xpaths"]
     doc_count = len(os.listdir(self.textdir))
     for pos, file in enumerate(os.scandir(self.textdir)):
         data = {"filename": file.name}
         header = ""
         with open(file.path) as text_file:
             try:
                 file_content = "".join(text_file.readlines())
             except UnicodeDecodeError:
                 # file is not valid text in the default encoding: skip it
                 # NOTE(review): assumes self.deleted_files was initialized
                 # elsewhere (e.g. __init__) — confirm
                 self.deleted_files.append(file.name)
                 continue
         try:
             # locate the <teiHeader>...</teiHeader> span (case-insensitive)
             start_header_index = re.search(r"<teiheader", file_content,
                                            re.I).start()
             end_header_index = re.search(r"</teiheader", file_content,
                                          re.I).start()
         except AttributeError:  # tag not found
             self.deleted_files.append(file.name)
             continue
         header = file_content[start_header_index:end_header_index]
         header = convert_entities(header)
         if self.debug:
             print("parsing %s header..." % file.name)
         # recover=True lets lxml parse imperfect/truncated header fragments
         parser = lxml.etree.XMLParser(recover=True)
         try:
             tree = lxml.etree.fromstring(header, parser)
             trimmed_metadata_xpaths = []
             for field in metadata_xpaths:
                 for xpath in metadata_xpaths[field]:
                     xpath = xpath.rstrip(
                         "/"
                     )  # make sure there are no trailing slashes which make lxml die
                     try:
                         elements = tree.xpath(xpath)
                     except lxml.etree.XPathEvalError:
                         continue
                     for element in elements:
                         if element is not None:
                             value = ""
                             # xpath() can yield elements or attribute/text
                             # string results; handle both
                             if isinstance(element, lxml.etree._Element
                                           ) and element.text is not None:
                                 value = element.text.strip()
                             elif isinstance(
                                     element,
                                     lxml.etree._ElementUnicodeResult):
                                 value = str(element).strip()
                             if value:
                                 data[field] = value
                                 break
                     else:  # only continue looping over xpaths if no break in inner loop
                         continue
                     break
             # object-level xpaths passed along for later parsing stages
             trimmed_metadata_xpaths = [
                 (metadata_type, xpath, field) for metadata_type in
                 ["div", "para", "sent", "word", "page"]
                 if metadata_type in metadata_xpaths
                 for field in metadata_xpaths[metadata_type]
                 for xpath in metadata_xpaths[metadata_type][field]
             ]
             data = self.create_year_field(data)
             if self.debug:
                 print(pretty_print(data))
             data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
             load_metadata.append(data)
         except lxml.etree.XMLSyntaxError:
             self.deleted_files.append(file.name)
         # \r + end="" keeps the progress counter on one line
         print(
             f"\r{time.ctime()}: Parsing document level metadata: {pos+1}/{doc_count} done...",
             flush=True,
             end="")
     if self.deleted_files:
         for f in self.deleted_files:
             print(
                 "%s has no valid TEI header or contains invalid data: removing from database load..."
                 % f)
     return load_metadata
Exemple #8
0
 def parse_tei_header(self):
     """Parse header in TEI files.

     Returns:
         list: one metadata dict per file whose TEI header parsed; files
         that fail to decode or lack a header are recorded in
         self.deleted_files and reported at the end.
     """
     load_metadata = []
     metadata_xpaths = self.parser_config["doc_xpaths"]
     self.deleted_files = []
     for file in os.scandir(self.textdir):
         data = {"filename": file.name}
         header = ""
         with open(file.path) as text_file:
             try:
                 file_content = "".join(text_file.readlines())
             except UnicodeDecodeError:
                 # not decodable as text: drop from the load
                 self.deleted_files.append(file.name)
                 continue
         try:
             # locate the <teiHeader>...</teiHeader> span (case-insensitive)
             start_header_index = re.search(r"<teiheader", file_content, re.I).start()
             end_header_index = re.search(r"</teiheader", file_content, re.I).start()
         except AttributeError:  # tag not found
             self.deleted_files.append(file.name)
             continue
         header = file_content[start_header_index:end_header_index]
         header = convert_entities(header)
         if self.debug:
             print("parsing %s header..." % file.name)
         # recover=True tolerates imperfect header fragments
         parser = etree.XMLParser(recover=True)
         try:
             tree = etree.fromstring(header, parser)
             trimmed_metadata_xpaths = []
             for field in metadata_xpaths:
                 for xpath in metadata_xpaths[field]:
                     # xpaths ending in @attr are matched via findall + get
                     attr_pattern_match = re.search(r"@([^\/\[\]]+)$", xpath)
                     if attr_pattern_match:
                         xp_prefix = xpath[: attr_pattern_match.start(0)]
                         attr_name = attr_pattern_match.group(1)
                         elements = tree.findall(xp_prefix)
                         for el in elements:
                             if el is not None and el.get(attr_name, ""):
                                 data[field] = el.get(attr_name, "")
                                 break
                         # NOTE(review): after an attribute match the outer
                         # xpath loop keeps going, so a later xpath can
                         # overwrite data[field]; the element-text branch
                         # below breaks instead — confirm which is intended
                     else:
                         el = tree.find(xpath)
                         if el is not None and el.text is not None:
                             data[field] = el.text
                             break
             # object-level xpaths passed along for later parsing stages
             trimmed_metadata_xpaths = [
                 (metadata_type, xpath, field)
                 for metadata_type in ["div", "para", "sent", "word", "page"]
                 if metadata_type in metadata_xpaths
                 for field in metadata_xpaths[metadata_type]
                 for xpath in metadata_xpaths[metadata_type][field]
             ]
             data = self.create_year_field(data)
             if self.debug:
                 print(pretty_print(data))
             data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
             load_metadata.append(data)
         except etree.XMLSyntaxError:
             self.deleted_files.append(file.name)
     if self.deleted_files:
         for f in self.deleted_files:
             print("%s has no valid TEI header or contains invalid data: removing from database load..." % f)
     return load_metadata
def format_concordance(text_in_utf8, word_regex, byte_offsets=None):
    """Trim a raw UTF-8 byte fragment to its concordance span, insert
    highlight markers, and convert it into display HTML.

    Args:
        text_in_utf8 (bytes): raw text fragment from the database.
        word_regex: regex matching a word; used to wrap the highlighted term.
        byte_offsets (list[int] | None): byte positions, relative to the
            original fragment, where <philoHighlight/> markers are inserted.
            BUG FIX: default changed from a shared mutable [] to None
            (identical falsy behavior, avoids the mutable-default pitfall).

    Returns:
        str: cleaned-up HTML for the concordance display.
    """
    removed_from_start = 0
    begin = BEGIN_MATCH.search(text_in_utf8)
    if begin:
        removed_from_start = len(begin.group(0))
        text_in_utf8 = text_in_utf8[begin.end(0):]
    start_cutoff = START_CUTOFF_MATCH.search(text_in_utf8)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text_in_utf8 = text_in_utf8[start_cutoff.end(0):]
    end = END_MATCH.search(text_in_utf8)
    if end:
        text_in_utf8 = text_in_utf8[:end.start(0)]
    if byte_offsets:
        # shift offsets to account for the bytes trimmed from the start
        byte_offsets = [b - removed_from_start for b in byte_offsets]
        new_text = b""
        last_offset = 0
        for b in byte_offsets:
            if 0 < b < len(text_in_utf8):
                new_text += text_in_utf8[last_offset:b] + b"<philoHighlight/>"
                last_offset = b
        text_in_utf8 = new_text + text_in_utf8[last_offset:]
    text = text_in_utf8.decode('utf8', 'ignore')
    xml = FragmentParserParse(text)
    allowed_tags = set(['philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'])
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        if el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        # NOTE(review): "title" and "q" are not in allowed_tags, so these two
        # elif branches are unreachable (they become plain spans above) — confirm intent
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        if "id" in el.attrib and el.tag != "l":  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        elif el.tag == "img":  # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            # wrap the word following the marker inside the highlight span
            word_match = re.match(word_regex, el.tail)
            if word_match:
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in VALID_HTML_TAGS:
            el = xml_to_html_class(el)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    ## remove spaces around hyphens and apostrophes
    output = SPACE_MATCH.sub('\\1', output)
    output = convert_entities(output)
    output = STRIP_START_PUNCTUATION.sub("", output)
    return output
def format_text_object(obj, text, config, request, word_regex, byte_offsets=None, note=False, images=True):
    """Convert a raw text-object byte fragment into display HTML.

    Args:
        obj: the object being rendered; provides philo_id and a db handle.
        text (bytes): raw fragment of the source text.
        config: web configuration (page-image roots, extensions, ...).
        request: current request, used to build absolute query links.
        word_regex: regex used to wrap highlighted terms.
        byte_offsets: byte positions where <philoHighlight/> markers go.
        note (bool): True when rendering the content of a footnote popup.
        images (bool): False to skip page-image resolution.

    Returns:
        tuple: (html_output, images) — images is {} when note or not images.
    """
    philo_id = obj.philo_id
    if byte_offsets is not None:
        # splice highlight markers into the raw bytes at the given offsets
        new_text = b""
        last_offset = 0
        for b in byte_offsets:
            new_text += text[last_offset:b] + b"<philoHighlight/>"
            last_offset = b
        text = new_text + text[last_offset:]
    current_obj_img = []
    current_graphic_img = []
    # wrap in a div so the fragment parser has a single root
    text = "<div>" + text.decode('utf8', 'ignore') + "</div>"
    xml = FragmentParserParse(text)
    c = obj.db.dbh.cursor()
    for el in xml.iter():
        try:
            if el.tag.startswith("DIV"):
                el.tag = el.tag.lower()
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "table":
                el.tag = "span"
                el.attrib["class"] = "xml-table"
            elif el.tag == "ref" or el.tag == "xref":
                if el.attrib["type"] == "note" or el.attrib["type"] == "footnote":
                    # footnote reference: turn into a popover trigger span
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request, script_name="/scripts/get_notes.py", target=target)
                    if "n" in el.attrib:
                        el.text = el.attrib["n"]
                    else:
                        el.text = "*"
                    if el.text == "":
                        el.text = "*"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    el.attrib["id"] = target.replace('#', '') + '-link-back'
                    # attributes for popover note
                    el.attrib['class'] = "note-ref"
                    el.attrib['tabindex'] = "0"
                    el.attrib['data-toggle'] = "popover"
                    el.attrib['data-container'] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
                elif el.attrib["type"] == "cross":
                    # cross-reference: resolve target id to a navigable philo_id
                    c.execute("SELECT philo_id FROM toms WHERE id=? LIMIT 1", (el.attrib["target"],))
                    try:
                        object_id = c.fetchone()[0]
                    except IndexError:
                        el.tag = "span"
                        continue
                    el.tag = "a"
                    el.attrib["href"] = 'navigate/%s' % '/'.join([i for i in object_id.split() if i != "0"])
                    el.attrib["class"] = "xml-ref-cross"
                    del el.attrib["target"]
                elif el.attrib["type"] == "search":
                    # search link: target is "metadata_field:value"
                    metadata, metadata_value = el.attrib["target"].split(':')
                    params = {metadata: metadata_value, "report": "bibliography"}
                    el.tag = "a"
                    el.attrib["href"] = make_absolute_query_link(config, [], **params)
                    del el.attrib["target"]
            elif el.tag == "note":
                # endnotes
                in_end_note = False
                for ancestor in el.iterancestors():
                    if ancestor.tag.startswith('div'):
                        if "type" in ancestor.attrib:
                            if ancestor.attrib["type"] == "notes":
                                in_end_note = True
                                break
                if note:  # in footnote
                    el.tag = "div"
                elif in_end_note:  # in end note
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    # build a "Go back to text" button pointing at the ref site
                    link_back = etree.Element("a")
                    c.execute('select parent from refs where target=? and parent like ?',
                              (el.attrib['id'], str(philo_id[0]) + " %"))
                    object_id = c.fetchone()[0]
                    link_back.attrib['href'] = 'navigate/%s%s' % ('/'.join([i for i in object_id.split() if i != "0"]),
                                                                  '#%s-link-back' % el.attrib['id'])
                    link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                    link_back.attrib['role'] = "button"
                    link_back.text = "Go back to text"
                    el.append(link_back)
                else:  ## inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"

                    for child in el:
                        child = note_content(child)
                    # insert an anchor before this element by scanning through the parent
                    parent = el.getparent()
                    for i, child in enumerate(parent):
                        if child == el:
                            attribs = {"class": "note",
                                       "tabindex": "0",
                                       "data-toggle": "popover",
                                       "data-container": "body",
                                       "data-placement": "right",
                                       "data-trigger": "focus"}
                            parent.insert(i, etree.Element("a", attrib=attribs))
                            new_anchor = parent[i]
                            new_anchor.text = "note"
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "img":
                # hide broken inline images instead of showing a broken icon
                el.attrib["onerror"] = "this.style.display='none'"
            elif el.tag == "pb" and "n" in el.attrib:
                # page break: render as a page-image link or a "--n--" marker
                el.tag = "span"
                el.attrib["class"] = "xml-pb-image"
                if config.page_images_url_root:
                    if "facs" in el.attrib or "id" in el.attrib:
                        if "facs" in el.attrib:
                            img = el.attrib["facs"]
                        else:
                            img = el.attrib["id"]
                        current_obj_img.append(img.split()[0])
                        el.append(etree.Element("a"))
                        # facs/id may hold two tokens: thumbnail then large image
                        img_split = img.split()
                        el[-1].attrib["href"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                        if len(img_split) == 2:
                            el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[1]) + config.page_image_extension
                        else:
                            el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                        el[-1].text = "[page " + el.attrib["n"] + "]"
                        if config.external_page_images:
                            el[-1].attrib["target"] = "_blank"
                        else:
                            el[-1].attrib['class'] = "page-image-link"
                            el[-1].attrib['data-gallery'] = ''
                else:
                    if el.attrib["n"]:
                        el.text = "--%s--" % el.attrib["n"]
                    else:
                        el.text = "--na--"
                grand_parent = el.getparent().getparent()
                if grand_parent.attrib["class"] == "xml-row":
                    # Move page outside of table row to avoid display issues
                    tail = etree.Element("span")
                    tail.text = el.tail
                    el.tail = ""
                    great_grand_parent = grand_parent.getparent()
                    grand_parent_index = great_grand_parent.index(grand_parent)
                    el_index = el.getparent().index(el)
                    great_grand_parent.insert(grand_parent_index+1, el)
                    # NOTE(review): `parent` here is only bound if the inline-note
                    # branch ran earlier for some element; otherwise this raises
                    # (silently swallowed by the except below). Looks like it
                    # should be el's original parent — confirm.
                    parent.insert(el_index, tail)
            if el.tag == "graphic":
                # inline graphic: rewrite as an <img> pointing at the image root
                if config.page_images_url_root:
                    imgs = el.attrib["facs"].split()
                    current_graphic_img.append(imgs[0])
                    el.attrib["src"] = os.path.join(config.page_images_url_root, imgs[0])
                    el.tag = "img"
                    el.attrib["class"] = "inline-img"
                    el.attrib['data-gallery'] = ''
                    el.attrib["inline-img"] = ""
                    if len(imgs) > 1:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[1])
                    else:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[0])
                    del el.attrib["url"]
            elif el.tag == "philoHighlight":
                # wrap the word following the marker inside the highlight span
                word_match = re.match(word_regex, el.tail, re.U)
                if word_match:
                    el.text = el.tail[:word_match.end()]
                    el.tail = el.tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in VALID_HTML_TAGS:
                el = xml_to_html_class(el)
        except Exception as exception:
            # best-effort rendering: log and keep going on any per-element error
            import sys
            print(exception, file=sys.stderr)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output)

    if note:  ## Notes don't need to fetch images
        return (output, {})
    if not images:
        return (output, {})

    ## Page images
    output, images = page_images(config, output, current_obj_img, current_graphic_img, philo_id)
    return output, images
Exemple #11
0
def format_text_object(obj, text, config, request, word_regex, byte_offsets=None, note=False, images=True):
    """Convert a raw TEI-XML text object into display-ready HTML.

    Walks every element of the parsed fragment and rewrites TEI tags
    (ref, note, pb, graphic, ptr, ...) into HTML equivalents, inserting
    <philoHighlight/> milestones at hit offsets beforehand.

    Args:
        obj: object being displayed; supplies philo_id and the db cursor.
        text: raw object text as bytes.
        config: web config (page_images_url_root, page_image_extension, ...).
        request: current request, used to build absolute query links.
        word_regex: regex used to wrap the word following each highlight milestone.
        byte_offsets: byte positions of query hits, or None for no highlighting.
        note: True when rendering a footnote popup (skips page-image fetching).
        images: False to skip page-image fetching.

    Returns:
        (html_string, images) tuple; images is {} when note is True or
        images is False, otherwise whatever page_images() returns.
    """
    philo_id = obj.philo_id
    if byte_offsets is not None:
        # Splice an empty <philoHighlight/> milestone at every hit offset
        # (byte_offsets are assumed sorted ascending -- TODO confirm at caller).
        new_text = b""
        last_offset = 0
        for b in byte_offsets:
            new_text += text[last_offset:b] + b"<philoHighlight/>"
            last_offset = b
        text = new_text + text[last_offset:]
    current_obj_img = []
    current_graphic_img = []
    # Wrap in a div so the fragment parser has a single root element.
    text = "<div>" + text.decode('utf8', 'ignore') + "</div>"
    xml = FragmentParserParse(text)
    c = obj.db.dbh.cursor()
    for el in xml.iter():
        # Each element is processed best-effort: any per-element failure is
        # logged to stderr and the element is left as-is (see except below).
        try:
            if el.tag.startswith("DIV"):
                el.tag = el.tag.lower()
            if el.tag == "h1" or el.tag == "h2":
                el.tag = "b"
                el.attrib["class"] = "headword"
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            if el.tag == "page":
                el.tag = "pb"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "table":
                el.tag = "span"
                el.attrib["class"] = "xml-table"
            elif el.tag == "ref" or el.tag == "xref":
                # Cross-references: footnote popovers, cross links, or searches.
                if el.attrib["type"] == "note" or el.attrib["type"] == "footnote":
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request, script_name="/scripts/get_notes.py", target=target)
                    if "n" in el.attrib:
                        el.text = el.attrib["n"]
                    else:
                        el.text = "*"
                    if el.text == "":
                        el.text = "*"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    # id used as the back-link anchor from the endnote body.
                    el.attrib["id"] = target.replace('#', '') + '-link-back'
                    # attributes for popover note
                    el.attrib['class'] = "note-ref"
                    el.attrib['tabindex'] = "0"
                    el.attrib['data-toggle'] = "popover"
                    el.attrib['data-container'] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
                elif el.attrib["type"] == "cross":
                    # Resolve the target id to a philo_id and link to it.
                    c.execute("SELECT philo_id FROM toms WHERE id=? LIMIT 1", (el.attrib["target"],))
                    try:
                        object_id = c.fetchone()[0]
                    except IndexError:
                        # NOTE(review): fetchone() returns None when no row
                        # matches, which raises TypeError, not IndexError --
                        # that TypeError is swallowed by the outer except.
                        el.tag = "span"
                        continue
                    el.tag = "a"
                    # Drop trailing zero components from the philo_id path.
                    el.attrib["href"] = 'navigate/%s' % '/'.join([i for i in object_id.split() if i != "0"])
                    el.attrib["class"] = "xml-ref-cross"
                    del el.attrib["target"]
                elif el.attrib["type"] == "search":
                    # target is "metadata_field:value"; link to a bibliography search.
                    metadata, metadata_value = el.attrib["target"].split(':')
                    params = {metadata: metadata_value, "report": "bibliography"}
                    el.tag = "a"
                    el.attrib["href"] = make_absolute_query_link(config, [], **params)
                    del el.attrib["target"]
            elif el.tag == "note":
                # endnotes
                # Determine whether this note lives inside a <div type="notes">
                # section (endnote) as opposed to being inline in the text.
                in_end_note = False
                for ancestor in el.iterancestors():
                    if ancestor.tag.startswith('div'):
                        if "type" in ancestor.attrib:
                            if ancestor.attrib["type"] == "notes":
                                in_end_note = True
                                break
                if note:  # in footnote
                    el.tag = "div"
                elif in_end_note:  # in end note
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    # Build a "Go back to text" button pointing at the
                    # *-link-back anchor created in the ref branch above.
                    link_back = etree.Element("a")
                    c.execute('select parent from refs where target=? and parent like ?',
                              (el.attrib['id'], str(philo_id[0]) + " %"))
                    object_id = c.fetchone()[0]
                    link_back.attrib['href'] = 'navigate/%s%s' % ('/'.join([i for i in object_id.split() if i != "0"]),
                                                                  '#%s-link-back' % el.attrib['id'])
                    link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                    link_back.attrib['role'] = "button"
                    link_back.text = "Go back to text"
                    el.append(link_back)
                else:  ## inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"

                    for child in el:
                        child = note_content(child)
                    # insert an anchor before this element by scanning through the parent
                    parent = el.getparent()
                    for i, child in enumerate(parent):
                        if child == el:
                            attribs = {"class": "note",
                                       "tabindex": "0",
                                       "data-toggle": "popover",
                                       "data-container": "body",
                                       "data-placement": "right",
                                       "data-trigger": "focus"}
                            parent.insert(i, etree.Element("a", attrib=attribs))
                            new_anchor = parent[i]
                            new_anchor.text = "note"
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "img":
                # Hide broken images rather than showing the alt-text box.
                el.attrib["onerror"] = "this.style.display='none'"
            elif el.tag == "pb" and "n" in el.attrib:
                # Page break: render as a page-image link when images are
                # configured, otherwise as a --n-- text marker.
                el.tag = "span"
                el.attrib["class"] = "xml-pb-image"
                # NOTE(review): parses as (page_images_url_root and "facs" in
                # attrib) or ("id" in attrib) -- an "id"-only pb enters this
                # branch even without page_images_url_root; confirm intent.
                if config.page_images_url_root and "facs" in el.attrib or "id" in el.attrib:
                    if "facs" in el.attrib:
                        img = el.attrib["facs"]
                    else:
                        img = el.attrib["id"]
                    current_obj_img.append(img.split()[0])
                    el.append(etree.Element("a"))
                    img_split = img.split()
                    # facs/id may hold two whitespace-separated images:
                    # [small, large]; fall back to the small one for both.
                    el[-1].attrib["href"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                    if len(img_split) == 2:
                        el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[1]) + config.page_image_extension
                    else:
                        el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                    el[-1].text = "[page " + el.attrib["n"] + "]"
                    if config.external_page_images:
                        el[-1].attrib["target"] = "_blank"
                    else:
                        el[-1].attrib['class'] = "page-image-link"
                        el[-1].attrib['data-gallery'] = ''
                else:
                    if el.attrib["n"]:
                        el.text = "--%s--" % el.attrib["n"]
                    else:
                        el.text = "--na--"
                # Raises KeyError (caught below) when grand_parent has no
                # "class" attribute, or AttributeError near the fragment root.
                grand_parent = el.getparent().getparent()
                if grand_parent.attrib["class"] == "xml-row":
                    # Move page outside of table row to avoid display issues
                    tail = etree.Element("span")
                    tail.text = el.tail
                    el.tail = ""
                    great_grand_parent = grand_parent.getparent()
                    grand_parent_index = great_grand_parent.index(grand_parent)
                    el_index = el.getparent().index(el)
                    great_grand_parent.insert(grand_parent_index+1, el)
                    # NOTE(review): 'parent' is only bound in the inline-note
                    # branch above, so this may be unbound or stale here
                    # (NameError is swallowed by the except) -- verify.
                    parent.insert(el_index, tail)
            if el.tag == "graphic":
                # Inline graphic -> <img> with small/large variants.
                if config.page_images_url_root:
                    imgs = el.attrib["facs"].split()
                    current_graphic_img.append(imgs[0])
                    el.attrib["src"] = os.path.join(config.page_images_url_root, imgs[0])
                    el.tag = "img"
                    el.attrib["class"] = "inline-img"
                    el.attrib['data-gallery'] = ''
                    el.attrib["inline-img"] = ""
                    if len(imgs) > 1:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[1])
                    else:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[0])
                    # Raises KeyError (caught below) when no "url" attribute
                    # is present -- presumably set upstream; verify.
                    del el.attrib["url"]
            elif el.tag == "ptr":
                # External image pointer -> link labeled with its rend text.
                if "facs" in el.attrib and config.page_images_url_root:
                    el.tag = "a"
                    el.attrib["href"] = os.path.join(config.page_images_url_root, el.attrib["facs"])
                    el.text = el.attrib["rend"]
                    el.attrib["external-img"] = ""
                    el.attrib["class"] = "external-img"
                    el.attrib["large-img"] = el.attrib["href"]
                    del el.attrib["rend"]
                    del el.attrib["facs"]
                    el.attrib['data-gallery'] = ''
            elif el.tag == "philoHighlight":
                # Pull the word following the milestone inside the highlight span.
                word_match = re.match(word_regex, el.tail, re.U)
                if word_match:
                    el.text = el.tail[:word_match.end()]
                    el.tail = el.tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in VALID_HTML_TAGS:
                el = xml_to_html_class(el)
        except Exception as exception:
            # Broad catch: formatting must never abort the whole object; the
            # offending element is simply emitted unmodified.
            import sys
            print(exception, file=sys.stderr)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output)

    if note:  ## Notes don't need to fetch images
        return (output, {})
    if not images:
        return (output, {})

    ## Page images
    output, images = page_images(config, output, current_obj_img, current_graphic_img, philo_id)
    return output, images
Exemple #12
0
def format_concordance(text, word_regex, bytes=None):
    """Format a concordance snippet as an HTML string.

    Trims the snippet to its begin/end boundaries, splices a
    <philoHighlight/> milestone at each hit offset, then rewrites the XML
    fragment into display HTML with hits wrapped in <span class="highlight">.

    Args:
        text: raw concordance snippet (str).
        word_regex: ignored; overridden below since text is already unicode.
        bytes: hit byte offsets relative to the untrimmed snippet, or None.

    Returns:
        HTML string for the concordance line.
    """
    word_regex = r"\w+"  # text is converted to unicode so we use the \w boundary to match
    if bytes is None:  # avoid the mutable-default-argument pitfall
        bytes = []
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]
    if bytes:
        # Shift offsets by the amount trimmed from the front, then splice an
        # empty <philoHighlight/> milestone at each in-range hit position.
        bytes = [b - removed_from_start for b in bytes]
        new_text = ""
        last_offset = 0
        for b in bytes:
            if b > 0 and b < len(text):
                new_text += text[last_offset:b] + "<philoHighlight/>"
                last_offset = b
        text = new_text + text[last_offset:]
    xml = FragmentParserParse(text)
    allowed_tags = set([
        'philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i',
        'sc', 'scx', 'br'
    ])
    text = u''
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        # Check title/q BEFORE the catch-all: neither is in allowed_tags, so
        # testing them as elif after the catch-all was unreachable dead code
        # and their css classes were never applied.
        if el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        elif el.tag not in allowed_tags:
            el.tag = 'span'
        if "id" in el.attrib and el.tag != "l":  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        elif el.tag == "img":  # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            # Pull the word following the milestone inside the highlight span;
            # el.tail is None when the milestone falls at the very end.
            word_match = re.match(word_regex, el.tail or "", re.U)
            if word_match:
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in valid_html_tags:
            el = xml_to_html_class(el)
    # tostring() returns bytes: decode before applying str regexes below
    # (previously raised TypeError: cannot use a string pattern on bytes).
    output = etree.tostring(xml).decode('utf8', 'ignore')
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    output = convert_entities(output)
    output = strip_start_punctuation.sub("", output)
    return output
Exemple #13
0
def format_concordance(text, word_regex, bytes=None):
    """Format a concordance snippet as an HTML string.

    Trims the snippet to its begin/end boundaries, splices a
    <philoHighlight/> milestone at each hit offset, then rewrites the XML
    fragment into display HTML with hits wrapped in <span class="highlight">.

    Args:
        text: raw concordance snippet (str).
        word_regex: ignored; overridden below since text is already unicode.
        bytes: hit byte offsets relative to the untrimmed snippet, or None.

    Returns:
        HTML string for the concordance line.
    """
    word_regex = r"\w+"  # text is converted to unicode so we use the \w boundary to match
    if bytes is None:  # avoid the mutable-default-argument pitfall
        bytes = []
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]
    if bytes:
        # Shift offsets by the amount trimmed from the front, then splice an
        # empty <philoHighlight/> milestone at each in-range hit position.
        bytes = [b - removed_from_start for b in bytes]
        new_text = ""
        last_offset = 0
        for b in bytes:
            if b > 0 and b < len(text):
                new_text += text[last_offset:b] + "<philoHighlight/>"
                last_offset = b
        text = new_text + text[last_offset:]
    xml = FragmentParserParse(text)
    allowed_tags = set(['philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'])
    text = u''
    for el in xml.iter():
        if el.tag.startswith("DIV"):
            el.tag = el.tag.lower()
        # ab/ln line variants are normalized to l; title/q are checked BEFORE
        # the catch-all because neither is in allowed_tags, so testing them as
        # elif after the catch-all was unreachable dead code and their css
        # classes were never applied.
        if el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        elif el.tag not in allowed_tags:
            el.tag = 'span'
        if "id" in el.attrib and el.tag != "l":  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        elif el.tag == "img":  # Remove img elements from parent in concordances
            el.getparent().remove(el)
        if el.tag == "philoHighlight":
            # Pull the word following the milestone inside the highlight span;
            # el.tail is None when the milestone falls at the very end.
            word_match = re.match(word_regex, el.tail or "", re.U)
            if word_match:
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in valid_html_tags:
            el = xml_to_html_class(el)
    # tostring() returns bytes: decode before applying str regexes below
    # (previously raised TypeError: cannot use a string pattern on bytes).
    output = etree.tostring(xml).decode('utf8', 'ignore')
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    ## remove spaces around hyphens and apostrophes
    output = space_match.sub('\\1', output)
    output = convert_entities(output)
    output = strip_start_punctuation.sub("", output)
    return output
Exemple #14
0
 def parse_tei_header(self):
     """Parse header in TEI files.

     Scans every file in self.textdir, extracts the <teiHeader> section,
     and evaluates the configured doc-level XPaths against it to build a
     metadata dict per file. Files that cannot be decoded, lack a header,
     or fail XML parsing are recorded in self.deleted_files and skipped.

     Returns:
         list of per-file metadata dicts, each with an "options" key
         holding the trimmed object-level metadata xpaths.
     """
     load_metadata = []
     metadata_xpaths = self.parser_config["doc_xpaths"]
     for file in os.scandir(self.textdir):
         data = {"filename": file.name}
         header = ""
         with open(file.path) as text_file:
             try:
                 file_content = "".join(text_file.readlines())
             except UnicodeDecodeError:
                 # Not valid text in the platform encoding: drop the file.
                 self.deleted_files.append(file.name)
                 continue
         try:
             # Case-insensitive search so <teiHeader>/<TEIHEADER> both match.
             start_header_index = re.search(r"<teiheader", file_content, re.I).start()
             end_header_index = re.search(r"</teiheader", file_content, re.I).start()
         except AttributeError:  # tag not found
             self.deleted_files.append(file.name)
             continue
         # Slice stops before the closing tag; the recovering parser below
         # tolerates the resulting unbalanced fragment.
         header = file_content[start_header_index:end_header_index]
         header = convert_entities(header)
         if self.debug:
             print("parsing %s header..." % file.name)
         parser = lxml.etree.XMLParser(recover=True)
         try:
             tree = lxml.etree.fromstring(header, parser)
             trimmed_metadata_xpaths = []
             # For each metadata field, try its xpaths in order and keep the
             # first non-empty value found.
             for field in metadata_xpaths:
                 for xpath in metadata_xpaths[field]:
                     xpath = xpath.rstrip("/") # make sure there are no trailing slashes which make lxml die
                     try:
                         elements = tree.xpath(xpath)
                     except lxml.etree.XPathEvalError:
                         continue
                     for element in elements:
                         if element is not None:
                             value = ""
                             # xpath() may yield elements (use .text) or
                             # attribute/text string results (use str()).
                             if isinstance(element, lxml.etree._Element) and element.text is not None:
                                 value = element.text.strip()
                             elif isinstance(element, lxml.etree._ElementUnicodeResult):
                                 value = str(element).strip()
                             if value:
                                 data[field] = value
                                 break
                     else: # only continue looping over xpaths if no break in inner loop
                         continue
                     break
             # Object-level (div/para/sent/word/page) xpaths are passed along
             # for later per-object parsing rather than evaluated here.
             trimmed_metadata_xpaths = [
                 (metadata_type, xpath, field)
                 for metadata_type in ["div", "para", "sent", "word", "page"]
                 if metadata_type in metadata_xpaths
                 for field in metadata_xpaths[metadata_type]
                 for xpath in metadata_xpaths[metadata_type][field]
             ]
             data = self.create_year_field(data)
             if self.debug:
                 print(pretty_print(data))
             data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
             load_metadata.append(data)
         except lxml.etree.XMLSyntaxError:
             self.deleted_files.append(file.name)
     if self.deleted_files:
         for f in self.deleted_files:
             print("%s has no valid TEI header or contains invalid data: removing from database load..." % f)
     return load_metadata