Python FragmentParserの例、FragmentParser Pythonの例

コード例 #1

0

ファイルを表示

ファイル: ObjectFormatter.py プロジェクト: ARTFL-Project/Encyclopedie

def format_strip(text,bytes=[], chars=40):
    removed_from_start = 0
    begin = re.search(r'^[^<]*?>', text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    start_cutoff = re.search(r'^[^ <]+', text)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text = text[start_cutoff.end(0):]
    removed_from_end = 0
    end = re.search(r'<[^>]*?\Z', text)
    if end:
        removed_from_end = len(end.group(0))
        text = text[:end.start(0)]
    if bytes:
        bytes = [b - removed_from_start for b in bytes]
        new_text = ""
        last_offset = 0
        for b in bytes:
            if b > 0 and b < len(text):
                new_text += text[last_offset:b] + "<philoHighlight/>"
                last_offset = b
        text = new_text + text[last_offset:]
    xml = FragmentParser.parse(text)
    output = clean_tags(xml)
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-'])+ ", '\\1', output)
    return output

コード例 #2

0

ファイルを表示

ファイル: ObjectFormatter.py プロジェクト: mbwolff/PhiloLogic4

def format_strip(text, bytes=[]):
    """Remove formatting for HTML rendering
    Called from: -kwic.py
                 -frequency.py"""
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]
    if bytes:
        bytes = [b - removed_from_start for b in bytes]
        new_text = ""
        last_offset = 0
        for b in bytes:
            if b > 0 and b < len(text):
                new_text += text[last_offset:b] + "<philoHighlight/>"
                last_offset = b
        text = new_text + text[last_offset:]
    xml = FragmentParser.parse(text)
    output = clean_tags(xml)
    ## remove spaces around hyphens and apostrophes
    output = space_match.sub('\\1', output)
    return output

コード例 #3

0

ファイルを表示

ファイル: ObjectFormatter.py プロジェクト: katelaurel/PhiloLogic4

def format_concordance(text, word_regex, bytes=[]):
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text = text[start_cutoff.end(0):]
    end = end_match.search(text)
    if end:
        text = text[:end.start(0)]
    if bytes:
        bytes = [b - removed_from_start for b in bytes]
        new_text = ""
        last_offset = 0
        for b in bytes:
            if b > 0 and b < len(text):
                new_text += text[last_offset:b] + "<philoHighlight/>"
                last_offset = b
        text = new_text + text[last_offset:]
    xml = FragmentParser.parse(text)
    allowed_tags = set(['philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i', 'sc', 'scx', 'br'])
    text = u''
    for el in xml.iter():
        if el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        elif el.tag == "title":
            el.tag = "span"
            el.attrib['class'] = "xml-title"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib['class'] = 'xml-q'
        if "id" in el.attrib:  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        if el.tag == "philoHighlight":
            word_match = re.match(word_regex, el.tail, re.U)
            if word_match:
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
        if el.tag not in valid_html_tags:
            el = xml_to_html_class(el)
    output = etree.tostring(xml)
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    ## remove spaces around hyphens and apostrophes
    output = space_match.sub('\\1', output)
    output = convert_entities(output)
    output = strip_start_punctuation.sub("", output)
    return output

コード例 #4

0

ファイルを表示

ファイル: ObjectFormatter.py プロジェクト: pleonard212/PhiloLogic4

def format_concordance(text, bytes=[]):
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text = text[start_cutoff.end(0):]
    removed_from_end = 0
    end = end_match.search(text)
    if end:
        removed_from_end = len(end.group(0))
        text = text[:end.start(0)]
    if bytes:
        bytes = [b - removed_from_start for b in bytes]
        new_text = ""
        last_offset = 0
        for b in bytes:
            if b > 0 and b < len(text):
                new_text += text[last_offset:b] + "<philoHighlight/>"
                last_offset = b
        text = new_text + text[last_offset:]
    xml = FragmentParser.parse(text)
    length = 0
    allowed_tags = set([
        'philoHighlight', 'l', 'ab', 'ln', 'w', 'sp', 'speaker', 'stage', 'i',
        'sc', 'scx', 'br'
    ])
    text = u''
    for el in xml.iter():
        if el.tag not in allowed_tags:
            el.tag = 'span'
        elif el.tag == "ab" or el.tag == "ln":
            el.tag = "l"
        if "id" in el.attrib:  ## kill ids in order to avoid the risk of having duplicate ids in the HTML
            del el.attrib["id"]
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        if el.tag == "philoHighlight":
            word_match = re.match(r"\w+", el.tail, re.U)
            if word_match:
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
    output = etree.tostring(xml)
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    ## remove spaces around hyphens and apostrophes
    output = space_match.sub('\\1', output)
    return output

コード例 #5

0

ファイルを表示

ファイル: ObjectFormatter.py プロジェクト: ARTFL-Project/Encyclopedie

def format(text,bytes=[]):
#    print >> sys.stderr, "TEXT:",text
    parser = etree.XMLParser(recover=True)
    if bytes:
        new_text = ""
        last_offset = 0
        for b in bytes:
            new_text += text[last_offset:b] + "<philoHighlight/>"
            last_offset = b
        text = new_text + text[last_offset:]
    text = "<div>" + text + "</div>"
    xml = FragmentParser.parse(text)
    print >> sys.stderr, "RAW_XML",etree.tostring(xml)
    for el in xml.iter():        
        try:
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
                el.append(etree.Element("br"))
            elif el.tag == "pb" and "fac" in el.attrib and "n" in el.attrib:
                el.tag = "p"
                el.append(etree.Element("a"))
                el[-1].attrib["href"] = 'http://artflx.uchicago.edu/images/encyclopedie/' + el.attrib["fac"]
                el[-1].text = "[page " + el.attrib["n"] + "]"
                el[-1].attrib['class'] = "page_image_link"
            elif el.tag == "figure":
                if el[0].tag == "graphic":
                    img_url = el[0].attrib["url"].replace(":","_")
                    volume = re.match("\d+",img_url).group()
                    url_prefix = "http://artflx.uchicago.edu/images/encyclopedie/V" + volume + "/plate_"
                    el.tag = "a"
                    el.attrib["href"] = url_prefix + img_url + ".jpeg"
                    el[0].tag = "img"
                    el[0].attrib["src"] = url_prefix + img_url + ".sm.jpeg"
                    el[0].attrib["class"] = "plate_img"
                    el.attrib["class"] = "plate_img_link"
                    del el[0].attrib["url"]
                    el.append(etree.Element("br"))
            elif el.tag == "philoHighlight":        
                word_match = re.match(r"\w+", el.tail, re.U)
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
        except:
            pass
    output = etree.tostring(xml)
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-'])+ ", '\\1', output)
    return convert_entities(output.decode('utf-8', 'ignore')).encode('utf-8')

コード例 #6

0

ファイルを表示

ファイル: ObjectFormatter.py プロジェクト: ARTFL-Project/Encyclopedie

def format_concordance(text,bytes=[]):
    removed_from_start = 0
    begin = re.search('^[^<]*?>', text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    removed_from_end = 0
    end = re.search('<[^>]*?$', text)
    if end:
        removed_from_end = len(end.group(0))
        text = text[:end.start(0)]
    if bytes:
        bytes = [b - removed_from_start for b in bytes]
        new_text = ""
        last_offset = 0
        for b in bytes:
            if b > 0 and b < len(text):
                new_text += text[last_offset:b] + "<philoHighlight/>"
                last_offset = b
        text = new_text + text[last_offset:]
    xml = FragmentParser.parse(text)
    length = 0
    for el in xml.iter():
        if el.tag == "sc" or el.tag == "scx":
            el.tag = "span"
            el.attrib["class"] = "small-caps"
        elif el.tag == "head":
            el.tag = "span"
        elif el.tag == "pb":
            el.tag = 'span'
        elif el.tag == 'p':
            el.tag ="span"
        elif el.tag == "philoHighlight":        
            word_match = re.match(r"\w+", el.tail, re.U)
            if word_match:
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
            el.tag = "span"
            el.attrib["class"] = "highlight"
    
    output = etree.tostring(xml)
    output = re.sub(r'\A<div class="philologic-fragment">', '', output)
    output = re.sub(r'</div>\Z', '', output)
    return output

コード例 #7

0

ファイルを表示

ファイル: ObjectFormatter.py プロジェクト: pleonard212/PhiloLogic4

def format_strip(text, bytes=[], chars=40, concordance_report=False):
    """Remove formatting to for HTML rendering
    Called from: -kwic.py
                 -relevance.py
                 -frequency.py"""
    removed_from_start = 0
    begin = begin_match.search(text)
    if begin:
        removed_from_start = len(begin.group(0))
        text = text[begin.end(0):]
    start_cutoff = start_cutoff_match.search(text)
    if start_cutoff:
        removed_from_start += len(start_cutoff.group(0))
        text = text[start_cutoff.end(0):]
    removed_from_end = 0
    end = end_match.search(text)
    if end:
        removed_from_end = len(end.group(0))
        text = text[:end.start(0)]
    if bytes:
        bytes = [b - removed_from_start for b in bytes]
        new_text = ""
        last_offset = 0
        for b in bytes:
            if b > 0 and b < len(text):
                new_text += text[last_offset:b] + "<philoHighlight/>"
                last_offset = b
        text = new_text + text[last_offset:]
    xml = FragmentParser.parse(text)
    if concordance_report:
        output = clean_tags_for_concordance(xml)
    else:
        output = clean_tags(xml)
    ## remove spaces around hyphens and apostrophes
    output = space_match.sub('\\1', output)
    return output

コード例 #8

0

ファイルを表示

ファイル: ObjectFormatter.py プロジェクト: pleonard212/PhiloLogic4

def format(text, bytes=[]):
    parser = etree.XMLParser(recover=True)
    if bytes:
        new_text = ""
        last_offset = 0
        for b in bytes:
            new_text += text[last_offset:b] + "<philoHighlight/>"
            last_offset = b
        text = new_text + text[last_offset:]
    text = "<div>" + text + "</div>"
    xml = FragmentParser.parse(text)
    for el in xml.iter():
        try:
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
                el.append(etree.Element("br"))
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "note":
                el.tag = 'span'
                el.attrib['class'] = "note-content"
                for child in el:
                    child = xml_to_html_class(child)
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "pb" and "fac" in el.attrib and "n" in el.attrib:
                el.tag = "p"
                el.append(etree.Element("a"))
                el[-1].attrib[
                    "href"] = 'http://artflx.uchicago.edu/images/encyclopedie/' + el.attrib[
                        "fac"]
                el[-1].text = "[page " + el.attrib["n"] + "]"
                el[-1].attrib['class'] = "page_image_link"
            elif el.tag == "figure":
                if el[0].tag == "graphic":
                    img_url = el[0].attrib["url"].replace(":", "_")
                    volume = re.match("\d+", img_url).group()
                    url_prefix = "http://artflx.uchicago.edu/images/encyclopedie/V" + volume + "/plate_"
                    el.tag = "a"
                    el.attrib["href"] = url_prefix + img_url + ".jpeg"
                    el[0].tag = "img"
                    el[0].attrib["src"] = url_prefix + img_url + ".sm.jpeg"
                    el[0].attrib["class"] = "plate_img"
                    el.attrib["class"] = "plate_img_link"
                    del el[0].attrib["url"]
                    el.append(etree.Element("br"))
            elif el.tag == "philoHighlight":
                word_match = re.match(r"\w+", el.tail, re.U)
                el.text = el.tail[:word_match.end()]
                el.tail = el.tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
        except:
            pass
    output = etree.tostring(xml)
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    return convert_entities(output.decode('utf-8', 'ignore')).encode('utf-8')

コード例 #9

0

ファイルを表示

ファイル: ObjectFormatter.py プロジェクト: katelaurel/PhiloLogic4

def format_text_object(obj, text, config, request, word_regex, bytes=[], note=False):
    philo_id = obj.philo_id
    if bytes:
        new_text = ""
        last_offset = 0
        for b in bytes:
            new_text += text[last_offset:b] + "<philoHighlight/>"
            last_offset = b
        text = new_text + text[last_offset:]
    first_img = ''
    current_obj_img = []
    text = "<div>" + text + "</div>"
    xml = FragmentParser.parse(text)
    c = obj.db.dbh.cursor()
    for el in xml.iter():
        try:
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "ref":
                if el.attrib["type"] == "note":
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request, script_name="/scripts/get_notes.py", target=target)
                    if "n" in el.attrib:
                        el.text = el.attrib["n"]
                    else:
                        el.text = "note"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    el.attrib["id"] = target.replace('#', '') + '-link-back'
                    del el.attrib["target"]
                    # attributes for popover note
                    el.attrib['class'] = "note-ref"
                    el.attrib['tabindex'] = "0"
                    el.attrib['data-toggle'] = "popover"
                    el.attrib['data-container'] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
            elif el.tag == "note":
                # endnotes
                in_end_note = False
                for div in el.iterancestors(tag="div"):
                    if div.attrib["type"] == "notes":
                        in_end_note = True
                        break
                if in_end_note:
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    link_back = etree.Element("a")
                    c.execute('select parent, start_byte from refs where target=? and parent like ?',
                              (el.attrib['id'], str(philo_id[0]) + " %"))
                    object_id, start_byte = c.fetchone()
                    link_back.attrib['href'] = 'navigate/%s%s' % ('/'.join(object_id.split()[:2]),
                                                                  '#%s-link-back' % el.attrib['id'])
                    link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                    link_back.attrib['role'] = "button"
                    link_back.text = "Go back to text"
                    el.append(link_back)
                else:  ## inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"
                    for child in el:
                        child = note_content(child)
                    # insert an anchor before this element by scanning through the parent
                    parent = el.getparent()
                    for i, child in enumerate(parent):
                        if child == el:
                            attribs = {"class": "note",
                                       "tabindex": "0",
                                       "data-toggle": "popover",
                                       "data-container": "body",
                                       "data-placement": "right",
                                       "data-trigger": "focus"}
                            parent.insert(i, etree.Element("a", attrib=attribs))
                            new_anchor = parent[i]
                            new_anchor.text = "note"
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "pb" and "n" in el.attrib:
                if "fac" in el.attrib or "id" in el.attrib:
                    if "fac" in el.attrib:
                        img = el.attrib["fac"]
                    else:
                        img = el.attrib["id"]
                    current_obj_img.append(img)
                    el.tag = "p"
                    el.append(etree.Element("a"))
                    el[-1].attrib["href"] = config.page_images_url_root + '/' + img
                    el[-1].text = "[page " + el.attrib["n"] + "]"
                    el[-1].attrib['class'] = "page-image-link"
                    el[-1].attrib['data-gallery'] = ''
            elif el.tag == "figure":
                if el[0].tag == "graphic":
                    img_url = el[0].attrib["url"].replace(":", "_")
                    volume = re.match("\d+", img_url).group()
                    url_prefix = config.page_images_url_root + '/V' + volume + "/plate_"
                    el.tag = "span"
                    el.attrib["href"] = url_prefix + img_url + ".jpeg"
                    el[0].tag = "img"
                    el[0].attrib["src"] = url_prefix + img_url + ".sm.jpeg"
                    el[0].attrib["class"] = "inline-img"
                    el.attrib["class"] = "inline-img-container"
                    del el[0].attrib["url"]
                    clear_float = etree.Element("span")
                    clear_float.attrib['style'] = 'clear:both;'
                    el[0].append(clear_float)
            elif el.tag == "philoHighlight":
                word_match = re.match(word_regex, el.tail, re.U)
                if word_match:
                    el.text = el.tail[:word_match.end()]
                    el.tail = el.tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in valid_html_tags:
                el = xml_to_html_class(el)
        except Exception as e:
            import sys
            print >> sys.stderr, e
            pass
    output = etree.tostring(xml)
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output.decode('utf-8', 'ignore')).encode('utf-8')

    if note:  ## Notes don't need to fetch images
        return (output, {})

    ## Page images
    output, img_obj = page_images(config, output, current_obj_img, philo_id)

    return output, img_obj

Python FragmentParser, PhiloLogic4の例