Example #1
def repairXML(data, mtype="", indent_chars="  "):
    newdata = _remove_xml_header(data)
    # if well-formed - don't mess with it
    if _well_formed(newdata):
        return data
    newdata = _make_it_sane(newdata)
    if not _well_formed(newdata):
        newdata = _reformat(newdata)
        if mtype == "application/oebps-package+xml":
            newdata = newdata.decode('utf-8')
            newdata = Opf_Parser(newdata).rebuild_opfxml()
    # lxml requires utf-8 on Mac, won't work with unicode
    if isinstance(newdata, str):
        newdata = newdata.encode('utf-8')
    voidtags = get_void_tags(mtype)
    xmlbuilder = LXMLTreeBuilderForXML(parser=None,
                                       empty_element_tags=voidtags)
    soup = BeautifulSoup(newdata,
                         features=None,
                         from_encoding="utf-8",
                         builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0,
                             formatter='minimal',
                             indent_chars=indent_chars)
    return newdata
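A minimal usage sketch for the function above (the malformed OPF snippet and the call are illustrative; repairXML and its helpers are assumed to be importable from this module):

broken_opf = '''<?xml version="1.0" encoding="utf-8"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf">
  <metadata><meta name="cover" content="cover.jpg"></metadata>
</package>'''
# For the OPF media type, markup that is still not well-formed is routed
# through Opf_Parser before being pretty-printed.
fixed = repairXML(broken_opf, mtype="application/oebps-package+xml")
print(fixed)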
Example #2
def anchorNCXUpdates(data, originating_filename, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    id_dict = {}
    for i in range(0, len(keylist)):
        id_dict[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None,
                                       empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data,
                         features=None,
                         from_encoding="utf-8",
                         builder=xmlbuilder)
    original_filename_with_relative_path = TEXT_FOLDER_NAME + "/" + originating_filename
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if (parts is not None) and (len(parts) > 1) and (
                        parts[0] == original_filename_with_relative_path) and (
                            parts[1] != ""):
                    fragment_id = parts[1]
                    if fragment_id in id_dict:
                        attribute_value = TEXT_FOLDER_NAME + "/" + quoteurl(
                            id_dict[fragment_id]) + "#" + fragment_id
                        tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0,
                             formatter='minimal',
                             indent_chars="  ")
    return newdata
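keylist and valuelist form a serialized dictionary passed as two parallel lists: each fragment id maps to the filename that now contains it. A hypothetical call (file name and ids invented for illustration):

with open("toc.ncx", "r", encoding="utf-8") as f:   # assumed input file
    ncxdata = f.read()
# anchors 'note1' and 'note2' were moved out of chapter01.xhtml by a split
newncx = anchorNCXUpdates(ncxdata,
                          "chapter01.xhtml",
                          ["note1", "note2"],
                          ["chapter01_0002.xhtml", "chapter01_0002.xhtml"])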
Example #3
def performNCXSourceUpdates(data, currentdir, keylist, valuelist):
    # rebuild serialized lookup dictionary
    updates = {}
    for i in range(0, len(keylist)):
        updates[ keylist[i] ] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                url = parts[0]
                fragment = ""
                if len(parts) > 1:
                    fragment = parts[1]
                bookrelpath = os.path.join(currentdir, unquoteurl(url))
                bookrelpath = os.path.normpath(bookrelpath)
                bookrelpath = bookrelpath.replace(os.sep, "/")
                if bookrelpath in updates:
                    attribute_value = updates[bookrelpath]
                    if fragment != "":
                        attribute_value = attribute_value + "#" + fragment
                    attribute_value = quoteurl(attribute_value)
                    tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
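The normalization above turns each relative src href into the book-relative key format used in updates; a small stand-alone sketch of that idiom (values are illustrative, and urllib's unquote stands in for unquoteurl):

import os
from urllib.parse import unquote  # stand-in for unquoteurl

currentdir = "OEBPS/Text"
src = "../Text/chapter%2001.xhtml#part2"
url, _, fragment = src.partition("#")
bookrelpath = os.path.normpath(os.path.join(currentdir, unquote(url))).replace(os.sep, "/")
# bookrelpath is now "OEBPS/Text/chapter 01.xhtml" -- the key looked up in updates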
Example #4
def performPageMapUpdates(data, currentdir, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary of xml_updates properly adjusted
    updates = {}
    for i in range(0, len(keylist)):
        updates[ keylist[i] ] = "../" + valuelist[i]
    xml_empty_tags = ["page"]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["page"]):
        for att in ["href"]:
            if att in tag.attrs :
                ref = tag[att]
                if ref.find(":") == -1 :
                    parts = ref.split('#')
                    url = parts[0]
                    fragment = ""
                    if len(parts) > 1:
                        fragment = parts[1]
                    bookrelpath = os.path.join(currentdir, unquoteurl(url))
                    bookrelpath = os.path.normpath(bookrelpath)
                    bookrelpath = bookrelpath.replace(os.sep, "/")
                    if bookrelpath in updates:
                        attribute_value = updates[bookrelpath]
                        if fragment != "":
                            attribute_value = attribute_value + "#" + fragment
                        attribute_value = quoteurl(attribute_value)
                        tag[att] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
Example #5
def performPageMapUpdates(data, newbkpath, oldbkpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary of xml_updates properly adjusted
    updates = {}
    for i in range(0, len(keylist)):
        updates[ keylist[i] ] = valuelist[i]
    xml_empty_tags = ["page"]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["page"]):
        for att in ["href"]:
            if att in tag.attrs :
                ref = tag[att]
                if ref.find(":") == -1 :
                    parts = ref.split('#')
                    ahref = unquoteurl(parts[0])
                    fragment = ""
                    if len(parts) > 1:
                        fragment = parts[1]
                    oldtarget = buildBookPath(ahref, startingDir(oldbkpath))
                    newtarget = updates.get(oldtarget, oldtarget)
                    attribute_value = buildRelativePath(newbkpath, newtarget)
                    if fragment != "":
                        attribute_value = attribute_value + "#" + fragment
                    attribute_value = quoteurl(attribute_value)
                    tag[att] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
Example #6
def anchorNCXUpdates(data, ncx_bookpath, originating_bookpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    id_dict = {}
    for i in range(0, len(keylist)):
        id_dict[ keylist[i] ] = valuelist[i]
    startdir = startingDir(ncx_bookpath)
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                ahref = unquoteurl(parts[0])
                # convert this href to its target bookpath
                target_bookpath = buildBookPath(ahref,startdir)
                if (parts is not None) and (len(parts) > 1) and (target_bookpath == originating_bookpath) and (parts[1] != ""):
                    fragment_id = parts[1]
                    if fragment_id in id_dict:
                        target_bookpath = id_dict[fragment_id]
                        attribute_value = buildRelativePath(ncx_bookpath, target_bookpath) + "#" + fragment_id
                        tag["src"] = quoteurl(attribute_value)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
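Examples #5 and #6 resolve hrefs through book paths rather than a current directory. A rough sketch of what startingDir, buildBookPath and buildRelativePath are assumed to do (illustrative stand-ins, not the real Sigil helpers):

import posixpath

def _starting_dir(bookpath):               # assumed: folder part of a bookpath
    return posixpath.dirname(bookpath)

def _build_bookpath(href, starting_dir):   # assumed: href resolved against that folder
    return posixpath.normpath(posixpath.join(starting_dir, href))

def _build_relative_path(from_bookpath, to_bookpath):   # assumed: href leading from one bookpath to another
    return posixpath.relpath(to_bookpath, posixpath.dirname(from_bookpath))

print(_build_bookpath("../Text/ch01.xhtml", "OEBPS/Misc"))             # OEBPS/Text/ch01.xhtml
print(_build_relative_path("OEBPS/toc.ncx", "OEBPS/Text/ch01.xhtml"))  # Text/ch01.xhtml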
Example #7
def diagnose(data):
    """Diagnostic suite for isolating common problems."""
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    for name in basic_parsers:
        for builder in builder_registry.builders:
            if name in builder.features:
                break
        else:
            basic_parsers.remove(name)
            print("I noticed that %s is not installed. Installing it may help." % name)

    if 'lxml' in basic_parsers:
        basic_parsers.append(["lxml", "xml"])
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
        except ImportError as e:
            print("lxml is not installed or couldn't be imported.")


    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError as e:
            print("html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        data = open(data).read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print()

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
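diagnose() accepts a markup string, a file-like object, or a path to an existing file; a quick usage sketch:

diagnose("<p>Unclosed <b>markup")   # raw markup string
# diagnose(open("page.html"))       # file-like object: its read() result is used
# diagnose("page.html")             # existing filename: the file is read from disk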
Example #8
def repairXML(data, self_closing_tags=ebook_xml_empty_tags, indent_chars="  "):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=self_closing_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata
Example #9
def repairXML(data, mtype="", indent_chars="  "):
    data = _remove_xml_header(data)
    data = _make_it_sane(data)
    voidtags = get_void_tags(mtype)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=voidtags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata
Example #10
def repairXML(data, self_closing_tags=ebook_xml_empty_tags, indent_chars="  "):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None,
                                       empty_element_tags=self_closing_tags)
    soup = BeautifulSoup(data,
                         features=None,
                         from_encoding="utf-8",
                         builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0,
                             formatter='minimal',
                             indent_chars=indent_chars)
    return newdata
Example #11
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b - a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b - a))
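A smaller run keeps the timings short; the default of 100,000 elements generates a large random document via rdoc():

if __name__ == "__main__":
    benchmark_parsers(10000)   # prints one timing line per parser, then raw lxml and html5lib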
Example #12
def repairXML(data, mtype="", indent_chars="  "):
    newdata = _remove_xml_header(data)
    # if well-formed - don't mess with it
    if _well_formed(newdata):
        return data
    newdata = _make_it_sane(newdata)
    if not _well_formed(newdata):
        newdata = _reformat(newdata)
        if mtype == "application/oebps-package+xml":
            newdata = newdata.decode('utf-8')
            newdata = Opf_Parser(newdata).rebuild_opfxml()
    # lxml requires utf-8 on Mac, won't work with unicode
    if isinstance(newdata, str):
        newdata = newdata.encode('utf-8')
    voidtags = get_void_tags(mtype)
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=voidtags)
    soup = BeautifulSoup(newdata, features=None, from_encoding="utf-8", builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata
Example #13
def anchorNCXUpdates(data, originating_filename, keylist, valuelist):
    # rebuild serialized lookup dictionary
    id_dict = {}
    for i in range(0, len(keylist)):
        id_dict[ keylist[i] ] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    original_filename_with_relative_path = TEXT_FOLDER_NAME  + "/" + originating_filename
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if (parts is not None) and (len(parts) > 1) and (parts[0] == original_filename_with_relative_path) and (parts[1] != ""):
                    fragment_id = parts[1]
                    if fragment_id in id_dict:
                        attribute_value = TEXT_FOLDER_NAME + "/" + quoteurl(id_dict[fragment_id]) + "#" + fragment_id
                        tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
Example #14
def anchorNCXUpdatesAfterMerge(data, ncx_bookpath, sink_bookpath, merged_bookpaths):
    data = _remove_xml_header(data)
    startdir = startingDir(ncx_bookpath)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if parts is not None:
                    ahref = unquoteurl(parts[0])
                    target_bookpath = buildBookPath(ahref, startdir)
                    if target_bookpath in merged_bookpaths:
                        attribute_value = buildRelativePath(ncx_bookpath, sink_bookpath)
                        if len(parts) > 1 and parts[1] != "":
                            attribute_value += "#" + parts[1]
                        tag["src"] = quoteurl(attribute_value)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
Example #15
def performPageMapUpdates(data, currentdir, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary of xml_updates properly adjusted
    updates = {}
    for i in range(0, len(keylist)):
        updates[keylist[i]] = "../" + valuelist[i]
    xml_empty_tags = ["page"]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None,
                                       empty_element_tags=xml_empty_tags)
    soup = BeautifulSoup(data,
                         features=None,
                         from_encoding="utf-8",
                         builder=xmlbuilder)
    for tag in soup.find_all(["page"]):
        for att in ["href"]:
            if att in tag.attrs:
                ref = tag[att]
                if ref.find(":") == -1:
                    parts = ref.split('#')
                    url = parts[0]
                    fragment = ""
                    if len(parts) > 1:
                        fragment = parts[1]
                    bookrelpath = os.path.join(currentdir, unquoteurl(url))
                    bookrelpath = os.path.normpath(bookrelpath)
                    bookrelpath = bookrelpath.replace(os.sep, "/")
                    if bookrelpath in updates:
                        attribute_value = updates[bookrelpath]
                        if fragment != "":
                            attribute_value = attribute_value + "#" + fragment
                        attribute_value = quoteurl(attribute_value)
                        tag[att] = attribute_value
    newdata = soup.decodexml(indent_level=0,
                             formatter='minimal',
                             indent_chars="  ")
    return newdata
Example #16
    def ProcessTextFile(self):
        """
        This method runs when the button marked 'Get text file' is clicked
        """
        #Request name of file to open
        FILEOPENOPTIONS = dict(title='Choose a text file to import',
                               initialfile='',
                               filetypes=[('Text files', ('.txt')),
                                          ('All files', ('*.*'))])
        fHandle = filedialog.askopenfile(**FILEOPENOPTIONS)
        if fHandle is None:  # user cancelled the file dialog
            return

        #Get the encoding of the text file
        with open(fHandle.name, "rb") as binary_file:
            data = binary_file.read()
            soup = BeautifulSoup(data)

        #Read the file
        with open(fHandle.name, 'rt', encoding=soup.original_encoding) as f:
            content = f.readlines()
            content = [x.strip() for x in content]

        #Replace angular brackets if required
        if self.is_checked.get() == True:
            print("Changing brackets")
            content = [x.replace('<', '&lt;') for x in content]
            content = [x.replace('>', '&gt;') for x in content]
        """
        #Replace newlines with paragraph tags
        bodyText = bodyText.replace('\n', '</p>\n\n<p>').replace('\r', '')
        bodyText= bodyText.replace('<p></p>', '<p>&nbsp;</p>')
        """

        # use PTagger to tag new lines
        if self.rule is None:
            self.set_rules()

        tagger = PTagger(self.rule)
        content = tagger.tag(content)

        #Now write the xhtml file
        xml = '<?xml version="1.0" encoding="utf-8"?>\n'
        xml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n\n'
        xml += '<html xmlns="http://www.w3.org/1999/xhtml">\n'
        xml += '<head>\n'
        xml += '<title></title>\n'
        xml += '</head>\n'
        xml += '<body>\n'
        for row in content:
            xml += row + '\n'
        xml += '</body>\n'
        xml += '</html>\n'

        #Set the name of the new xhtml section in the ePub to that of the filename
        Filename = fHandle.name
        head, fName = ntpath.split(fHandle.name)
        ChapterName = fName[:fName.index(".")]  #Remove extension

        #Check whether this file already exists in the ePub
        for (id, href) in self.bk.text_iter():
            if id == ChapterName or href == 'Text/' + ChapterName + '.xhtml':  #If the section already exists
                reply = tkMessageBox.askquestion(
                    "WARNING",
                    "Do you want to delete the current page named " +
                    ChapterName + ".xhtml?")
                if reply == "yes":  #and it is not wanted
                    self.bk.deletefile(id)  #then delete it
                else:  #otherwise do not import the text file
                    print("Present xhtml page has been retained.")
                    return

        #Add text file to ePub in a new xhtml section
        uid = ChapterName
        basename = uid + '.xhtml'
        mime = 'application/xhtml+xml'
        self.bk.addfile(uid, basename, xml, mime)
Example #17
def run(bk):
    # set Tk parameters for dialog box
    root = Tk()
    root.geometry("320x200+400+400")
    app = Dialog(root, bk)
    if not isosx:
        icon_img = PhotoImage(file=os.path.join(
            bk._w.plugin_dir, bk._w.plugin_name, 'sigil.png'))
        root.tk.call('wm', 'iconphoto', root._w, icon_img)
    root.mainloop()

    if Cancel == True:
        print(
            'Plugin terminated by user.\nPlease click OK to close the Plugin Runner window.'
        )
        return -1

    # --------------------------------------
    # get preferences
    # --------------------------------------
    prefs = bk.getPrefs()

    # id prefix for <sup> footnote anchors
    fnanchor_id = prefs['anchorid']

    # id prefix for <p> footnote definitions
    fndef_id = prefs['fndefid']

    # class for <a> backlink numbers in footnote definitions file
    backlink_class = prefs['backlink']

    kindle_compat = prefs['kindle']
    ibooks_compat = prefs['ibooks']
    notesource = prefs['notesource']

    # debug mode
    if 'debug' not in prefs:
        prefs['debug'] = False
        bk.savePrefs(prefs)
    debug = prefs['debug']

    # get epub version number
    if bk.launcher_version() >= 20160102:
        epubversion = bk.epub_version()
    else:
        epubversion = BeautifulSoup(bk.get_opf(),
                                    'lxml').find('package')['version']

    # -------------------------
    # footnote linking process
    # -------------------------
    template_anchor = '''<a class="duokan-footnote" href="#{fndef_id}{id}" id="{fnanchor_id}{id}"><img alt="" src="../Images/note.png"/></a>'''
    template_def = '''
	  <li class="duokan-footnote-item" id="{fndef_id}{id}">
		<a class="{backlink_class}" href="#{fnanchor_id}{id}">◎</a>{text}​​​​​​​​</li>\n</ol>'''
    if kindle_compat and ibooks_compat:
        template_anchor = '''<a style="text-decoration:none!important;color:black;" class="duokan-footnote" epub:type="noteref" href="#{fndef_id}{id}" id="{fnanchor_id}{id}"><img alt="" src="../Images/note.png"/></a>'''
        template_def = '''
	  <li class="duokan-footnote-item" id="{fndef_id}{id}">
		<p><a class="{backlink_class}" style="text-decoration:none!important;color:black;" href="#{fnanchor_id}{id}">◎</a>{text}​​​​​​​​</p></li>\n</ol>'''
    else:
        if kindle_compat:
            template_anchor = '''<a style="text-decoration:none!important;color:black;" class="duokan-footnote" href="#{fndef_id}{id}" id="{fnanchor_id}{id}"><img alt="" src="../Images/note.png"/></a>'''
            template_def = '''
		  <li class="duokan-footnote-item" id="{fndef_id}{id}">
			<p><a class="{backlink_class}" style="text-decoration:none!important;color:black;" href="#{fnanchor_id}{id}">◎</a>{text}​​​​​​​​</p></li>\n</ol>'''
        if ibooks_compat:
            template_anchor = '''<a class="duokan-footnote" epub:type="noteref" href="#{fndef_id}{id}" id="{fnanchor_id}{id}"><img alt="" src="../Images/note.png"/></a>'''
            template_def = '''
		  <li class="duokan-footnote-item" id="{fndef_id}{id}">
			<a class="{backlink_class}" style="color:black;" href="#{fnanchor_id}{id}">◎</a>{text}​​​​​​​​</li>\n</ol>'''

    anchor_count = 0
    def_count = 0
    pattern_anchor = re.compile(r'(?<!<p>)\[\d+\]')
    pattern_def = re.compile(r'\<p\>\[\d+\](.+)\<\/p\>')

    # validate note source
    note_html = None
    note_html_original = note_html
    if notesource:
        if not notesource.startswith('Text/'):
            notesource = 'Text/' + notesource
        temp_list = [
            opf_href for (manifest_id, linear, opf_href) in bk.spine_iter()
        ]
        if notesource in temp_list:
            iter_list = [(manifest_id, linear, opf_href)
                         for (manifest_id, linear,
                              opf_href) in bk.spine_iter()
                         if opf_href != notesource]
            note_html = bk.readfile(bk.href_to_id(notesource))
    else:
        iter_list = list(bk.spine_iter())

    for (manifest_id, linear, opf_href) in iter_list:
        print('-' * 20, opf_href, '-' * 20)
        html = bk.readfile(manifest_id)
        html_original = html

        note_anchor = re.search(pattern_anchor, html)
        if note_anchor is not None:  # only once for each file with notes
            html = re.sub(
                r'\<\/head\>',
                r'<link href="../Styles/footnote.css" rel="stylesheet" type="text/css"/>\n</head>',
                html,
            )

            if ibooks_compat:
                html = re.sub(
                    r'\<\/body\>',
                    r'<aside epub:type="footnote">\n<ol class="duokan-footnote-content">\n</ol>\n</aside>\n</body>',
                    html,
                )
                soup = BeautifulSoup(html, 'html.parser')
                soup.html['xmlns:epub'] = 'http://www.idpf.org/2007/ops'
                bk.writefile(manifest_id, str(soup))
                del soup
                # update html string
                html = bk.readfile(manifest_id)
                html_original = html
            else:
                html = re.sub(
                    r'\<\/body\>',
                    r'<ol class="duokan-footnote-content">\n</ol>\n</body>',
                    html,
                )

            local_count = 0
            while note_anchor is not None:
                anchor_count = anchor_count + 1
                local_count += 1
                template = template_anchor.format(id=anchor_count,
                                                  fnanchor_id=fnanchor_id,
                                                  fndef_id=fndef_id)
                html = re.sub(pattern_anchor, template, html, 1)
                print('Anchor No.' + str(anchor_count) + ': ' +
                      note_anchor.group(0).strip('[]^'))
                note_anchor = re.search(pattern_anchor, html)

            if note_html:
                note_def = re.search(pattern_def, note_html)
                for i in range(1, local_count + 1):
                    def_count = def_count + 1
                    note_html = re.sub(pattern_def, r'', note_html, 1)
                    template = template_def.format(
                        id=def_count,
                        text=note_def.group(1).strip('[]^'),
                        fnanchor_id=fnanchor_id,
                        fndef_id=fndef_id,
                        backlink_class=backlink_class,
                    )
                    html = re.sub(r'\<\/ol\>', template, html, 1)
                    print('Note No.' + str(def_count) + ': ' +
                          note_def.group(1))
                    note_def = re.search(pattern_def, note_html)
            else:
                note_def = re.search(pattern_def, html)
                while note_def is not None:
                    def_count = def_count + 1
                    html = re.sub(pattern_def, r'', html, 1)
                    template = template_def.format(
                        id=def_count,
                        text=note_def.group(1).strip('[]^'),
                        fnanchor_id=fnanchor_id,
                        fndef_id=fndef_id,
                        backlink_class=backlink_class,
                    )
                    html = re.sub(r'\<\/ol\>', template, html, 1)
                    print('Note No.' + str(def_count) + ': ' +
                          note_def.group(1))
                    note_def = re.search(pattern_def, html)
        else:
            print("No note is found")

        if not html == html_original:
            bk.writefile(manifest_id, html)

    if not note_html == note_html_original:
        bk.writefile(bk.href_to_id(notesource), note_html)
        print('\nInfo: Remember to delete footnote source file %s manually.' %
              notesource)

    insert_note_css(bk, backlink_class=backlink_class)

    print(
        "\nInfo: Footnote generation succeeded, after which you'd better beautify all text files."
    )
    return 0
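The two regular expressions carry the whole footnote pass: pattern_anchor matches inline [n] markers that do not open a paragraph, while pattern_def captures the text of whole <p>[n] ...</p> definition lines. A small illustration with made-up markup:

import re

pattern_anchor = re.compile(r'(?<!<p>)\[\d+\]')
pattern_def = re.compile(r'\<p\>\[\d+\](.+)\<\/p\>')

sample = '<p>Some text[1] with a note.</p>\n<p>[1] The note definition.</p>'
print(pattern_anchor.search(sample).group(0))   # '[1]'  -- the inline anchor
print(pattern_def.search(sample).group(1))      # ' The note definition.'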
Example #18
def repairXML(data, self_closing_tags=ebook_xml_empty_tags, indent_chars="  "):
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=self_closing_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata
Example #19
def run(bk):
    # get epub version number
    if bk.launcher_version() >= 20160102:
        epubversion = bk.epub_version()
    else:
        epubversion = BeautifulSoup(bk.get_opf(), 'lxml').find('package')['version']

    # get preferences
    prefs = bk.getPrefs()
    if prefs == {}:
        prefs['tag'] = 'span'
        prefs['attribute'] = 'epub:type'
        prefs['value'] = 'pagebreak'
        bk.savePrefs(prefs)
        prefs = bk.getPrefs()
    tag = prefs['tag']
    attribute = prefs['attribute']
    value = prefs['value']

    # get nav doc and toc.ncx ids
    nav_id = ncx_id = None
    ncx_id = bk.gettocid()

    if epubversion.startswith('3'):
        opf_soup = BeautifulSoup(bk.get_opf(), 'lxml')
        if opf_soup.find('item', {'properties' : 'nav'}) is not None:
            nav_id = opf_soup.find('item', {'properties' : 'nav'})['id']
        else:
            print('Nav document ID not found!')
                
    ncx_pagelist = '\n  <pageList>\n    <navLabel>\n      <text>Pages</text>\n    </navLabel>'
    nav_pagelist = '    <nav epub:type="page-list" id="page-list">\n      <ol>\n'
    page_targets = 0
       

    substitutePageNum(bk)


    # get all html files
    page_dic = {}
    errors = 0
    for (html_id, href) in bk.text_iter():
        html = bk.readfile(html_id)
    
        
        # load html code into BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        
        # find pagebreaks
        page_numbers = soup.find_all(tag, {attribute : value})
        if not page_numbers:
            print('\nNo page number targets found in ' + os.path.basename(href))
        else:
            page_targets += len(page_numbers)
            print('\n' + str(len(page_numbers)) + ' page number targets found in ' + os.path.basename(href))
        
        # add pagelist entries to pagelist
        for page_number in page_numbers:
            
            # title has priority over string
            if page_number.has_attr('title'):
                title = page_number['title']
            else:
                title = page_number.contents[0]

            # generate id, if necessary
            if not page_number.has_attr('id'):
                id = 'page' + title
            else:
                id = page_number['id']

            # check for duplicate titles/ids
            if title not in page_dic:
                page_dic[title] = os.path.basename(href + '#' + id)
            else:
                errors += 1
                page_dic[title] += ' / ' + os.path.basename(href + '#' + id)
                print('ERROR: duplicate page number found:', title, page_dic[title])
            
            # epub2
            ncx_pagelist += '''\n    <pageTarget id="{}" type="normal" value="{}">
      <navLabel>
        <text>{}</text>
      </navLabel>
      <content src="{}"/>
    </pageTarget>'''.format(id, title, title, href + '#' + id)
            
            # epub3
            if nav_id:
                nav_pagelist += '        <li>\n          <a href="{}">{}</a>\n        </li>\n'.format('../' + href + '#' + id, title)
    
    if errors != 0:
        print('Plugin aborted because of {} duplicate page number(s).'.format(str(errors)))
        return -1
    
    # add/replace NCX pagelist section
    if page_targets:
        ncx_pagelist += '\n  </pageList>'
        if ncx_id: 
            # get ncx contents
            ncx = bk.readfile(ncx_id)
            # delete existing pagelist
            ncx = re.sub(r'\s*<pageList[^>]*>.+?</pageList>\s*', '', ncx, flags=re.DOTALL)
            # add new pagelist
            ncx = ncx.replace('</ncx>', ncx_pagelist + '\n</ncx>')
            # update ncx file
            bk.writefile(ncx_id, ncx)
            print('\n' + str(page_targets) + ' page number targets found.\nNCX file updated. ')
        else:
            print('\nNCX file couldn\'t be found and updated.')
    else:
        print('\nNo page number targets found.\nNCX file not updated')

    # add/replace NAV pagelist section
    if nav_id:
        nav_pagelist += '      </ol>\n    </nav>'
        new_pagelist = BeautifulSoup(nav_pagelist, 'html.parser')
        # get nav contents
        nav = bk.readfile(nav_id)
        nav_soup = BeautifulSoup(nav, 'html.parser')
        orig_nav_soup = str(nav_soup)
        old_page_list = nav_soup.find('nav', {'epub:type' : 'page-list'})
        if old_page_list is not None:
            old_page_list.replace_with(new_pagelist)
            #print('Existing page-list updated.')
        else:
            nav_soup.body.insert(2, new_pagelist)
            #print('New page-list section added.')
        # update nav
        if str(nav_soup) != orig_nav_soup:
            try:
                bk.writefile(nav_id, str(nav_soup.prettyprint_xhtml(indent_level=0, eventual_encoding="utf-8", formatter="minimal", indent_chars="  ")))
            except:
                bk.writefile(nav_id, str(nav_soup))
            print('NAV file updated.')
        else:
            print('NAV file NOT updated.')
            
    print('\nPlease click OK to close the Plugin Runner window.')

    return 0
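With the default preferences, the page targets this plugin collects look like the span below; a minimal illustration of the find_all() lookup (markup invented for the example):

from bs4 import BeautifulSoup

html = '<p>... main text <span epub:type="pagebreak" id="page12" title="12"></span> more text ...</p>'
soup = BeautifulSoup(html, 'html.parser')
for span in soup.find_all('span', {'epub:type': 'pagebreak'}):
    print(span.get('title'), span.get('id'))   # 12 page12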