Example #1
0
def delete_xhtml_attributes(bk, attributes: dict,
                            prefs: MutableMapping) -> None:
    """
    Remove the ids and classes listed in ``attributes`` from every XHTML
    file in the book, writing the modified markup back through ``bk``.

    :param bk: book container (provides text_iter/readfile/writefile) —
        presumably Sigil's BookContainer; confirm against the plugin host
    :param attributes: mapping with 'ids' and 'classes' collections of
        attribute values to delete
    :param prefs: plugin preferences; honors 'parse_only_selected_files'
        and 'selected_files'
    """
    for xhtml_id, xhtml_href in bk.text_iter():
        # optionally restrict processing to the user-selected files
        if prefs['parse_only_selected_files'] and xhtml_href not in prefs[
                'selected_files']:
            continue
        soup = gumbo_bs4.parse(bk.readfile(xhtml_id))
        for elem in soup.find_all(True):
            # EAFP: elements without an 'id' attribute raise KeyError here
            try:
                if elem['id'] in attributes['ids']:
                    del elem['id']
            except KeyError:
                pass
            # NOTE: when 'class' is list-valued, ``classes`` aliases the
            # attribute itself, so the .remove() calls below empty it in
            # place; the isinstance branch only triggers for string-valued
            # 'class' attributes and makes a detached copy instead.
            classes = elem.get('class', [])
            if isinstance(classes, str):
                classes = [classes]
            for class_ in classes.copy():
                if class_ in attributes['classes']:
                    try:
                        elem['class'].remove(class_)
                    except AttributeError:
                        # string-valued 'class' has no .remove(); drop the
                        # whole attribute instead
                        del elem['class']  # this should never raise a KeyError
            # I don't know if it's linked to python, sigil, beautifulsoup or gumbo versions:
            # with some installation the elements keep empty class attributes.
            try:
                if not classes:
                    del elem['class']
            except KeyError:
                pass
        bk.writefile(xhtml_id, soup.serialize_xhtml())
Example #2
0
    def remove_mo_attributes(self, data, remove_class=True, remove_id=True):
        """
        Remove MO attributes to tags in the given XHTML file,
        and return the resulting XHTML string.

        :param data: the source code
        :type  data: str
        :param remove_class: remove the MO class attribute
        :type  remove_class: bool
        :param remove_id: remove the MO id attribute
        :type  remove_id: bool
        """
        msgs = []
        # nothing requested => hand the source back untouched
        if not (remove_class or remove_id):
            return (msgs, data)
        import sigil_gumbo_bs4_adapter as gumbo_bs4
        soup = gumbo_bs4.parse(data)
        for node in soup.find_all():
            # only elements of the configured tag set carrying the MO class
            if node.name not in self.tags:
                continue
            if not self.has_mo_class(node):
                continue
            if remove_class:
                self.remove_mo_class(node)
                msgs.append(("INFO", "removed class 'mo' from element '%s'" % (node.name)))
            if remove_id:
                if self.has_mo_id(node):
                    if self.existing_ids_only:
                        # ids that pre-existed in the book must be preserved
                        msgs.append(("WARN", "element '%s' with MO id '%s' => not removing" % (node.name, node.attrs["id"])))
                    else:
                        previous_id = node.attrs["id"]
                        self.remove_id_attribute(node)
                        msgs.append(("INFO", "removed id '%s' from element '%s'" % (previous_id, node.name)))
                elif self.has_id_not_mo(node):
                    msgs.append(("WARN", "element '%s' with id '%s' => not removing" % (node.name, node.attrs["id"])))
        return (msgs, self.output_xhtml_code(soup))
Example #3
0
def build_html(fragment, css=False):
    """Wrap *fragment* in the HTML template and return it as serialized XHTML."""
    # keep empty paragraphs visible by filling them with a non-breaking space
    fragment = regex.sub(r'<p([^>]*)></p>', r'<p\1>&#160;</p>', fragment)
    css_link = LINK_TEXT if css else ''
    document = HTML.format(css_link, fragment)
    return gumbo_bs4.parse(document).serialize_xhtml()
def build_html(fragment, css=False):
    """Build a full XHTML document around *fragment*, optionally linking the stylesheet."""
    # give empty <p> elements a &#160; so renderers don't collapse them
    filled = regex.sub(r'<p([^>]*)></p>', r'<p\1>&#160;</p>', fragment)
    link = ''
    if css:
        link = LINK_TEXT
    soup = gumbo_bs4.parse(HTML.format(link, filled))
    return soup.serialize_xhtml()
    def makeEPUB(self):
        """
        Convert the intermediate Mobi markup into an EPUB archive.

        Runs the MobiML -> XHTML conversion, pretty-prints and writes the
        XHTML and CSS, patches the OPF manifest media-types, then packs
        everything into ``self.epubname``.

        :return: the path of the generated EPUB file
        """
        out_enc = find_output_encoding(self.opffile)
        print('Markup encoded as:', out_enc)
        ml2html = MobiMLConverter(self.htmlfile, out_enc)
        xhtmlstr, css, cssname = ml2html.processml()
        soup = gumbo_bs4.parse(xhtmlstr)
        xhtmlstr = soup.prettyprint_xhtml()
        # use context managers so the handles are closed deterministically
        with file_open(self.htmlfile, 'wb') as f:
            f.write(xhtmlstr.encode('utf-8'))
        if has_cssutils:
            sheet = cssutils.parseString(css)
            cssutils.ser.prefs.indent = 2 * ' '
            cssutils.ser.prefs.indentClosingBrace = False
            cssutils.ser.prefs.omitLastSemicolon = False
            css = unicode_str(sheet.cssText)
        with file_open(cssname, 'wb') as f:
            f.write(css.encode('utf-8'))

        with file_open(self.opffile, 'r', encoding='utf-8') as fp:
            newopf = ''
            for line in fp:
                if line.startswith('<item'):
                    # BUGFIX: str.find() returns -1 (truthy) when absent, so
                    # the old ``if line.find(...)`` tests were always true;
                    # use explicit membership tests instead.
                    if 'text/x-oeb1-document' in line:
                        line = line.replace('text/x-oeb1-document',
                                            'application/xhtml+xml')
                    if 'text/html' in line:
                        line = line.replace('text/html',
                                            'application/xhtml+xml')
                newopf += line
                if line.startswith('<manifest>'):
                    newopf += '<item id="css_file" media-type="text/css" href="styles.css" />\n'

        with file_open(self.opffile, 'wb') as f:
            f.write(newopf.encode('utf-8'))

        # add the mimetype file uncompressed
        mimetype = 'application/epub+zip'
        fileout = os.path.join(self.outdir, 'mimetype')
        with file_open(fileout, 'wb') as f:
            f.write(mimetype.encode('utf-8'))

        with zipfile.ZipFile(self.epubname, 'w') as outzip:
            # BUGFIX: ZipInfo() takes no compress_type keyword argument
            # (that would raise TypeError); compress_type is an attribute.
            nzinfo = ZipInfo('mimetype')
            nzinfo.compress_type = zipfile.ZIP_STORED
            outzip.writestr(nzinfo, mimetype)

            self.zipUpDir(outzip, self.outdir, 'META-INF')
            if os.path.exists(os.path.join(self.outdir, 'Images')):
                self.removeThumbnailImage(os.path.join(self.outdir, 'Images'))
                self.zipUpDir(outzip, self.outdir, 'Images')

            outzip.write(self.htmlfile, os.path.basename(self.htmlfile),
                         zipfile.ZIP_DEFLATED)
            outzip.write(self.opffile, os.path.basename(self.opffile),
                         zipfile.ZIP_DEFLATED)
            outzip.write(cssname, 'styles.css', zipfile.ZIP_DEFLATED)

            if os.path.exists(os.path.join(self.outdir, 'toc.ncx')):
                outzip.write(os.path.join(self.outdir, 'toc.ncx'), 'toc.ncx',
                             zipfile.ZIP_DEFLATED)
        return self.epubname
Example #6
0
 def create_dummy_smil_file(self, t_href, t_mid, a_href, smil_mid):
     """
     This function is not currently used.

     Build a SMIL file with one row per matching MO element (all clip
     times "0.000") and add it to the book; return its href, or None
     when the text file contains no MO elements.
     """
     import sigil_gumbo_bs4_adapter as gumbo_bs4
     xhtml_data = self.bk.readfile(t_mid).encode("utf-8")
     soup = gumbo_bs4.parse(xhtml_data)
     # select elements whose class matches the MO class and whose id
     # matches the configured id regex
     wanted = {
         "class": re.compile(r".*\b" + self.prefs["mo_class"] + r"\b.*"),
         "id": re.compile(r".*\b" + self.prefs["id_regex"] + r"\b.*")
     }
     s_ids = [node.attrs["id"] for node in soup.find_all(attrs=wanted)]
     if not s_ids:
         print("ERROR: no SMIL elements in file '%s'" % (t_href))
         print()
         return None
     s_name = self.smil_name_from_t_href(t_href)
     s_href = os.path.join(self.SMIL_DIRECTORY, s_name)
     mid = self.bk.href_to_id(s_href)
     if mid is not None:
         print("INFO: file '%s' exists, removing it" % (s_href))
         self.bk.deletefile(mid)
     rows = [self.SMIL_HEADER % (t_href)]
     for index, s_id in enumerate(s_ids, start=1):
         rows.append(self.SMIL_ROW %
                     ("%06d" % (index), t_href, s_id, "0.000", "0.000", a_href))
     rows.append(self.SMIL_FOOTER)
     payload = ("\n".join(rows)).encode("utf-8")
     self.bk.addfile(smil_mid,
                     s_name,
                     payload,
                     mime="application/smil+xml",
                     properties=None)
     print("INFO: created file '%s'" % (s_href))
     print()
     return s_href
Example #7
0
 def create_dummy_smil_file(self, t_href, t_mid, a_href, smil_mid):
     """
     This function is not currently used.
     """
     import sigil_gumbo_bs4_adapter as gumbo_bs4
     result = None
     markup = self.bk.readfile(t_mid).encode("utf-8")
     soup = gumbo_bs4.parse(markup)
     # elements carrying both the MO class and an id matching the regex
     selector = {
         "class": re.compile(r".*\b" + self.prefs["mo_class"] + r"\b.*"),
         "id": re.compile(r".*\b" + self.prefs["id_regex"] + r"\b.*")
     }
     smil_ids = [el.attrs["id"] for el in soup.find_all(attrs=selector)]
     if len(smil_ids) == 0:
         print("ERROR: no SMIL elements in file '%s'" % (t_href))
     else:
         s_name = self.smil_name_from_t_href(t_href)
         s_href = os.path.join(self.SMIL_DIRECTORY, s_name)
         existing = self.bk.href_to_id(s_href)
         if existing is not None:
             # replace any previously generated SMIL file
             print("INFO: file '%s' exists, removing it" % (s_href))
             self.bk.deletefile(existing)
         lines = [self.SMIL_HEADER % (t_href)]
         counter = 1
         for smil_id in smil_ids:
             lines.append(self.SMIL_ROW % ("%06d" % (counter), t_href, smil_id, "0.000", "0.000", a_href))
             counter += 1
         lines.append(self.SMIL_FOOTER)
         blob = ("\n".join(lines)).encode("utf-8")
         self.bk.addfile(smil_mid, s_name, blob, mime="application/smil+xml", properties=None)
         print("INFO: created file '%s'" % (s_href))
         result = s_href
     print()
     return result
Example #8
0
    def add_mo_attributes(self, data):
        """
        Add MO attributes to tags in the given XHTML file,
        and return the resulting XHTML string together with log messages.

        :param data: the source code
        :type  data: str
        :rtype: tuple of (list of (str, str) message tuples, str XHTML code)
        """
        import sigil_gumbo_bs4_adapter as gumbo_bs4
        msgs = []
        soup = gumbo_bs4.parse(data)
        # running counter used to build sequential MO ids
        i = 1
        for node in soup.find_all():
            if node.name in self.tags:
                new_id = self.id_format % (i)
                i += 1
                # elements explicitly opted out via the 'nomo' class are skipped
                if self.has_nomo_class(node):
                    msgs.append(("WARN", "element '%s' with class 'nomo' => ignoring (it would be '%s')" % (node.name, new_id)))
                else:
                    add = True
                    if self.existing_ids_only:
                        # only decorate elements that already carry an MO id
                        if self.has_mo_id(node):
                            msgs.append(("INFO", "element '%s' with MO id '%s' => adding class '%s'" % (node.name, node.attrs["id"], self.mo_class)))
                        else:
                            msgs.append(("WARN", "element '%s' without MO id => not adding class '%s'" % (node.name, self.mo_class)))
                            add = False
                    elif self.has_id_not_mo(node):
                        # keep foreign ids untouched; the MO class is still added below
                        msgs.append(("WARN", "element '%s' with id '%s' => not changing (it would be '%s')" % (node.name, node.attrs["id"], new_id)))
                    else:
                        msgs.append(("INFO", "element '%s' => setting id '%s'" % (node.name, new_id)))
                        node.attrs["id"] = new_id
                    if add:
                        self.add_mo_class(node)
        out_data = self.output_xhtml_code(soup)
        return (msgs, out_data)
Example #9
0
def processMainText(bk):
	altReadingCount = 0
	def altReadingReplace(matchobj):
		nonlocal altReadingCount
		altReadingCount += 1
		print('Correcting alternative reading: "%s" | "%s"' % (matchobj.group(1).strip(), matchobj.group(2).strip())) # note: 1 is displayed on top of 2
		return '<span style="white-space: nowrap; position: relative;"><span style="position: absolute; font-size: .8em; top: -15px; left: 50%; white-space: nowrap; letter-spacing: normal; color: inherit; font-weight: inherit; font-style: inherit;"><span style="position: relative; left: -50%;">\1</span></span><span style="display: inline-block; color: inherit; letter-spacing: normal; font-size: 1.0em; font-weight: inherit;">\2</span></span>'.replace('\1', matchobj.group(1).strip()).replace('\2', matchobj.group(2).strip())
	def altReadingReplaceRuby(matchobj):
		nonlocal altReadingCount
		altReadingCount += 1
		print('Converting alternative reading: "%s" | "%s"' % (matchobj.group(1).strip(), matchobj.group(2).strip())) # note: 2 is displayed on top of 1
		return '<span style="white-space: nowrap; position: relative;"><span style="position: absolute; font-size: .8em; top: -15px; left: 50%; white-space: nowrap; letter-spacing: normal; color: inherit; font-weight: inherit; font-style: inherit;"><span style="position: relative; left: -50%;">\2</span></span><span style="display: inline-block; color: inherit; letter-spacing: normal; font-size: 1.0em; font-weight: inherit;">\1</span></span>'.replace('\1', matchobj.group(1).strip()).replace('\2', matchobj.group(2).strip())

	bookTitle = 'Untitled'
	galleryImages = []
	mainText = []
	suggestedFilenames = []
	for (textID, textHref) in bk.text_iter():
		if os.path.split(textHref)[1] in ['Cover.xhtml', 'Section0001.xhtml', 'Illustrations.xhtml']: # main text file is anything but these
			continue
		print('\nProcessing text file: %s' % textHref)
		suggestedFilenames.append('%s[bke_v%s_passed].epub' % (os.path.splitext(os.path.basename(textHref))[0], plugin_version))

		html = bk.readfile(textID) # Read the section into html
		if not isinstance(html, text_type):	# If the section is not str then sets its type to 'utf-8'
			html = text_type(html, 'utf-8')

		plsWriteBack = False

		# unwrap heading from <h1><span id='blabl'>text</span></h1> into <h1 id='blal'>text<h1>.
		# class="mw-headline" part are all removed by ebook converter
		html = re.sub('<h(\\d)><span id="(.+?)">(.+?)</span></h(\\d)>', '<h\\1 id="\\2">\\3</h\\4>', html)

		soup = gumbo_bs4.parse(html)

		# remove lang="en" attribute from <html> tag (FlightCrew complains)
		for htmlTag in soup.find_all('html'):
			if htmlTag.get('lang') != None:
				del htmlTag['lang']
				plsWriteBack = True

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# move up headings if necessary
		headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8']
		lvToMvUp = 0
		for lv in headingLv:
			tags = soup.find_all(lv)
			# print('%s = %r' % (lv, tags))
			if len(tags) == 0:
				lvToMvUp += 1
			else:
				break
		if lvToMvUp > 0:
			print('Moving headings up %d level(s).' % lvToMvUp)
			for i in range(lvToMvUp, len(headingLv)):
				for tag in soup.find_all(headingLv[i]):
					tag.name = headingLv[i-lvToMvUp]
			plsWriteBack = True

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# add Id to headings missing it
		idFixedCount = 0
		for headingTag in soup.find_all(headingLv):
			if not headingTag.get('id'):
				headingTag['id'] = 'id-' + str(uuid.uuid4())
				idFixedCount += 1
		if idFixedCount > 0:
			plsWriteBack = True
			print('Added ID attribute to %d heading(s).' % idFixedCount)

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# convert name attribute into id in <a> tag
		tagsFixedCount = 0
		for anchorTag in soup.find_all(['a']):
			if anchorTag.has_attr('name'):
				anchorTag['id'] = anchorTag['name']
				del anchorTag['name']
				tagsFixedCount += 1
		if tagsFixedCount > 0:
			print('Converted %d `name` attribute into `id` in <a> tag(s).' % tagsFixedCount)
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# originally for correcting multiple T/N sections with identical IDs (all starts from 1) in krytykal source
		# now it corrects ALL duplicated and invalid IDs
		idCorrected = correctDuplicateOrInvalidID(bk, soup)
		if idCorrected > 0:
			print('Corrected %d duplicated/invalid IDs and their corresponding anchors (if any).' % idCorrected)
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# strip all formatings from headings as BTE-GEN does
		headingStrippedCount = 0
		for lv in headingLv:
			for headingTag in soup.find_all(lv):
				if len(headingTag.find_all('img')) == 0 and (len(headingTag.find_all(True)) > 0 or headingTag.get('style')):
					headingTag.string = headingTag.get_text()
					del headingTag['style']
					headingStrippedCount += 1
		if headingStrippedCount > 0:
			plsWriteBack = True
			print('Stripped formatings from %d headings to match BTE-GEN\'s behavior.' % headingStrippedCount)

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# handle the invalid usage of <i> tags in HakoMari vol 2 may 2. This is due to a major error in the source page, but it can't be helped.
		# also stuff here https://baka-tsuki.org/project/index.php?title=User_talk:Dreamer2908
		# ref http://www.w3schools.com/html/html_formatting.asp
		tagsFixedCount = 0
		tag2Css =  {
					'b':'font-weight: bold;',
					'strong':'font-weight: bold;',
					'i':'font-style: italic;',
					'em':'font-style: italic;',
					'big':'font-size: large',
					'small':'font-size: smaller',
					'mark':'background-color: yellow; color: black;',
					's':'text-decoration: line-through;',
					'strike':'text-decoration: line-through;',
					'del':'text-decoration: line-through;',
					'ins':'text-decoration: underline;',
					'sub':'vertical-align: sub; font-size: smaller;',
					'sup':'vertical-align: super; font-size: smaller;',
					'u':'text-decoration: underline;',
					}
		for iTag in soup.find_all(['b', 'strong', 'i', 'em', 'big', 'small', 'mark', 's', 'strike', 'del', 'ins', 'sub', 'sup', 'u']):
			illegalChild = iTag.find_all(['p', 'div', 'table', 'blockquote', 'pre', 'caption', 'dl', 'hr', 'section', 'ul', 'ol'] + headingLv)
			if len(illegalChild) > 0:
				tagsFixedCount += 1
				for child in iTag.children:
					if type(child) == sigil_bs4.element.NavigableString:
						# a lot of unwanted `<p><i> </i></p>` line will be created if you wrap everything without checking
						if str(child).strip() != '':
							wrapper = child.wrap(soup.new_tag(iTag.name))
							wrapper.wrap(soup.new_tag('p'))
					elif child.name == 'p':
						for grandChild in child.children:
							if type(grandChild) == sigil_bs4.element.Tag:
								if grandChild.name == iTag.name:
									grandChild.unwrap() # remove italic from italic text
								else:
									grandChild.wrap(soup.new_tag(iTag.name))
							else:
								grandChild.wrap(soup.new_tag(iTag.name))
					elif child.name not in headingLv: # skip styling headings
						styleAttr = child.get('style')
						if styleAttr:
							child['style'] = tag2Css[iTag.name] + styleAttr
						else:
							child['style'] = tag2Css[iTag.name]
				iTag.unwrap()

		if tagsFixedCount > 0:
			print('Fixed %d range of invalid usage of text formatting tags (i/b/u/etc.)' % tagsFixedCount)
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# wrap phantom (direct decendant of body) <br>/<span>/<a>, text formatting tags and text in <p> (krytykal/skythewood/imoutolicious source)
		phantomWrapped = 0
		removeMe = []
		plsWriteBack = True
		for child in soup.body.contents:
			if type(child) == sigil_bs4.element.NavigableString:
				# a lot of unwanted `<p> </p>` line will be created if you wrap everything without checking
				if str(child).strip() != '':
					child.wrap(soup.new_tag('p'))['class'] = 'baka_epub_phantom_elements'
					phantomWrapped += 1
				else:
					child.replace_with('\n') # eliminate blank phantom texts that aren't newline or true white spaces
			elif type(child) == sigil_bs4.element.Tag:
				if child.name in ['br', 'a']:
					child.wrap(soup.new_tag('p'))['class'] = 'baka_epub_phantom_elements'
					phantomWrapped += 1
				elif child.name in ['span', 'b', 'strong', 'i', 'em', 'big', 'small', 'mark', 's', 'strike', 'del', 'ins', 'sub', 'sup', 'u']:
					# for these, check if they have some contents. remove if no
					if (len(child.get_text().strip()) > 0 or len(child.find_all(True)) > 0):
						child.wrap(soup.new_tag('p'))['class'] = 'baka_epub_phantom_elements'
					else:
						removeMe.append(child)
					phantomWrapped += 1
		if phantomWrapped > 0:
			for element in removeMe:
				element.decompose()
			print('Wrapped %d phantom <br>/<span>/<a>, text formatting tags and texts in <p>.' % phantomWrapped)
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# handle the long deprecated center tags
		tagsFixedCount = 0
		for centerTag in soup.find_all('center'):
			if centerTag.parent.name == 'p':
				styleAttr = centerTag.parent.get('style')
				if styleAttr:
					centerTag.parent['style'] = 'text-align: center; ' + styleAttr
				else:
					centerTag.parent['style'] = 'text-align: center;'
				centerTag.unwrap()
			else:
				centerTag.name = 'div'
				centerTag['style'] = 'text-align: center;'
			tagsFixedCount += 1
		if tagsFixedCount > 0:
			plsWriteBack = True
			print('Converted %d deprecated center tag(s) into a suitable form for ePub.' % tagsFixedCount)

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# handle the deprecated u tags
		tagsFixedCount = 0
		for uTag in soup.find_all('u'):
			uTag.name = 'span'
			uTag['style'] = 'text-decoration: underline;'
			tagsFixedCount += 1
		if tagsFixedCount > 0:
			plsWriteBack = True
			print('Converted %d deprecated u tag(s) into a suitable form for ePub.' % tagsFixedCount)

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# handle the deprecated tag <s> and <strike>. Use <del> instead.
		tagsFixedCount = 0
		for strikeTag in soup.find_all(['s', 'strike']):
			strikeTag.name = 'del'
			tagsFixedCount += 1
		if tagsFixedCount > 0:
			plsWriteBack = True
			print('Converted %d deprecated <s> and <strike> tag(s) into <del> tag(s).' % tagsFixedCount)

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# remove all data-* attributes from tags
		tagsFixedCount = 0
		for buggyTag in soup.find_all(True):
			attrDel = 0
			for attr in list(buggyTag.attrs.keys()):
				if attr.startswith('data-'):
					del buggyTag[attr]
					attrDel += 1
				elif attr == 'itemprop':
					del buggyTag[attr]
					attrDel += 1
				elif attr == 'target':
					del buggyTag[attr]
					attrDel += 1
			if attrDel > 0:
				tagsFixedCount += 1
		if tagsFixedCount > 0:
			plsWriteBack = True
			print('Removed itemprop/data-*/target attribute(s) from %d tag(s).' % tagsFixedCount)
		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# handle align attribute in p, div, span
		tagsFixedCount = 0
		for pdivspanTag in soup.find_all(['p', 'div', 'span', 'caption', 'img', 'table'] + headingLv):
			alignAttr = pdivspanTag.get('align')
			if alignAttr != None:
				styleAttr = pdivspanTag.get('style')
				if styleAttr:
					pdivspanTag['style'] = 'text-align: %s; ' % alignAttr + styleAttr
				else:
					pdivspanTag['style'] = 'text-align: %s;' % alignAttr
				del pdivspanTag['align']
				tagsFixedCount += 1
		if tagsFixedCount > 0:
			print('Converted align attribute in %d p/div/span tag(s) into css style.' % tagsFixedCount)
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# remove  align/noshade/size/width attributes from <hr> tags
		tagsFixedCount = 0
		for buggyTag in soup.find_all('hr'):
			attrDel = 0
			for attr in list(buggyTag.attrs.keys()):
				if attr in ['align', 'noshade', 'size', 'width']:
					del buggyTag[attr]
					attrDel += 1
			if attrDel > 0:
				tagsFixedCount += 1
		if tagsFixedCount > 0:
			print('Removed all deprecated attributes from %d <hr> tag(s).' % tagsFixedCount)
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# remove all but global attribute from br tag
		# event attributes are allowed, but there's no point in such attributes in epub
		globalAttributes = ['accesskey', 'class', 'contenteditable', 'contextmenu', 'dir', 'draggable', 'dropzone', 'hidden', 'id', 'lang', 'spellcheck', 'style', 'tabindex', 'title', 'translate']
		tagsFixedCount = 0
		for buggyTag in soup.find_all('br'):
			attrDel = 0
			for attr in list(buggyTag.attrs.keys()):
				if attr not in globalAttributes:
					del buggyTag[attr]
					attrDel += 1
			if attrDel > 0:
				tagsFixedCount += 1
		if tagsFixedCount > 0:
			print('Removed all invalid attributes from %d <br> tag(s).' % tagsFixedCount)
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# apply that certain customization to Baka-Tsuki's alternative reading style
		altReadingCustomized = 0
		for spanTag in soup.find_all('span'):
			styleAttr = spanTag.get('style')
			if (styleAttr and (styleAttr.replace(' ', '').startswith("position: absolute; font-size: .8em; top: -11px;".replace(' ', '')))):
				spanTag['style'] = styleAttr.replace('-11px', '-15px')
				altReadingCustomized += 1
		if altReadingCustomized > 0:
			plsWriteBack = True
			print('Customized Baka-Tsuki\'s style in %d alternative reading(s).' % altReadingCustomized)

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# remove all "Status: Incomplete" messages
		# signatures:
		# + <div style="width:25%; border:10px solid white; clear:both; float:right; text-align:center;">
		# + <b>Status: Incomplete</b>
		# + <div style="clear:both; {{#ifeq: yes | yes | margin:auto; text-align:center;">
		removeMe = []
		for divTag in soup.find_all('div'):
			hasWidth25percent = False
			hasStatusIncompleteMsg = False
			hasFaultyCssStyle = False

			styleAttr = divTag.get('style')
			if styleAttr and ('width:25%;' in re.sub('\s', '', styleAttr)):
				hasWidth25percent = True

			bTags = divTag.find_all('b')
			subDivTags = divTag.find_all('div')
			for bTag in bTags:
				if bTag.get_text().strip() == 'Status: Incomplete':
					hasStatusIncompleteMsg = True
					break

			for subDivTag in subDivTags:
				styleAttr = subDivTag.get('style')
				if (styleAttr and ('{{#ifeq: yes | yes | margin:auto;' in styleAttr)):
					hasFaultyCssStyle = True
					break
			if hasWidth25percent and hasStatusIncompleteMsg and hasFaultyCssStyle:
				removeMe.append(divTag)

		if len(removeMe) > 0:
			plsWriteBack = True
			for garbage in removeMe:
				# print(garbage)
				garbage.decompose()
			print('Removed %d "Status: Incomplete" message(s).' % len(removeMe))

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# fix the invalid css code in the "Status: Incomplete" message
		invalidCssCodeFixed = 0
		for divTag in soup.find_all('div'):
			styleAttr = divTag.get('style')
			if (styleAttr and ('{{#ifeq: yes | yes | margin:auto;' in styleAttr)):
				divTag['style'] = styleAttr.replace('{{#ifeq: yes | yes | margin:auto;', '/*! {{#ifeq: yes | yes | margin:auto; */')
				invalidCssCodeFixed += 1
		if invalidCssCodeFixed > 0:
			plsWriteBack = True
			print('Removed invalid CSS code in %d "Status: Incomplete" message(s).' % invalidCssCodeFixed)

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# remove the navigator at the end. How to detect: the last table, containing all baka-tsuki.org links. An automatic and simple navigator should contain only a single table. A customized navigator might contain several nested tables. Kill the biggest one together with everything inside.
		allTables = soup.find_all('table')
		if len(allTables) > 0:
			tableTag = allTables[-1]
			if tableTag:
				for tmpTag in tableTag.parents: # reach the highest level of table
					if tmpTag is not None and tmpTag.name == 'table':
						tableTag = tmpTag
				# print(tableTag)
				allATag = tableTag.find_all('a')
				if len(allATag) > 0: # table with no link doesn't count
					allBtLink = True
					for aTag in allATag:
						href = aTag.get('href')
						# print(href)
						if (href is not None) and ('baka-tsuki.org' not in href) and (not href.startswith('javascript:')): # href can be js link to collapse/expand
							allBtLink = False
				else:
					allBtLink = False
				if allBtLink:
					print('Removed the unwanted navigator (table of links to main page and other volumes) at the end of main text.')
					tableTag.decompose()

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# search for gallery images first
		for imgTag in soup.find_all('img'):
			imgSrc = urllib.parse.unquote(imgTag.get('src'))
			imgAlt = imgTag.get('alt')
			imgName = os.path.split(imgSrc)[1]

			if imgAlt and imgAlt.startswith('__galleryimage__'):
				# print('Found gallery image: %s' % imgName)
				imgInGallery = [ _[0] for _ in galleryImages ]
				if imgSrc not in imgInGallery:
					galleryImages.append((imgSrc, imgAlt))
				# still remove it from the text even if it's a duplicate
				outerTag = imgTag.parent
				imgTag.decompose()
				if len(outerTag.contents) == 0:
					outerTag.decompose()
		if len(galleryImages) > 0:
			plsWriteBack = True
			print('Found %d gallery images: %r.' % (len(galleryImages), [ _[0] for _ in galleryImages ]))
			plsRemoveEverythingAboveGallery = True
			for divTag in soup.find_all('div'): # eliminate gallary div and everything before it
				divID = divTag.get('id') # note that there can be multiple galleries
				if divID != None and divID.startswith('__gallery__'):
					if plsRemoveEverythingAboveGallery:
						aboveTheGallery = divTag.find_previous_siblings()
						if (len(aboveTheGallery) < 6):
							for tmpTag in aboveTheGallery:
								tmpTag.decompose()
							print('Cleaned stuff above gallery #%s.' % divID)
						else:
							print('Too much stuff above gallery #%s. Not gonna clean. Contents even before the gallery?' % divID)
					divTag.decompose()

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# wrapping img in svg
		imgWrappedInSvg = 0
		outOfGalleryImages = []
		print('Processing images in body text...')
		for imgTag in soup.find_all('img'):
			imgSrc = urllib.parse.unquote(imgTag.get('src'))
			imgWidth = imgTag.get('width')
			imgHeight = imgTag.get('height')
			imgName = os.path.split(imgSrc)[1]

			# remove the img from gallery if it's used in the body
			for tmp in galleryImages:
				tmpsrc, tmpalt = tmp
				if tmpsrc == imgSrc:
					outOfGalleryImages.append(tmp)
					galleryImages.remove(tmp)

			if imgTag.parent.name in headingLv:
				print('Skipped processing heading image: %s' % imgName)
				continue

			print('Processing image: %s' % imgName)

			if imgSrc.startswith('../'): imgSrc = imgSrc[3:]
			imgID = bk.href_to_id(imgSrc)

			if imgID: # image file exists
				svgNode = gumbo_bs4.parse(getSvgForImage(bk, imgID, dispWidth=imgWidth, dispHeight=imgHeight))
				# Deal with anchor wrapping around the original img tag
				# usually <p><a href="http://somewhere.com"><img src='blabla.jpg' alt='nothing' /></a></p>
				# copy <a> to inside <div>, outside <img> or <svg>. put svgNode outside <a> (and <p> if any)
				# if <a> contains nothing but the image, kill the original <a>
				if imgTag.parent.name == 'a':
					anchorTag = imgTag.parent
					targetHref = anchorTag.get('href')
					if targetHref:
						newATag = soup.new_tag('a')
						newATag['href'] = targetHref
						for tmpTag in svgNode.find_all(['svg', 'img']):
							tmpTag.wrap(newATag)
					imgTag.parent.insert_before(imgTag)
					if len(anchorTag.contents) == 0 or (len(anchorTag.contents) == 1 and str(anchorTag.contents[0]).strip() == ''):
						anchorTag.decompose()

				# if the parent tag is p, insert svgNode before p and delete img. svg is not allowed inside p or span.
				if imgTag.parent.name == 'p':
					imgTag.parent.insert_before(svgNode)
					outerTag = imgTag.parent
					imgTag.decompose()
					if len(outerTag.contents) == 0:
						outerTag.decompose()
				elif imgTag.parent.name == 'div' or imgTag.parent.name == 'body':
					imgTag.replace_with(svgNode)
				# sometimes img tag is wrapped inside more tag than one p, like b in Heavy Object V11C3P12
				# climb the tree until a usable place is found: directly under <body> or <div>, have <div> or <p> or <a> as siblings.
				# Insert svgNode before it. Decompose the branch if it's worthless
				else:
					topBranch = imgTag
					while not (topBranch.parent.name in ['div', 'body'] or len(topBranch.find_next_siblings(['div', 'p', 'a']) + topBranch.find_previous_siblings(['div', 'p'])) > 0):
						topBranch = topBranch.parent
					topBranch.insert_before(svgNode)
					outerTag = imgTag.parent
					imgTag.decompose()
					if len(outerTag.contents) == 0:
						outerTag.decompose()

				imgWrappedInSvg += 1
			else:
				print('Error: image file not found.')
		if imgWrappedInSvg > 0:
			plsWriteBack = True
			print('Wrapped %d images in SVG.' % imgWrappedInSvg)
		if len(outOfGalleryImages) > 0:
			plsWriteBack = True
			print('Removed %d images from the gallery because they\'re used in the body text: %r' % (len(outOfGalleryImages), [ _[0] for _ in outOfGalleryImages ]))

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# re-add attributes removed by BeautifulSoup for no reason
		errorsByBsCorrected = 0
		for svgTag in soup.find_all('svg'):
			if 'xmlns' not in svgTag or 'xmlns:xlink' not in svgTag:
				errorsByBsCorrected += 1
				svgTag['xmlns'] = "http://www.w3.org/2000/svg"
				svgTag['xmlns:xlink'] = "http://www.w3.org/1999/xlink"
				for imageTag in svgTag.find_all('image'):
					try:
						imageTag['xlink:href'] = imageTag['href']
						del imageTag['href']
					except:
						pass
		if errorsByBsCorrected > 0:
			plsWriteBack = True
			print('Corrected %d errors introduced by BeautifulSoup in svg/image tag.' % errorsByBsCorrected)

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# Clean up blank paragraphs next to headings and images.
		blankParagraphsToClean = []
		for lv in headingLv:
			for headingTag in soup.find_all(lv):
				for paragraph in headingTag.find_next_siblings('p'):
					if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
						blankParagraphsToClean.append(paragraph)
					else: break
				for paragraph in headingTag.find_previous_siblings('p'):
					if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
						blankParagraphsToClean.append(paragraph)
					else: break
		for imgTag in soup.find_all('img'):
			if imgTag.parent.name == 'p':
				for paragraph in imgTag.parent.find_next_siblings('p'):
					if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
						blankParagraphsToClean.append(paragraph)
					else: break
				for paragraph in imgTag.parent.find_previous_siblings('p'):
					if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
						blankParagraphsToClean.append(paragraph)
					else: break
		for divTag in soup.find_all('div'):
				for paragraph in divTag.find_next_siblings('p'):
					if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
						blankParagraphsToClean.append(paragraph)
					else: break
				for paragraph in divTag.find_previous_siblings('p'):
					if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
						blankParagraphsToClean.append(paragraph)
					else: break
				if len(divTag.contents) == 0:
					blankParagraphsToClean.append(divTag)
		for endTag in soup.body.contents[::-1]:
			if type(endTag) == sigil_bs4.element.Tag:
				if endTag.name == 'p' and endTag.get_text().strip() == '' and len(endTag.find_all('img')) == 0:
					blankParagraphsToClean.append(endTag)
				else: break
		for startTag in soup.body.contents:
			if type(startTag) == sigil_bs4.element.Tag:
				if startTag.name == 'p' and startTag.get_text().strip() == '' and len(startTag.find_all('img')) == 0:
					blankParagraphsToClean.append(startTag)
				else: break
		if len(blankParagraphsToClean) > 0:
			# print(blankParagraphsToClean)
			blankParagraphsToClean = removeDuplicateBs4Object(blankParagraphsToClean)
			for paragraph in blankParagraphsToClean:
				paragraph.decompose()
			print('Cleaned %d blank paragraphs next to headings and images.' % len(blankParagraphsToClean))
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# remove trash in head
		for styleTag in soup.find_all(['style', 'script', 'link', 'iframe']):
			styleTag.decompose()
			plsWriteBack = True
		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False
			print('Removed embedded style/script/iframe garbages.')

		for metaTag in soup.head.find_all('meta'):
			if (metaTag.get('charset') != None):
				print('Removing meta charset in head.')
				metaTag.decompose()
				plsWriteBack = True

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# link stylesheets
		cssList = ['../Styles/page_styles.css', '../Styles/stylesheet.css']
		for linkTag in soup.head.find_all('link'):
			if (linkTag.get('rel') == 'stylesheet'):
				href = linkTag.get('href')
				if (href in cssList):
					cssList.remove(href)
					print('Stylesheet %s already linked.' % href)
		for css in cssList:
			cssLinkTag = soup.new_tag('link', href=css, rel="stylesheet", type="text/css")
			soup.head.append(cssLinkTag)
			print('Linked stylesheet %s.' % css)
			plsWriteBack = True

		if plsWriteBack:
			html = soup.serialize_xhtml()
			soup = gumbo_bs4.parse(html)
			plsWriteBack = False

		# Sigil's prettifying function tends to add needless spaces in the midle of text - tag border
		# if the html has been already prettified by BeautifulSoup
		# It's better to not prettify it here
		html = soup.serialize_xhtml()

		# handle alternative readings which have been stripped by the ebook convert script
		html = re.sub('<span>\s*?<span>\s*?<span>(.*?)</span>\s*?</span>\s*?<span>(.*?)</span>\s*?</span>', altReadingReplace, html, flags=re.DOTALL)
		if altReadingCount > 0:
			print('Corrected %d alternative readings.' % altReadingCount)
			plsWriteBack = True

		# convert ruby tags in yukkuri-literature-service into baka-tsuki-like alternative reading
		# <ruby>Court Magician<rp>(</rp><rt>Civil Servant</rt><rp>)</rp></ruby>
		altReadingCount = 0
		html = re.sub('<ruby>(.*?)\s*<rp>\s*\(\s*</rp>\s*<rt>(.*?)</rt>\s*<rp>\s*\)\s*</rp>\s*</ruby>', altReadingReplaceRuby, html, flags=re.DOTALL)
		if altReadingCount > 0:
			print('Converted %d ruby furigana into Baka-Tsuki-like alternative reading(s).' % altReadingCount)

		mainText.append(html)

		if soup.title.string:
			bookTitle = soup.title.string.strip()

	print(' ')
	return mainText, galleryImages, bookTitle, suggestedFilenames
Example #10
0
def generateToC(bk, bookTitle, BookId):
	"""
	Build and write the NCX table of contents for the epub.

	Scans every text file for heading tags (h1..h8) that carry an id,
	creates one navPoint per heading, and nests the entries according to
	their heading levels. If no heading is found at all, a single 'Start'
	entry pointing at the first text file is created instead. The measured
	nesting depth is written into the dtb:depth meta element.

	:param bk: Sigil BookContainer used to read text files and write the toc
	:param bookTitle: book title placed in <docTitle>
	:param BookId: unique identifier placed in the dtb:uid meta element
	"""
	print('Generating Table of Contents.')

	def createNavPointTag(tocSoup, navPointID, playOrder, entryLabel, entrySrc, entryLevel):
		# Build one <navPoint>. The extra <level> tag only records the
		# heading level while the tree is built; it is stripped before writing.
		navPointTag = tocSoup.new_tag('navPoint')
		navPointTag['id'] = navPointID
		navPointTag['playOrder'] = playOrder

		textTag = tocSoup.new_tag('text')
		textTag.string = entryLabel

		navLabelTag = tocSoup.new_tag('navLabel')
		navLabelTag.append(textTag)

		contentTag = tocSoup.new_tag('content')
		contentTag['src'] = entrySrc

		levelTag = tocSoup.new_tag('level')
		levelTag.string = entryLevel

		navPointTag.append(navLabelTag)
		navPointTag.append(contentTag)
		navPointTag.append(levelTag)

		return navPointTag

	tocXml = '<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">  <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1"> <head> <meta name="dtb:uid" content="%s"/> <meta name="dtb:depth" content="2"/> <meta name="dtb:totalPageCount" content="0"/> <meta name="dtb:maxPageNumber" content="0"/> </head> <docTitle> <text>%s</text> </docTitle> <navMap> </navMap> </ncx>' % (BookId, bookTitle)
	tocSoup = sigil_bs4.BeautifulSoup(tocXml, 'xml')
	navMap = tocSoup.find('navMap')
	navID = 0

	headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8']
	headingLvN = {'h1': 1, 'h2': 2, 'h3': 3, 'h4': 4, 'h5': 5, 'h6': 6, 'h7': 7, 'h8': 8}
	lastTocEntry = None
	for textID, textHref in bk.text_iter():
		html = bk.readfile(textID)  # read the section source
		if not isinstance(html, text_type):  # decode bytes as utf-8 if needed
			html = text_type(html, 'utf-8')
		soup = gumbo_bs4.parse(html)

		entryInThisFile = 0
		for headingTag in soup.find_all(headingLv):
			# all heading in body text files should have been given their id.
			# If one doesn't have its id, it's not in body text, so just ignore it.
			# Don't mind text files that don't have any entry. It's not an issue (Sigil's behavior)
			if headingTag.get('id'):
				entryLabel = headingTag.get_text()
				if entryInThisFile == 0:
					# first entry in the file should point to the beginning of the file (Sigil's behavior)
					entrySrc = textHref
				else:
					entrySrc = textHref + '#' + headingTag.get('id')
				entryLevel = headingTag.name
				entryLevelN = headingLvN[entryLevel]

				navID += 1
				navPointID = 'navPoint-%d' % navID
				playOrder = navID
				navPointTag = createNavPointTag(tocSoup, navPointID, playOrder, entryLabel, entrySrc, entryLevel)

				if not lastTocEntry:
					# first entry goes directly under navMap
					navMap.append(navPointTag)
				else:
					# climb the tree until you find a nav of higher level, or reach navMap
					parentCandidate = lastTocEntry
					parentCandidate_levelN = headingLvN[parentCandidate.find('level').string]
					while (parentCandidate.name != 'navMap' and (entryLevelN <= parentCandidate_levelN)):
						parentCandidate = parentCandidate.parent
						try:
							parentCandidate_levelN = headingLvN[parentCandidate.find('level').string]
						except (AttributeError, KeyError):
							# AttributeError: no <level> descendant (find() -> None);
							# KeyError: level string not in headingLvN.
							# Either way, treat as top level so the climb stops.
							parentCandidate_levelN = 0
					parentCandidate.append(navPointTag)

				lastTocEntry = navPointTag
				entryInThisFile += 1

	# if no heading found, add a Start entry pointing to the first text file
	if navID == 0:
		for textID, textHref in bk.text_iter():
			navID = 1
			navPointTag = createNavPointTag(tocSoup, 'navPoint-%d' % navID, navID, 'Start', textHref, 'h1')
			navMap.append(navPointTag)
			break

	# remove all level tags. They were only needed while building the tree;
	# they must not appear in the final toc.
	for levelTag in navMap.find_all('level'):
		levelTag.decompose()

	# measure the deepest navPoint nesting
	tocDepth = 0
	for navPointTag in navMap.find_all('navPoint'):
		thisDepth = 1
		parent = navPointTag.parent
		while parent.name != 'navMap':
			thisDepth += 1
			parent = parent.parent
		if thisDepth > tocDepth:
			tocDepth = thisDepth
	# record the measured depth in the dtb:depth meta element
	for metaTag in tocSoup.find_all('meta'):
		if metaTag.get('name') == "dtb:depth":
			metaTag['content'] = str(tocDepth)

	bk.writefile(bk.gettocid(), tocSoup.prettify())
Example #11
0
def parse_xhtml(bk, cssparser: CSSParser, css_collector: CSSAttributes,
                prefs: MutableMapping) -> XHTMLAttributes:
    """
    Parse all the xhtml files in the epub and gather classes, ids
    and fragment identifiers. Also, gather css classes and ids
    from <style> elements.
    """
    collected = XHTMLAttributes()
    fragid_container_attrs = prefs[
        'fragid_container_attrs'] or collected.fragid_container_attrs
    for xhtml_id, xhtml_href in bk.text_iter():
        filename = utils.href_to_basename(xhtml_href)
        try:
            soup = gumbo_bs4.parse(bk.readfile(xhtml_id))
        except Exception as E:
            raise XMLParsingError('Error in {}: {}'.format(filename, E))
        # files outside the user's selection contribute fragment ids only
        gather_only_fragid = bool(
            prefs['parse_only_selected_files']
            and xhtml_href not in prefs['selected_files'])

        for elem in soup.find_all(True):
            # fragment identifiers may live in several attributes (href, ...)
            for attr in fragid_container_attrs:
                fragid = get_fragid(elem, attr)
                if fragid:
                    collected.fragment_identifier.add(fragid)
            if gather_only_fragid:
                continue

            # <style> elements: hand their css text to the css parser
            if elem.name == 'style' and elem.contents:
                cssparser.parse_style(elem.contents[0], css_collector,
                                      filename)

            # count occurrences of each id value, per file
            id_ = elem.get('id')
            if id_ is not None:
                if id_ not in collected.id_values:
                    collected.id_values.add(id_)
                    collected.info_id_values[id_] = {}
                per_file = collected.info_id_values[id_]
                per_file[xhtml_href] = per_file.get(xhtml_href, 0) + 1

            # count occurrences of each class name, per file
            classes = elem.get('class', [])
            if isinstance(classes, str):
                classes = [classes]
            for class_ in classes:
                if class_ not in collected.class_names:
                    collected.class_names.add(class_)
                    collected.info_class_names[class_] = {}
                per_file = collected.info_class_names[class_]
                per_file[xhtml_href] = per_file.get(xhtml_href, 0) + 1
            if classes:
                # also remember the literal textual value of the class attribute
                match = re.search(r'class=([\'"])(.+?)\1', str(elem))
                if match is not None:
                    collected.literal_class_values.add(match.group(2))
    collected.class_names.discard('')
    collected.literal_class_values.discard('')
    return collected
Example #12
0
        def cleanUpForTruyenFull():
            """Clean a chapter scraped from truyenfull: keep only the first
            heading and the <div class="chapter-c"> body text, drop invisible
            1px watermark tags, and re-wrap loose inline content into <p>
            elements. Mutates and rebinds the enclosing `soup`."""
            nonlocal soup

            headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            headingTags = soup.body.find_all(headingLv)
            divTags = soup.body.find_all(
                "div", "chapter-c"
            )  # WebToEpub note: you need <div class="col-xs-12">

            if len(divTags) > 0:
                # detach the chapter text so everything else can be discarded
                textNode = divTags[0].extract()

                headerNode = None
                if len(headingTags) > 0:
                    # keep only the first heading, stripped of styling/markup
                    headerNode = headingTags[0].extract()
                    del headerNode["class"]
                    del headerNode["style"]
                    headerNode.string = headerNode.get_text().strip()

                # empty the body, then re-add just the heading + chapter text
                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                for node in soup.body.contents:
                    node.extract()

                if headerNode:
                    soup.body.append(headerNode)
                soup.body.append(textNode)

                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                # print(len(soup.body.find_all(['a', 'span', 'p'], { 'style':"color:white;font-size:1px;"})))
                # print((soup.body.find_all(lambda tag:tag.has_attr('style') and 'font-size:1px' in tag['style'])))
                # print(len(soup.body.find_all(['a', 'span', 'p'])))
                # drop invisible 1px watermark/spam tags
                for node in soup.body.find_all(lambda tag: tag.has_attr(
                        'style') and 'font-size:1px' in tag['style']):
                    node.decompose()

                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                textNode = soup.body.find("div", "chapter-c")
                # unwrapping the div in a preferrable way
                newTextNode = soup.new_tag('div')
                newTextNode['class'] = "chapter-c"
                # removeMe = []
                previousP = None
                for child in textNode.contents:
                    if type(child) == sigil_bs4.element.NavigableString:
                        # a lot of unwanted `<p> </p>` line will be created if you wrap everything without checking
                        if str(child).strip() != '':
                            if previousP:
                                previousP.append(copy.copy(child))
                            else:
                                child = copy.copy(child)
                                newTextNode.append(child)
                                previousP = child.wrap(soup.new_tag('p'))
                        else:
                            newTextNode.append(
                                copy.copy(child))  # yes, copy even blank space
                    elif type(child) == sigil_bs4.element.Tag:
                        if child.name == 'br':
                            # a <br/> ends the paragraph being accumulated
                            previousP = None
                        elif child.name not in tagsNotAllowedInP:
                            # for these, check if they have some contents. skip copying if no
                            if (len(child.get_text().strip()) > 0
                                    or len(child.find_all(True)) > 0
                                ) or child.has_attr('id') or child.has_attr(
                                    'name'):
                                if previousP:
                                    previousP.append(copy.copy(child))
                                else:
                                    child = copy.copy(child)
                                    newTextNode.append(child)
                                    previousP = child.wrap(soup.new_tag('p'))
                        else:
                            # stuff not allowed in <p>
                            child = copy.copy(child)
                            newTextNode.append(child)
                textNode.replace_with(newTextNode)

                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                textNode = soup.body.find("div", "chapter-c")
                # finally drop the wrapper div, leaving the new <p> elements
                textNode.unwrap()
Example #13
0
 def reloadSoup():
     """Round-trip the current soup through serialize/parse, keeping
     `textContents` and `soup` in sync and normalizing the tree."""
     nonlocal soup, textContents
     if soup:
         textContents = soup.serialize_xhtml()
     soup = gumbo_bs4.parse(textContents)
Example #14
0
def run(bk):
    # get python plugin path
    global plugin_path
    plugin_path = os.path.join(bk._w.plugin_dir, plugin_name)

    for (textID, textHref) in bk.text_iter():
        if os.path.split(textHref)[1] in [
                'Cover.xhtml', 'cover.xhtml', 'titlepage.xhtml',
                'Section0001.xhtml', 'Illustrations.xhtml'
        ]:  # main text file is anything but these
            continue
        print('\nProcessing text file: %s' % textHref)

        textContents = bk.readfile(
            textID)  # Read the section into textContents
        if not isinstance(
                textContents, text_type
        ):  # If the section is not str then sets its type to 'utf-8'
            textContents = text_type(textContents, 'utf-8')

        soup = gumbo_bs4.parse(textContents)

        def reloadSoup():
            """Round-trip the current soup through serialize/parse, keeping
            `textContents` and `soup` in sync and normalizing the tree."""
            nonlocal soup, textContents
            if soup:
                textContents = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(textContents)

        def cleanUpForWordpress():
            nonlocal soup
            # add cleanups for wordpress-based epub
            # - get <h1> from <header class="entry-header">blablah</header> inside <body>
            headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            headerNode = soup.body.find("header")
            if headerNode:
                headingTags = headerNode.find_all(headingLv)
                if len(headingTags) > 0:
                    del headingTags[0]["class"]
                    del headingTags[0]["style"]
                    headerNode.replace_with(headingTags[0])
                    # clean <body> too
                    del soup.body['class']
                    del soup.body['style']
            # - unwrap <div class="entry-content">
            # - kill <div class="entry-meta">
            divClassUnwrapMe = [
                "entry-content", "entry-the-content", "post-entry"
            ]
            divClassRemoveMe = [
                "entry-meta", "screen-reader-text", "sharedaddy", "wc-comment",
                "wc-blog-", "comments"
            ]
            deleteMe = []
            for node in soup.body.find_all('div'):
                if node.has_attr('class'):
                    if stringContainsAny(node.get('class'), divClassUnwrapMe):
                        node.unwrap()
                    elif stringContainsAny(node.get('class'),
                                           divClassRemoveMe):
                        # node.decompose()
                        deleteMe.append(node)
            for node in deleteMe:
                node.decompose()
            # - delete <footer>
            for node in soup.find_all(['footer']):
                node.decompose()

            reloadSoup()

        def cleanUpForTruyenFull():
            """Clean a chapter scraped from truyenfull: keep only the first
            heading and the <div class="chapter-c"> body text, drop invisible
            1px watermark tags, and re-wrap loose inline content into <p>
            elements. Mutates and rebinds the enclosing `soup`."""
            nonlocal soup

            headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            headingTags = soup.body.find_all(headingLv)
            divTags = soup.body.find_all(
                "div", "chapter-c"
            )  # WebToEpub note: you need <div class="col-xs-12">

            if len(divTags) > 0:
                # detach the chapter text so everything else can be discarded
                textNode = divTags[0].extract()

                headerNode = None
                if len(headingTags) > 0:
                    # keep only the first heading, stripped of styling/markup
                    headerNode = headingTags[0].extract()
                    del headerNode["class"]
                    del headerNode["style"]
                    headerNode.string = headerNode.get_text().strip()

                # empty the body, then re-add just the heading + chapter text
                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                for node in soup.body.contents:
                    node.extract()

                if headerNode:
                    soup.body.append(headerNode)
                soup.body.append(textNode)

                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                # print(len(soup.body.find_all(['a', 'span', 'p'], { 'style':"color:white;font-size:1px;"})))
                # print((soup.body.find_all(lambda tag:tag.has_attr('style') and 'font-size:1px' in tag['style'])))
                # print(len(soup.body.find_all(['a', 'span', 'p'])))
                # drop invisible 1px watermark/spam tags
                for node in soup.body.find_all(lambda tag: tag.has_attr(
                        'style') and 'font-size:1px' in tag['style']):
                    node.decompose()

                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                textNode = soup.body.find("div", "chapter-c")
                # unwrapping the div in a preferrable way
                newTextNode = soup.new_tag('div')
                newTextNode['class'] = "chapter-c"
                # removeMe = []
                previousP = None
                for child in textNode.contents:
                    if type(child) == sigil_bs4.element.NavigableString:
                        # a lot of unwanted `<p> </p>` line will be created if you wrap everything without checking
                        if str(child).strip() != '':
                            if previousP:
                                previousP.append(copy.copy(child))
                            else:
                                child = copy.copy(child)
                                newTextNode.append(child)
                                previousP = child.wrap(soup.new_tag('p'))
                        else:
                            newTextNode.append(
                                copy.copy(child))  # yes, copy even blank space
                    elif type(child) == sigil_bs4.element.Tag:
                        if child.name == 'br':
                            # a <br/> ends the paragraph being accumulated
                            previousP = None
                        elif child.name not in tagsNotAllowedInP:
                            # for these, check if they have some contents. skip copying if no
                            if (len(child.get_text().strip()) > 0
                                    or len(child.find_all(True)) > 0
                                ) or child.has_attr('id') or child.has_attr(
                                    'name'):
                                if previousP:
                                    previousP.append(copy.copy(child))
                                else:
                                    child = copy.copy(child)
                                    newTextNode.append(child)
                                    previousP = child.wrap(soup.new_tag('p'))
                        else:
                            # stuff not allowed in <p>
                            child = copy.copy(child)
                            newTextNode.append(child)
                textNode.replace_with(newTextNode)

                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                textNode = soup.body.find("div", "chapter-c")
                # finally drop the wrapper div, leaving the new <p> elements
                textNode.unwrap()

        def splitNodesIntoP(pNodes):
            """Split each node in `pNodes` at its <br/> boundaries, wrapping
            the resulting runs of inline content into separate <p> tags.
            Images get their own <div class="svg_outer svg_inner"> wrapper.
            Mutates and rebinds the enclosing `soup`."""
            nonlocal soup

            # level 1:
            #    try to split <p>line 1<br/><br/>line 2<img alt='' src='image.jpg'/>line3</p>
            #    into <p>line 1</p> <p>line 2</p> <p><img alt='' src='image.jpg'/></p> <p>line3</p>
            #    Remember to copy style and class. ID goes to the first p
            # level 2:
            #    into <p>line 1</p> <p></p> <p>line 2</p> <p><img alt='' src='image.jpg'/></p> <p>line3</p>
            # level x:
            #    try to handle <br/> nested inside something else like <p>line 1 <i>italic text<br/>line 2 in italic</i></p>
            #    into <p>line 1 <i>italic text</i></p> <p><i>line 2 in italic</i></p>
            # current at level 2, but empty lines are removed at later stage so it doesn't even matter
            # TODO: copy style, class, id
            unwrapUsID = []
            for textNode in pNodes:
                # for now, we put all new p in a container (and unwrap it later)
                newTextNode = soup.new_tag('div')
                newTextNode_id = 'id-' + str(uuid.uuid4())
                newTextNode['id'] = newTextNode_id
                unwrapUsID.append(newTextNode_id)
                # removeMe = []
                previousP = None
                lastChildWasBr = False
                for child in textNode.contents:
                    if type(child) == sigil_bs4.element.NavigableString:
                        lastChildWasBr = False
                        # a lot of unwanted `<p> </p>` line will be created if you wrap everything without checking
                        if str(child).strip() != '':
                            if previousP:
                                previousP.append(copy.copy(child))
                            else:
                                child = copy.copy(child)
                                newTextNode.append(child)
                                previousP = child.wrap(soup.new_tag('p'))
                        else:
                            newTextNode.append(
                                copy.copy(child))  # yes, copy even blank space

                    elif type(child) == sigil_bs4.element.Tag:
                        if child.name == 'br':
                            # a double <br/> produces an (intentionally) empty <p>
                            if lastChildWasBr:
                                newTextNode.append(soup.new_tag('p'))
                            lastChildWasBr = True
                            previousP = None

                        elif child.name == 'img':
                            # images go into their own wrapper div, not into a <p>
                            child = copy.copy(child)
                            newTextNode.append(child)
                            tmpNode = child.wrap(soup.new_tag('div'))
                            tmpNode['class'] = "svg_outer svg_inner"
                            lastChildWasBr = False
                            previousP = None

                        elif child.name not in tagsNotAllowedInP:
                            lastChildWasBr = False
                            # for these, check if they have some contents. skip copying if no
                            if (len(child.get_text().strip()) > 0
                                    or len(child.find_all(True)) > 0
                                ) or child.has_attr('id') or child.has_attr(
                                    'name'):
                                if previousP:
                                    previousP.append(copy.copy(child))
                                else:
                                    child = copy.copy(child)
                                    newTextNode.append(child)
                                    previousP = child.wrap(soup.new_tag('p'))
                        else:
                            lastChildWasBr = False
                            # stuff not allowed in <p>
                            child = copy.copy(child)
                            newTextNode.append(child)
                textNode.replace_with(newTextNode)

            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)

            # unwrap the temporary container divs created above
            for node in soup.body.find_all('div'):
                if node.get('id') in unwrapUsID:
                    node.unwrap()

            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)

        def splitTagtoP(wantedTag):
            nonlocal soup
            pNodes = soup.body.find_all(wantedTag)
            splitNodesIntoP(pNodes)

        def splitPtoP():
            # Convenience wrapper: split every <p> in the body at its <br/> breaks.
            nonlocal soup
            splitTagtoP("p")

        def easyClean1():
            nonlocal soup

            plsWriteBack = False

            # delete all these nodes
            for node in soup.find_all(['style', 'meta', 'input', 'button']):
                node.decompose()
                plsWriteBack = True

            # unwrap all these nodes
            for node in soup.find_all(['font']):
                node.unwrap()
                plsWriteBack = True

            # convert name attribute into id in <a> tag
            tagsFixedCount = 0
            for anchorTag in soup.find_all(['a']):
                if anchorTag.has_attr('name'):
                    anchorTag['id'] = anchorTag['name']
                    del anchorTag['name']
                    tagsFixedCount += 1
            if tagsFixedCount > 0:
                print(
                    'Converted %d `name` attribute into `id` in <a> tag(s).' %
                    tagsFixedCount)
                plsWriteBack = True

            # remove lang, link, vlink attr, mso or calibre class
            for node in soup.find_all(True):
                del node['lang']
                del node['link']
                del node['vlink']
                class_attr = node.get('class')
                if class_attr:
                    try:
                        classes = class_attr.split(' ')
                    except:
                        classes = class_attr
                    new_classes = []
                    for cl in classes:
                        if not (cl.startswith('Mso') or cl.startswith('mso')
                                or cl.startswith('calibre')):
                            new_classes.append(cl)
                    if len(new_classes) > 0:
                        node['class'] = ' '.join(new_classes)
                    else:
                        del node['class']

                    plsWriteBack = True

            if plsWriteBack:
                reloadSoup()

        def easyClean2():
            nonlocal soup
            plsWriteBack = False

            # remove all data-* attributes from tags
            tagsFixedCount = 0
            for buggyTag in soup.find_all(True):
                attrDel = 0
                for attr in list(buggyTag.attrs.keys()):
                    if attr.startswith('data-'):
                        del buggyTag[attr]
                        attrDel += 1
                    elif attr == 'itemprop':
                        del buggyTag[attr]
                        attrDel += 1
                    elif attr == 'target':
                        del buggyTag[attr]
                        attrDel += 1
                if attrDel > 0:
                    tagsFixedCount += 1

            if tagsFixedCount > 0:
                reloadSoup()
                print(
                    'Removed itemprop/data-*/target attribute(s) from %d tag(s).'
                    % tagsFixedCount)

            # remove  align/noshade/size/width attributes from <hr> tags
            tagsFixedCount = 0
            for buggyTag in soup.find_all('hr'):
                attrDel = 0
                for attr in list(buggyTag.attrs.keys()):
                    if attr in ['align', 'noshade', 'size', 'width']:
                        del buggyTag[attr]
                        attrDel += 1
                if attrDel > 0:
                    tagsFixedCount += 1
            if tagsFixedCount > 0:
                reloadSoup()
                print(
                    'Removed all deprecated attributes from %d <hr> tag(s).' %
                    tagsFixedCount)

            # handle align attribute in p, div, span
            tagsFixedCount = 0
            for pdivspanTag in soup.find_all(True):
                alignAttr = pdivspanTag.get('align')
                if alignAttr != None:
                    styleAttr = pdivspanTag.get('style')
                    if styleAttr:
                        pdivspanTag[
                            'style'] = 'text-align: %s; ' % alignAttr + styleAttr
                    else:
                        pdivspanTag['style'] = 'text-align: %s;' % alignAttr
                    del pdivspanTag['align']
                    tagsFixedCount += 1
            if tagsFixedCount > 0:
                reloadSoup()
                print(
                    'Converted align attribute in %d p/div/span tag(s) into css style.'
                    % tagsFixedCount)

            # remove all links except for stylesheet ones
            for node in soup.find_all(['link', 'meta']):
                if not node.get('rel') == "stylesheet":
                    node.decompose()
                    plsWriteBack = True

            # Ziru’s Musings ads or placeholders for ads
            for node in soup.findAll(
                    'div',
                {'class': lambda x: x and ('ezoic-adpicker-ad' in x.split())}):
                node.decompose()
                plsWriteBack = True

            if plsWriteBack:
                reloadSoup()

        def removeAllStyleAttr():
            nonlocal soup
            # hard-core clean up. strip all style
            # generally should not be used
            for node in soup.find_all(True):
                del node['style']
            reloadSoup()

        def removeEmptyStyleAttr():
            nonlocal soup
            plsWriteBack = False
            for node in soup.find_all(True):
                if node.has_attr('style'):
                    styleAttr = node['style'].strip()
                    if styleAttr:
                        node['style'] = styleAttr
                    else:
                        del node['style']
                        plsWriteBack = True
            if plsWriteBack:
                reloadSoup()

        def stripHeaderFormattings():
            nonlocal soup
            # strip all formatings from headings as BTE-GEN does
            headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            headingStrippedCount = 0
            for lv in headingLv:
                for headingTag in soup.find_all(lv):
                    if len(headingTag.find_all('img')) == 0 and (
                            len(headingTag.find_all(True)) > 0
                            or headingTag.get('style')):
                        headingTag.string = headingTag.get_text().strip()
                        del headingTag['style']
                        headingStrippedCount += 1
            if headingStrippedCount > 0:
                reloadSoup()
                print(
                    'Stripped formatings from %d headings to match BTE-GEN\'s behavior.'
                    % headingStrippedCount)

        def removedNoDisplayDiv():
            nonlocal soup

            # remove all <div style="display:none;">
            modifiedTagCount = 0
            removeMe = []

            for divTag in soup.find_all('div'):
                if divTag and divTag.get("style") and 'display:none' in re.sub(
                        "\s", "", divTag.get("style")):
                    removeMe.append(divTag)
                    modifiedTagCount += 1

            if modifiedTagCount > 0:
                for divTag in removeMe:
                    divTag.decompose()
                print('Removed %d <div style="display:none;"> tags.' %
                      modifiedTagCount)
                reloadSoup()

        def fixBadIBUusage():
            """Fix block-level tags nested inside inline formatting tags.

            HTML forbids block content (p, div, table, ...) inside inline
            formatting tags.  For each offending formatting tag, push the
            formatting down onto each child (re-wrap text and <p> contents
            in a fresh copy of the tag, or add the equivalent inline CSS),
            then unwrap the outer tag itself.
            """
            nonlocal soup
            # handle the invalid usage of <i> tags in HakoMari vol 2 may 2. This is due to a major error in the source page, but it can't be helped.
            # also stuff here https://baka-tsuki.org/project/index.php?title=User_talk:Dreamer2908
            # ref http://www.w3schools.com/html/html_formatting.asp
            headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            tagsFixedCount = 0
            # CSS fallback used for block children that can't simply be
            # re-wrapped in a copy of the formatting tag
            tag2Css = {
                'b': 'font-weight: bold;',
                'strong': 'font-weight: bold;',
                'i': 'font-style: italic;',
                'em': 'font-style: italic;',
                'big': 'font-size: large',
                'small': 'font-size: smaller',
                'mark': 'background-color: yellow; color: black;',
                's': 'text-decoration: line-through;',
                'strike': 'text-decoration: line-through;',
                'del': 'text-decoration: line-through;',
                'ins': 'text-decoration: underline;',
                'sub': 'vertical-align: sub; font-size: smaller;',
                'sup': 'vertical-align: super; font-size: smaller;',
                'u': 'text-decoration: underline;',
            }
            for iTag in soup.find_all([
                    'b', 'strong', 'i', 'em', 'big', 'small', 'mark', 's',
                    'strike', 'del', 'ins', 'sub', 'sup', 'u'
            ]):
                # block-level descendants that are illegal inside iTag
                illegalChild = iTag.find_all([
                    'p', 'div', 'table', 'blockquote', 'pre', 'caption', 'dl',
                    'hr', 'section', 'ul', 'ol'
                ] + headingLv)
                if len(illegalChild) > 0:
                    tagsFixedCount += 1
                    for child in iTag.children:
                        if type(child) == sigil_bs4.element.NavigableString:
                            # a lot of unwanted `<p><i> </i></p>` line will be created if you wrap everything without checking
                            if str(child).strip() != '':
                                wrapper = child.wrap(soup.new_tag(iTag.name))
                                wrapper.wrap(soup.new_tag('p'))
                        elif child.name == 'p':
                            # re-apply the formatting inside the <p> instead
                            for grandChild in child.children:
                                if type(grandChild) == sigil_bs4.element.Tag:
                                    if grandChild.name == iTag.name:
                                        grandChild.unwrap(
                                        )  # remove italic from italic text
                                    else:
                                        grandChild.wrap(soup.new_tag(
                                            iTag.name))
                                else:
                                    grandChild.wrap(soup.new_tag(iTag.name))
                        elif child.name not in headingLv:  # skip styling headings
                            # other block child: express the formatting as CSS
                            styleAttr = child.get('style')
                            if styleAttr:
                                child['style'] = tag2Css[iTag.name] + styleAttr
                            else:
                                child['style'] = tag2Css[iTag.name]
                    iTag.unwrap()

            if tagsFixedCount > 0:
                reloadSoup()
                print(
                    'Fixed %d range of invalid usage of text formatting tags (i/b/u/etc.)'
                    % tagsFixedCount)

        def convertPossibleDivToP():
            nonlocal soup
            # convert div into p if possible
            modifiedTagCount = 0
            for divTag in soup.find_all('div'):
                if canBeConvertedIntoP(divTag):
                    divTag.name = 'p'
                    modifiedTagCount += 1
                # elif not (divTag.get('style') or divTag.get('id') or divTag.get('class')):
                # 	divTag.unwrap()
            if modifiedTagCount > 0:
                reloadSoup()
                print('Converted %d div tags into p.' % modifiedTagCount)

        def unwarpSingleBigDiv():
            nonlocal soup

            # unwrap the big single div holding all contents
            bigDivCount = 0
            for node in soup.body.contents:
                if (type(node) == sigil_bs4.element.Tag):
                    if (node.name == 'div'):
                        bigDivCount += 1
                    else:
                        bigDivCount += 1000
            if bigDivCount == 1:
                soup.body.div.unwrap()
                reloadSoup()
                print('Unwrapped the big single div holding all contents.')

        def unwarpPossibleDiv_basic():
            nonlocal soup

            modifiedTagCount = 0
            for divTag in soup.find_all('div'):
                if canBeUnwrap(divTag):
                    divTag.unwrap()
                    modifiedTagCount += 1
            if modifiedTagCount > 0:
                reloadSoup()
                print('Unwrapped %d div tags.' % modifiedTagCount)

        def unwarpPossibleDiv_experimental():
            nonlocal soup

            modifiedTagCount = 0
            pNodes = []
            for divTag in soup.find_all('div'):
                if canBeUnwrap(divTag):
                    pNodes.append(divTag)
                    modifiedTagCount += 1

            splitNodesIntoP(pNodes)

            if modifiedTagCount > 0:
                reloadSoup()
                print('Unwrapped %d div tags.' % modifiedTagCount)

        # remove empty span
        # do this before wrap stray tags
        def removeEmptySpan():
            """Unwrap <span> tags that carry no text, no meaningful style,
            and no Word TOC anchor id, recursing into nested spans first."""

            def removeEmptySpanSub(spanTag):
                # Returns True if this span (or a nested one) was unwrapped.
                if spanTag.parent is None:
                    # already detached by an earlier unwrap
                    return False

                modified = False
                # clean inner spans before judging the outer one
                if containChildTags(spanTag, ['span']):
                    for subSpanTag in spanTag.find_all(['span']):
                        changed = removeEmptySpanSub(subSpanTag)
                        if changed:
                            modified = True

                if spanTag.get_text().strip() == '' and not containChildTags(
                        spanTag,
                    ['span', 'img'
                     ]):  # if it still has some span, don't decompose it
                    spanTag.unwrap()
                    modified = True
                elif not (spanTag.get('style') or
                          (spanTag.get('id')
                           and spanTag.get('id').startswith('_Toc'))):
                    # no style and no Word TOC anchor id: span is useless
                    spanTag.unwrap()
                    modified = True
                elif spanTag.get('style') and (
                        spanTag.get('style').strip() == "font-weight: 400;"
                        or spanTag.get('style').strip() == ""):
                    # normal font weight / blank style adds nothing
                    spanTag.unwrap()
                    modified = True

                return modified

            nonlocal soup
            plsWriteBack = False

            for spanTag in soup.find_all(['span']):
                modified = removeEmptySpanSub(spanTag)
                if modified:
                    plsWriteBack = True

            if plsWriteBack:
                reloadSoup()

        def wrapStrayText_basic():
            """Wrap stray direct children of <body> — bare text, <br>, <a>,
            <span> and text formatting tags — in
            <p class="baka_epub_stray_elements">."""
            nonlocal soup

            # wrap stray (direct decendant of body) <br>/<span>/<a>, text formatting tags and text in <p> (krytykal/skythewood/imoutolicious source)
            phantomWrapped = 0
            removeMe = []

            for child in soup.body.contents:
                if type(child) == sigil_bs4.element.NavigableString:
                    # a lot of unwanted `<p> </p>` line will be created if you wrap everything without checking
                    if str(child).strip() != '':
                        child.wrap(soup.new_tag(
                            'p'))['class'] = 'baka_epub_stray_elements'
                        phantomWrapped += 1
                    else:
                        child.replace_with(
                            '\n'
                        )  # eliminate blank stray texts that aren't newline or true white spaces

                elif type(child) == sigil_bs4.element.Tag:
                    if child.name in ['br', 'a']:
                        child.wrap(soup.new_tag(
                            'p'))['class'] = 'baka_epub_stray_elements'
                        phantomWrapped += 1
                    elif child.name in [
                            'span', 'b', 'strong', 'i', 'em', 'big', 'small',
                            'mark', 's', 'strike', 'del', 'ins', 'sub', 'sup',
                            'u'
                    ]:
                        # for these, check if they have some contents. remove if no
                        if (len(child.get_text().strip()) > 0
                                or len(child.find_all(True)) > 0):
                            child.wrap(soup.new_tag(
                                'p'))['class'] = 'baka_epub_stray_elements'
                        else:
                            # empty inline tag: defer removal until after
                            # the iteration over body.contents
                            removeMe.append(child)
                        phantomWrapped += 1

            if phantomWrapped > 0:
                for element in removeMe:
                    element.decompose()
                reloadSoup()
                print(
                    'Wrapped %d stray <br>/<span>/<a>, text formatting tags and texts in <p>.'
                    % phantomWrapped)

        def wrapStrayText_experimental():
            nonlocal soup

            splitNodesIntoP((soup.body, ))

        def removeEmptyP():
            nonlocal soup

            plsWriteBack = False
            for spanTag in soup.find_all(['p']):
                # remove empty p
                if spanTag.get_text().strip() == '' and len(
                        spanTag.find_all(['img'])) == 0:
                    spanTag.decompose()
                    plsWriteBack = True

            if plsWriteBack:
                reloadSoup()

        cleanUpForWordpress()
        cleanUpForTruyenFull()
        unwarpSingleBigDiv()

        easyClean1()
        easyClean2()
        removeEmptyStyleAttr()
        stripHeaderFormattings()

        fixBadIBUusage()

        removedNoDisplayDiv()
        convertPossibleDivToP()
        unwarpPossibleDiv_experimental()

        removeEmptySpan()
        wrapStrayText_experimental()

        textContents = soup.serialize_xhtml()

        # strip all comments
        textContents = re.sub('<!--(.*?)-->',
                              '',
                              textContents,
                              flags=re.DOTALL)

        bk.writefile(textID, textContents)

    print('Done.')
    return 0
Example #15
0
        def splitNodesIntoP(pNodes):
            """Split each node's children into separate <p> blocks at
            <br><br> boundaries and around images, then unwrap the
            temporary container divs used during the rewrite."""
            nonlocal soup

            # level 1:
            #    try to split <p>line 1<br/><br/>line 2<img alt='' src='image.jpg'/>line3</p>
            #    into <p>line 1</p> <p>line 2</p> <p><img alt='' src='image.jpg'/></p> <p>line3</p>
            #    Remember to copy style and class. ID goes to the first p
            # level 2:
            #    into <p>line 1</p> <p></p> <p>line 2</p> <p><img alt='' src='image.jpg'/></p> <p>line3</p>
            # level x:
            #    try to handle <br/> nested inside something else like <p>line 1 <i>italic text<br/>line 2 in italic</i></p>
            #    into <p>line 1 <i>italic text</i></p> <p><i>line 2 in italic</i></p>
            # current at level 2, but empty lines are removed at later stage so it doesn't even matter
            # TODO: copy style, class, id
            unwrapUsID = []
            for textNode in pNodes:
                # for now, we put all new p in a container (and unwrap it later)
                newTextNode = soup.new_tag('div')
                # random id so the container can be found and unwrapped below
                newTextNode_id = 'id-' + str(uuid.uuid4())
                newTextNode['id'] = newTextNode_id
                unwrapUsID.append(newTextNode_id)
                # removeMe = []
                # previousP: the <p> currently being filled; None forces a
                # new paragraph on the next content node.
                previousP = None
                lastChildWasBr = False
                for child in textNode.contents:
                    if type(child) == sigil_bs4.element.NavigableString:
                        lastChildWasBr = False
                        # a lot of unwanted `<p> </p>` line will be created if you wrap everything without checking
                        if str(child).strip() != '':
                            if previousP:
                                previousP.append(copy.copy(child))
                            else:
                                child = copy.copy(child)
                                newTextNode.append(child)
                                previousP = child.wrap(soup.new_tag('p'))
                        else:
                            newTextNode.append(
                                copy.copy(child))  # yes, copy even blank space

                    elif type(child) == sigil_bs4.element.Tag:
                        if child.name == 'br':
                            # two consecutive <br> = a deliberate blank line
                            if lastChildWasBr:
                                newTextNode.append(soup.new_tag('p'))
                            lastChildWasBr = True
                            previousP = None

                        elif child.name == 'img':
                            # images get their own styled wrapper div
                            child = copy.copy(child)
                            newTextNode.append(child)
                            tmpNode = child.wrap(soup.new_tag('div'))
                            tmpNode['class'] = "svg_outer svg_inner"
                            lastChildWasBr = False
                            previousP = None

                        elif child.name not in tagsNotAllowedInP:
                            lastChildWasBr = False
                            # for these, check if they have some contents. skip copying if no
                            if (len(child.get_text().strip()) > 0
                                    or len(child.find_all(True)) > 0
                                ) or child.has_attr('id') or child.has_attr(
                                    'name'):
                                if previousP:
                                    previousP.append(copy.copy(child))
                                else:
                                    child = copy.copy(child)
                                    newTextNode.append(child)
                                    previousP = child.wrap(soup.new_tag('p'))
                        else:
                            lastChildWasBr = False
                            # stuff not allowed in <p>
                            child = copy.copy(child)
                            newTextNode.append(child)
                textNode.replace_with(newTextNode)

            # round-trip through the parser so the new tree is normalized
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)

            # drop the temporary container divs created above
            for node in soup.body.find_all('div'):
                if node.get('id') in unwrapUsID:
                    node.unwrap()

            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)