コード例 #1
0
    def smarten_punctuation(self):
        preprocessor = HeuristicProcessor(log=self.log)

        for name in self.get_html_names():
            self.log.info("Smartening punctuation for file {0}".format(name))
            html = self.get_raw(name)
            html = html.encode("UTF-8")

            # Fix non-breaking space indents
            html = preprocessor.fix_nbsp_indents(html)
            # Smarten punctuation
            html = smartyPants(html)
            # Ellipsis to HTML entity
            html = re.sub(ur'(?u)(?<=\w)\s?(\.\s+?){2}\.', '&hellip;', html, flags=re.UNICODE | re.MULTILINE)
            # Double-dash and unicode char code to em-dash
            html = string.replace(html, '---', ' &#x2013; ')
            html = string.replace(html, u"\x97", ' &#x2013; ')
            html = string.replace(html, '--', ' &#x2014; ')
            html = string.replace(html, u"\u2014", ' &#x2014; ')
            html = string.replace(html, u"\u2013", ' &#x2013; ')

            # Fix comment nodes that got mangled
            html = string.replace(html, u'<! &#x2014; ', u'<!-- ')
            html = string.replace(html, u' &#x2014; >', u' -->')

            # Remove Unicode replacement characters
            html = string.replace(html, u"\uFFFD", "")

            self.dirty(name)
        self.flush_cache()
コード例 #2
0
    def smarten_punctuation(self):
        """Convert standard punctuation to "smart" punctuation."""
        preprocessor = HeuristicProcessor(log=self.log)

        for name in self.html_names():
            self.log.info("Smartening punctuation for file {0}".format(name))
            html = self.get_raw(name, force_unicode=True)
            if html is None:
                continue

            # Fix non-breaking space indents
            html = preprocessor.fix_nbsp_indents(html)

            # Smarten punctuation
            html = smartyPants(html)

            # Ellipsis to HTML entity
            html = ELLIPSIS_RE.sub("&hellip;", html)

            # Double-dash and unicode char code to em-dash
            html = string.replace(html, "---", " &#x2013; ")
            html = string.replace(html, "\x97", " &#x2013; ")
            html = string.replace(html, "\u2013", " &#x2013; ")
            html = string.replace(html, "--", " &#x2014; ")
            html = string.replace(html, "\u2014", " &#x2014; ")

            # Fix comment nodes that got mangled
            html = string.replace(html, "<! &#x2014; ", "<!-- ")
            html = string.replace(html, " &#x2014; >", " -->")

            self.dirty(name)
        self.flush_cache()
コード例 #3
0
    def smarten_punctuation(self):
        preprocessor = HeuristicProcessor(log=self.log)

        for name in self.get_html_names():
            self.log.info("Smartening punctuation for file {0}".format(name))
            html = self.get_raw(name)
            html = html.encode("UTF-8")

            # Fix non-breaking space indents
            html = preprocessor.fix_nbsp_indents(html)
            # Smarten punctuation
            html = smartyPants(html)
            # Ellipsis to HTML entity
            html = re.sub(ur'(?u)(?<=\w)\s?(\.\s+?){2}\.',
                          '&hellip;',
                          html,
                          flags=re.UNICODE | re.MULTILINE)
            # Double-dash and unicode char code to em-dash
            html = string.replace(html, '---', ' &#x2013; ')
            html = string.replace(html, u"\x97", ' &#x2013; ')
            html = string.replace(html, '--', ' &#x2014; ')
            html = string.replace(html, u"\u2014", ' &#x2014; ')
            html = string.replace(html, u"\u2013", ' &#x2013; ')

            # Fix comment nodes that got mangled
            html = string.replace(html, u'<! &#x2014; ', u'<!-- ')
            html = string.replace(html, u' &#x2014; >', u' -->')

            # Remove Unicode replacement characters
            html = string.replace(html, u"\uFFFD", "")

            self.dirty(name)
        self.flush_cache()
コード例 #4
0
    def __smarten_punctuation_impl(self, name):  # type: (str) -> None
        """Convert standard punctuation to "smart" punctuation."""
        preprocessor = HeuristicProcessor(log=self.log)

        self.log.debug("Smartening punctuation for file {0}".format(name))
        html = self.raw_data(name, decode=True, normalize_to_nfc=True)
        if html is None:
            raise Exception(
                _(  # noqa: F821 - _ is defined in globals by calibre
                    "No HTML content in file {0}").format(name))

        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)

        # Smarten punctuation
        html = smartyPants(html)

        # Ellipsis to HTML entity
        html = ELLIPSIS_RE.sub("&hellip;", html)

        # Double-dash and unicode char code to em-dash
        html = html.replace("---", " &#x2013; ")
        html = html.replace("\x97", " &#x2013; ")
        html = html.replace("\u2013", " &#x2013; ")
        html = html.replace("--", " &#x2014; ")
        html = html.replace("\u2014", " &#x2014; ")

        # Fix comment nodes that got mangled
        html = html.replace("<! &#x2014; ", "<!-- ")
        html = html.replace(" &#x2014; >", " -->")

        self.replace(name, self.parse_xhtml(html))
        self.flush_cache()
コード例 #5
0
    def smarten_punctuation(self):
        preprocessor = HeuristicProcessor(log=self.log)

        for name in self.get_html_names():
            self.log.debug("Smartening punctuation for file {0}".format(name))
            html = self.get_raw(name)
            html = html.encode("UTF-8")

            # Fix non-breaking space indents
            html = preprocessor.fix_nbsp_indents(html)
            # Smarten punctuation
            html = smartyPants(html)
            # Ellipsis to HTML entity
            html = re.sub(r'(?u)(?<=\w)\s?(\.\s+?){2}\.', '&hellip;', html)
            # Double-dash and unicode char code to em-dash
            html = string.replace(html, '---', ' &#x2013; ')
            html = string.replace(html, u"\x97", ' &#x2013; ')
            html = string.replace(html, '--', ' &#x2014; ')
            html = string.replace(html, u"\u2014", ' &#x2014; ')
            html = string.replace(html, u"\u2013", ' &#x2013; ')
            html = string.replace(html, u"...", "&#x2026;")

            # Remove Unicode replacement characters
            html = string.replace(html, u"\uFFFD", "")

            self.set(name, html)
コード例 #6
0
	def smarten_punctuation(self):
		preprocessor = HeuristicProcessor(log = self.log)

		for name in self.get_html_names():
			html = self.get_raw(name)
			html = html.encode("UTF-8")

			# Fix non-breaking space indents
			html = preprocessor.fix_nbsp_indents(html)
			# Smarten punctuation
			html = smartyPants(html)
			# Ellipsis to HTML entity
			html = re.sub(r'(?u)(?<=\w)\s?(\.\s+?){2}\.', '&hellip;', html)
			# Double-dash and unicode char code to em-dash
			html = string.replace(html, '---', ' &#x2013; ')
			html = string.replace(html, u"\x97", ' &#x2013; ')
			html = string.replace(html, '--', ' &#x2014; ')
			html = string.replace(html, u"\u2014", ' &#x2014; ')
			html = string.replace(html, u"\u2013", ' &#x2013; ')
			html = string.replace(html, u"...", "&#x2026;")

			# Remove Unicode replacement characters
			html = string.replace(html, u"\uFFFD", "")

			self.set(name, html)
コード例 #7
0
    def smarten_punctuation(self):
        """Convert standard punctuation to "smart" punctuation."""
        preprocessor = HeuristicProcessor(log=self.log)

        for name in self.html_names():
            self.log.debug("Smartening punctuation for file {0}".format(name))
            html = self.raw_data(name, decode=True, normalize_to_nfc=True)
            if html is None:
                continue

            # Fix non-breaking space indents
            html = preprocessor.fix_nbsp_indents(html)

            # Smarten punctuation
            html = smartyPants(html)

            # Ellipsis to HTML entity
            html = ELLIPSIS_RE.sub("&hellip;", html)

            # Double-dash and unicode char code to em-dash
            html = string.replace(html, "---", " &#x2013; ")
            html = string.replace(html, "\x97", " &#x2013; ")
            html = string.replace(html, "\u2013", " &#x2013; ")
            html = string.replace(html, "--", " &#x2014; ")
            html = string.replace(html, "\u2014", " &#x2014; ")

            # Fix comment nodes that got mangled
            html = string.replace(html, "<! &#x2014; ", "<!-- ")
            html = string.replace(html, " &#x2014; >", " -->")

            with self.open(name, "wb") as f:
                f.write(
                    html.encode(self.encoding_map.get(name,
                                                      self.used_encoding)))
コード例 #8
0
def smarten_punctuation(html, log=None):
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
    start = 'calibre-smartypants-'+str(uuid4())
    stop = 'calibre-smartypants-'+str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
    return substitute_entites(html)
コード例 #9
0
def smarten_punctuation(html, log):
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
    start = 'calibre-smartypants-' + str(uuid4())
    stop = 'calibre-smartypants-' + str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
    # convert ellipsis to entities to prevent wrapping
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
    # convert double dashes to em-dash
    html = re.sub(r'\s--\s', u'\u2014', html)
    return substitute_entites(html)
コード例 #10
0
ファイル: preprocess.py プロジェクト: 2014gwang/KindleEar
def smarten_punctuation(html, log):
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
    start = 'calibre-smartypants-'+str(uuid4())
    stop = 'calibre-smartypants-'+str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
    # convert ellipsis to entities to prevent wrapping
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
    # convert double dashes to em-dash
    html = re.sub(r'\s--\s', u'\u2014', html)
    return substitute_entites(html)
コード例 #11
0
def smarten_punctuation(html, log):
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor

    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4

    start = "calibre-smartypants-" + str(uuid4())
    stop = "calibre-smartypants-" + str(uuid4())
    html = html.replace("<!--", start)
    html = html.replace("-->", stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, "<!--")
    html = html.replace(stop, "-->")
    # convert ellipsis to entities to prevent wrapping
    html = re.sub(r"(?u)(?<=\w)\s?(\.\s?){2}\.", "&hellip;", html)
    # convert double dashes to em-dash
    html = re.sub(r"\s--\s", u"\u2014", html)
    return substitute_entites(html)
コード例 #12
0
    def textile(self, text, rel=None, head_offset=0, html_type='xhtml'):
        """
        >>> import textile
        >>> textile.textile('some textile')
        u'\\t<p>some textile</p>'
        """
        self.html_type = html_type

        # text = type(u'')(text)
        text = _normalize_newlines(text)

        if self.restricted:
            text = self.encode_html(text, quotes=False)

        if rel:
            self.rel = ' rel="%s"' % rel

        text = self.getRefs(text)
        text = self.block(text, int(head_offset))
        text = self.retrieve(text)
        text = smartyPants(text, 'q')

        return text
コード例 #13
0
ファイル: functions.py プロジェクト: JapaChin/calibre
    def textile(self, text, rel=None, head_offset=0, html_type="xhtml"):
        """
        >>> import textile
        >>> textile.textile('some textile')
        u'\\t<p>some textile</p>'
        """
        self.html_type = html_type

        # text = unicode(text)
        text = _normalize_newlines(text)

        if self.restricted:
            text = self.encode_html(text, quotes=False)

        if rel:
            self.rel = ' rel="%s"' % rel

        text = self.getRefs(text)
        text = self.block(text, int(head_offset))
        text = self.retrieve(text)
        text = smartyPants(text, "q")

        return text