def educateQuotes(text, language='en'):
    # type: (str, str) -> str
    """Replace straight quotes in `text` with typographic ("curly") quotes.

    Parameter:  - text: the string to convert.
                - language: `BCP 47` language tag; it is passed to
                  ``smartquotes.smartchars`` to select the quote characters
                  and also enables the English-only decade rule.
    Returns:    The `text`, with "educated" curly quote characters.

    Example input:  "Isn't this fun?"
    Example output: “Isn’t this fun?”
    """
    smart = smartquotes.smartchars(language)
    # Fall back to U+2019 when the smartchars object has no `apostrophe`.
    apostrophe = getattr(smart, 'apostrophe', u'’')

    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break.
    # Close the quotes by brute force:
    # NOTE(review): inside a raw string `\\B` is a literal backslash followed
    # by "B", not the `\B` non-word-boundary assertion described above.  It is
    # left unchanged because correcting it alters the output for any text that
    # starts with a quote plus punctuation -- confirm the intent first.
    text = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), smart.csquote, text)
    text = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), smart.cpquote, text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    text = re.sub(r""""'(?=\w)""", smart.opquote + smart.osquote, text)
    text = re.sub(r"""'"(?=\w)""", smart.osquote + smart.opquote, text)

    # Special case for decade abbreviations (the '80s):
    if language.startswith('en'):  # TODO similar cases in other languages?
        # The flag must be passed by keyword: the fourth positional argument
        # of re.sub() is `count`, so a positional re.UNICODE would silently
        # cap the number of replacements instead of setting a flag.
        text = re.sub(r"""'(?=\d{2}s)""", apostrophe, text, flags=re.UNICODE)

    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    dec_dashes = r"""–|—"""  # literal en dash / em dash

    # Get most opening single quotes:
    opening_single_quotes_regex = re.compile(r"""
        (
            \s          |   # a whitespace char, or
            \xa0        |   # a non-breaking space (explicit escape, so the
                            # alternative can never collapse to empty), or
            --          |   # dashes, or
            &[mn]dash;  |   # named dash entities, or
            %s          |   # literal en/em dashes, or
            &\#x201[34];    # hex dash entities
        )
        '                   # the quote
        (?=\w)              # followed by a word character
        """ % (dec_dashes,), re.VERBOSE | re.UNICODE)
    text = opening_single_quotes_regex.sub(r'\1' + smart.osquote, text)

    # In many locales, single closing quotes are different from apostrophe:
    if smart.csquote != apostrophe:
        # A quote between two word characters is an apostrophe
        # (`\w` already includes the digits).
        apostrophe_regex = re.compile(r"(?<=\w)'(?=\w)", re.UNICODE)
        text = apostrophe_regex.sub(apostrophe, text)
    # TODO: keep track of quoting level to recognize apostrophe in, e.g.,
    # "Ich fass' es nicht."

    # Closing single quotes, pass 1: quote after a "closing" character and
    # not followed by whitespace, a possessive/plural "s", or a digit.
    closing_single_quotes_regex = re.compile(r"""
        (%s)
        '
        (?!\s  |            # whitespace
           s\b |
           \d               # digits   ('80s)
        )
        """ % (close_class,), re.VERBOSE | re.UNICODE)
    text = closing_single_quotes_regex.sub(r'\1' + smart.csquote, text)

    # Closing single quotes, pass 2: the cases excluded above (followed by
    # whitespace or a word-final "s"); the follower is kept via group 2.
    closing_single_quotes_regex = re.compile(r"""
        (%s)
        '
        (\s | s\b)
        """ % (close_class,), re.VERBOSE | re.UNICODE)
    text = closing_single_quotes_regex.sub(r'\1%s\2' % smart.csquote, text)

    # Any remaining single quotes should be opening ones:
    text = re.sub(r"""'""", smart.osquote, text)

    # Get most opening double quotes:
    opening_double_quotes_regex = re.compile(r"""
        (
            \s          |   # a whitespace char, or
            \xa0        |   # a non-breaking space, or
            --          |   # dashes, or
            &[mn]dash;  |   # named dash entities, or
            %s          |   # literal en/em dashes, or
            &\#x201[34];    # hex dash entities
        )
        "                   # the quote
        (?=\w)              # followed by a word character
        """ % (dec_dashes,), re.VERBOSE)
    text = opening_double_quotes_regex.sub(r'\1' + smart.opquote, text)

    # Double closing quotes: any quote directly followed by whitespace ...
    closing_double_quotes_regex = re.compile(r'"(?=\s)')
    text = closing_double_quotes_regex.sub(smart.cpquote, text)

    # ... and any quote directly preceded by a "closing" character.
    closing_double_quotes_regex = re.compile(r"""
        (%s)    # character that indicates the quote should be closing
        "
        """ % (close_class,), re.VERBOSE)
    text = closing_double_quotes_regex.sub(r'\1' + smart.cpquote, text)

    # Any remaining quotes should be opening ones.
    text = re.sub(r'"', smart.opquote, text)

    return text
def educateQuotes(text: str, language: str = 'en') -> str:
    """Replace straight quotes in `text` with typographic ("curly") quotes.

    Parameter:  - text: the string to convert.
                - language: `BCP 47` language tag; it is passed to
                  ``smartquotes.smartchars`` to select the quote characters
                  and also enables the English-only decade rule.
    Returns:    The `text`, with "educated" curly quote characters.

    Example input:  "Isn't this fun?"
    Example output: “Isn’t this fun?”
    """
    smart = smartquotes.smartchars(language)
    # Fall back to U+2019 when the smartchars object has no `apostrophe`.
    apostrophe = getattr(smart, 'apostrophe', '’')

    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break.
    # Close the quotes by brute force:
    # NOTE(review): inside a raw string `\\B` is a literal backslash followed
    # by "B", not the `\B` non-word-boundary assertion described above.  It is
    # left unchanged because correcting it alters the output for any text that
    # starts with a quote plus punctuation -- confirm the intent first.
    text = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), smart.csquote, text)
    text = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), smart.cpquote, text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    text = re.sub(r""""'(?=\w)""", smart.opquote + smart.osquote, text)
    text = re.sub(r"""'"(?=\w)""", smart.osquote + smart.opquote, text)

    # Special case for decade abbreviations (the '80s):
    if language.startswith('en'):  # TODO similar cases in other languages?
        # `flags` must stay a keyword argument: the fourth positional
        # argument of re.sub() is `count`.
        text = re.sub(r"""'(?=\d{2}s)""", apostrophe, text, flags=re.UNICODE)

    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    dec_dashes = r"""–|—"""  # literal en dash / em dash

    # Get most opening single quotes:
    opening_single_quotes_regex = re.compile(
        r"""
        (
            \s          |   # a whitespace char, or
            \xa0        |   # a non-breaking space (explicit escape, so the
                            # alternative can never collapse to empty), or
            --          |   # dashes, or
            &[mn]dash;  |   # named dash entities, or
            %s          |   # literal en/em dashes, or
            &\#x201[34];    # hex dash entities
        )
        '                   # the quote
        (?=\w)              # followed by a word character
        """ % (dec_dashes, ), re.VERBOSE | re.UNICODE)
    text = opening_single_quotes_regex.sub(r'\1' + smart.osquote, text)

    # In many locales, single closing quotes are different from apostrophe:
    if smart.csquote != apostrophe:
        # A quote between two word characters is an apostrophe
        # (`\w` already includes the digits).
        apostrophe_regex = re.compile(r"(?<=\w)'(?=\w)", re.UNICODE)
        text = apostrophe_regex.sub(apostrophe, text)
    # TODO: keep track of quoting level to recognize apostrophe in, e.g.,
    # "Ich fass' es nicht."

    # Closing single quotes, pass 1: quote after a "closing" character and
    # not followed by whitespace, a possessive/plural "s", or a digit.
    closing_single_quotes_regex = re.compile(
        r"""
        (%s)
        '
        (?!\s  |            # whitespace
           s\b |
           \d               # digits   ('80s)
        )
        """ % (close_class, ), re.VERBOSE | re.UNICODE)
    text = closing_single_quotes_regex.sub(r'\1' + smart.csquote, text)

    # Closing single quotes, pass 2: the cases excluded above (followed by
    # whitespace or a word-final "s"); the follower is kept via group 2.
    closing_single_quotes_regex = re.compile(
        r"""
        (%s)
        '
        (\s | s\b)
        """ % (close_class, ), re.VERBOSE | re.UNICODE)
    text = closing_single_quotes_regex.sub(r'\1%s\2' % smart.csquote, text)

    # Any remaining single quotes should be opening ones:
    text = re.sub(r"""'""", smart.osquote, text)

    # Get most opening double quotes:
    opening_double_quotes_regex = re.compile(
        r"""
        (
            \s          |   # a whitespace char, or
            \xa0        |   # a non-breaking space, or
            --          |   # dashes, or
            &[mn]dash;  |   # named dash entities, or
            %s          |   # literal en/em dashes, or
            &\#x201[34];    # hex dash entities
        )
        "                   # the quote
        (?=\w)              # followed by a word character
        """ % (dec_dashes, ), re.VERBOSE)
    text = opening_double_quotes_regex.sub(r'\1' + smart.opquote, text)

    # Double closing quotes: any quote directly followed by whitespace ...
    closing_double_quotes_regex = re.compile(r'"(?=\s)')
    text = closing_double_quotes_regex.sub(smart.cpquote, text)

    # ... and any quote directly preceded by a "closing" character.
    closing_double_quotes_regex = re.compile(
        r"""
        (%s)    # character that indicates the quote should be closing
        "
        """ % (close_class, ), re.VERBOSE)
    text = closing_double_quotes_regex.sub(r'\1' + smart.cpquote, text)

    # Any remaining quotes should be opening ones.
    text = re.sub(r'"', smart.opquote, text)

    return text