Example #1
0
def titlecase(text: str) -> str:
	"""
	Convert a string to title case following SE house style.

	pip_titlecase() does the initial conversion; its output is then run through an
	ordered series of regex corrections followed by two literal replacements.

	INPUTS
	text: The string to titlecase

	OUTPUTS
	A titlecased version of the input string
	"""

	def lowercase_first_group(match):
		# Replace the whole match with its first group, lowercased
		return match.group(1).lower()

	def uppercase_second_group(match):
		# Keep the first group as-is and uppercase the second
		return match.group(1) + match.group(2).upper()

	titlecased = pip_titlecase(text)

	# Each entry is (pattern, replacement). The entries are applied strictly in this order,
	# and each one operates on the output of the previous one.
	corrections = [
		# Lowercase the full contents of HTML tags (attributes included) that pip_titlecase may have capitalized.
		# Attributes are typically lowercase anyway. (`alt` would be an exception, but images aren't titlecased.)
		(r"<(/?)([^>]+?)>", lambda match: "<{}{}>".format(match.group(1), match.group(2).lower())),

		# Leading "D’" becomes "d’", as in "Marie d’Elle"
		(r"\bD’([A-Z]+?)", "d’\\1"),

		# "And"/"Or" become lowercase when they follow a non-letter character and a space
		(r"([^a-zA-Z]) (And|Or)\b", lambda match: "{} {}".format(match.group(1), match.group(2).lower())),

		# pip_titlecase capitalizes *all* prepositions that follow an opening parenthesis; only those
		# beginning a subtitle should stay capitalized, so lowercase them when text follows the closing parenthesis.
		# OK: From Sergeant Bulmer (of the Detective Police) to Mr. Pendril
		# OK: Three Men in a Boat (To Say Nothing of the Dog)
		(r"\((For|Of|To)(.*?)\)(.+?)", lambda match: "".join(("(", match.group(1).lower(), match.group(2), ")", match.group(3)))),

		# "And" immediately followed by a word joiner becomes lowercase
		(r"\bAnd{}".format(se.WORD_JOINER), "and{}".format(se.WORD_JOINER)),

		# "In" after a semicolon becomes lowercase (whole word only, so not "Inheritance")
		(r"\b; In\b", "; in"),

		# "From"/"With" become lowercase unless they start the string or directly follow "("
		(r"(?<!^)(?<!\()\b(From|With)\b", lowercase_first_group),

		# Capitalize the first letter after an opening quote, or after an <i> tag whose epub:type marks a work
		(r"(‘|“|<i.*?epub:type=\".*?se:.*?\".*?>)([a-z])", uppercase_second_group),

		# "The" after "vs." becomes lowercase
		(r"(?:vs\.) The\b", "vs. the"),

		# Name particles ("Charles de Gaulle", "Werner von Braun") become lowercase unless they
		# start the string or directly follow an opening double quote
		(r"(?<!^|“)\b(De|Von|Van|Le)\b", lowercase_first_group),

		# The word after "Or, " is probably the start of a subtitle, so capitalize it
		(r"\bOr, ([a-z])", lambda match: "Or, " + match.group(1).upper()),
	]

	for pattern, replacement in corrections:
		titlecased = regex.sub(pattern, replacement, titlecased)

	# Plain-string fixes: repair the ampersand entity, and keep "etc." lowercase
	for capitalized, corrected in (("&Amp;", "&amp;"), ("Etc.", "etc.")):
		titlecased = titlecased.replace(capitalized, corrected)

	return titlecased
Example #2
0
def titlecase(text: str) -> str:
    """
    Titlecase a string according to SE house style.

    The string is first passed through pip_titlecase(), then a fixed sequence of
    adjustments is applied. The adjustments are order-dependent: each one operates
    on the output of the previous one.

    INPUTS
    text: The string to titlecase

    OUTPUTS
    A titlecased version of the input string
    """

    text = pip_titlecase(text)

    # We make some additional adjustments here

    # Lowercase HTML tags that titlecase might have screwed up. We just lowercase the entire contents of the tag, including attributes,
    # since they're typically lowercased anyway. (Except for things like `alt`, but we won't be titlecasing images!)
    text = regex.sub(
        r"<(/?)([^>]+?)>",
        lambda result: "<" + result.group(1) + result.group(2).lower() + ">",
        text)

    # Lowercase leading "d', as in "Marie d'Elle"
    text = regex.sub(r"\bD’([A-Z]+?)", "d’\\1", text)

    # Lowercase "and" and "or", even if preceded by punctuation.
    # The trailing \b restricts this to the whole words; without it, words that merely
    # start with these letters (e.g. "Andrew", "Orange") would have their first letters lowercased.
    text = regex.sub(
        r"([^a-zA-Z]) (And|Or)\b",
        lambda result: result.group(1) + " " + result.group(2).lower(), text)

    # pip_titlecase capitalizes *all* prepositions preceded by parenthesis; we only want to capitalize ones that *aren't the first word of a subtitle*
    # OK: From Sergeant Bulmer (of the Detective Police) to Mr. Pendril
    # OK: Three Men in a Boat (To Say Nothing of the Dog)
    # The trailing (.+?) requires at least one character after the closing parenthesis,
    # so a parenthetical that ends the string is left capitalized.
    text = regex.sub(
        r"\((For|Of|To)(.*?)\)(.+?)", lambda result: "(" + result.group(1).
        lower() + result.group(2) + ")" + result.group(3), text)

    # Lowercase "and", if followed by a word-joiner
    regex_string = r"\bAnd{}".format(se.WORD_JOINER)
    text = regex.sub(regex_string, "and{}".format(se.WORD_JOINER), text)

    # Lowercase "from", "with", as long as they're not the first word and not preceded by a parenthesis
    text = regex.sub(r"(?<!^)(?<!\()\b(From|With)\b",
                     lambda result: result.group(1).lower(), text)

    # Lowercase "the" if preceded by "vs."
    text = regex.sub(r"(?:vs\.) The\b", "vs. the", text)

    # Lowercase "de", "von", "le", as in "Charles de Gaulle", "Werner von Braun", and if not the first word
    text = regex.sub(r"(?<!^)\b(De|Von|Le)\b",
                     lambda result: result.group(1).lower(), text)

    # Fix html entities
    text = text.replace("&Amp;", "&amp;")

    # Lowercase etc.
    text = text.replace("Etc.", "etc.")

    return text
Example #3
0
def titlecase(text: str) -> str:
	"""
	Titlecase a string according to SE house style.

	The string is lowercased, passed through pip_titlecase(), and then a fixed sequence of
	adjustments is applied. The adjustments are order-dependent: each one operates on the
	output of the previous one.

	INPUTS
	text: The string to titlecase

	OUTPUTS
	A titlecased version of the input string
	"""

	def uppercase_if_roman_numeral(result):
		"""
		Replacement callback for Roman numeral candidates.

		Group 2 is uppercased only if roman.fromRoman() accepts it; otherwise the match is returned untouched.
		The exception is handled per match so that one invalid candidate (for example the word "Civil")
		doesn't prevent valid numerals elsewhere in the string from being uppercased.
		"""

		numeral = result.group(2).upper()

		try:
			roman.fromRoman(numeral)
		except roman.InvalidRomanNumeralError:
			return result.group(0)

		return result.group(1) + numeral + result.group(3)

	# For some reason, pip_titlecase() doesn't do anything if the string is mostly (but not all) uppercase.
	# For example "STOPPING BY WOODS ON a SNOWY EVENING" would not be changed by pip_titlecase()
	# So, convert to all lowercase first.
	text = text.lower()

	text = pip_titlecase(text)

	# We make some additional adjustments here

	# Lowercase HTML tags that titlecase might have screwed up. We just lowercase the entire contents of the tag, including attributes,
	# since they're typically lowercased anyway. (Except for things like `alt`, but we won't be titlecasing images!)
	text = regex.sub(r"<(/?)([^>]+?)>", lambda result: "<" + result.group(1) + result.group(2).lower() + ">", text)

	# Uppercase Roman numerals, but only if they are valid Roman numerals
	text = regex.sub(r"(\s)([ivxlcdm]+)(\s|$)", uppercase_if_roman_numeral, text, flags=regex.IGNORECASE)

	# Lowercase "and" and "or", even if preceded by punctuation
	text = regex.sub(r"([^\p{Letter}]) (And|Or)\b", lambda result: result.group(1) + " " + result.group(2).lower(), text)

	# pip_titlecase capitalizes *all* prepositions preceded by parenthesis; we only want to capitalize ones that *aren't the first word of a subtitle*
	# OK: From Sergeant Bulmer (of the Detective Police) to Mr. Pendril
	# OK: Three Men in a Boat (To Say Nothing of the Dog)
	text = regex.sub(r"\((For|Of|To)(.*?)\)(.+?)", lambda result: "(" + result.group(1).lower() + result.group(2) + ")" + result.group(3), text)

	# Uppercase words preceded by en or em dash
	text = regex.sub(fr"([—–]{se.WORD_JOINER}?)([\p{{Lowercase_Letter}}])", lambda result: result.group(1) + result.group(2).upper(), text)

	# Lowercase "and", if it's not the very first word
	# NOTE(review): this comment previously also said "and not preceded by an em-dash", but the pattern only
	# excludes the start of the string, so an "And" directly after a dash is lowercased too. Confirm which is intended.
	text = regex.sub(r"(?<!^)\bAnd\b", r"and", text)

	# Lowercase "in", if followed by a semicolon (but not words like "inheritance")
	text = regex.sub(r"\b; In\b", "; in", text)

	# Lowercase th', sometimes used poetically
	text = regex.sub(r"\b Th’ \b", " th’ ", text)

	# Uppercase words that begin compound words, like "to-night" (which might appear in poetry)
	text = regex.sub(r" ([\p{Lowercase_Letter}])([\p{Lowercase_Letter}]+\-)", lambda result: " " + result.group(1).upper() + result.group(2), text)

	# Lowercase "from", "with", as long as they're not the first word and not preceded by a parenthesis
	text = regex.sub(r"(?<!^)(?<!\()\b(From|With)\b", lambda result: result.group(1).lower(), text)

	# Capitalise the first word after an opening quote or italicisation that signifies a work
	text = regex.sub(r"(‘|“|<i.*?epub:type=\".*?se:.*?\".*?>)([\p{Lowercase_Letter}])", lambda result: result.group(1) + result.group(2).upper(), text)

	# Lowercase "the" if preceded by "vs."
	text = regex.sub(r"(?:vs\.) The\b", "vs. the", text)

	# Lowercase "de", "von", "van", "le", as in "Charles de Gaulle", "Werner von Braun", etc., and if not the first word and not preceded by an &ldquo;
	text = regex.sub(r"(?<!^|“)\b(De|Von|Van|Le)\b", lambda result: result.group(1).lower(), text)

	# Uppercase word following "Or,", since it is probably a subtitle
	text = regex.sub(r"\bOr, ([\p{Lowercase_Letter}])", lambda result: "Or, " + result.group(1).upper(), text)

	# Uppercase word following ":", except "or, ", which indicates a kind of subtitle
	text = regex.sub(r": ([\p{Lowercase_Letter}])(?!r, )", lambda result: ": " + result.group(1).upper(), text)

	# Uppercase words after an initial contraction, like O'Keefe or L'Affaire. But only if there's at least 3 letters
	# after, to prevent catching things like I'm or E're
	text = regex.sub(r"\b([\p{Uppercase_Letter}]’)([\p{Lowercase_Letter}])([\p{Letter}]{2,})", lambda result: result.group(1) + result.group(2).upper() + result.group(3), text)

	# Uppercase letter after Mc
	text = regex.sub(r"\bMc([\p{Lowercase_Letter}])", lambda result: "Mc" + result.group(1).upper(), text)

	# Uppercase first letter after beginning contraction
	text = regex.sub(r"(\s|^)(’[\p{Lowercase_Letter}])", lambda result: result.group(1) + result.group(2).upper(), text)

	# Uppercase first letter
	# (The character class needs both brackets; with the opening one missing, the pattern would only match a lowercase letter followed by a literal "]")
	text = regex.sub(r"^([\p{Lowercase_Letter}])", lambda result: result.group(1).upper(), text)

	# Lowercase 'by'
	# NOTE(review): the `%` alternative looks like it might be a typo for `$`, but matching at end-of-string would also
	# lowercase a title's final "By". Left unchanged; confirm the intent.
	text = regex.sub(r"(\s)By(\s|%)", lambda result: result.group(1) + "by" + result.group(2), text)

	# Lowercase leading "d', as in "Marie d'Elle", and uppercase the letter that follows it
	text = regex.sub(r"(?:\b|^)D’([\p{Letter}])", lambda result: "d’" + result.group(1).upper(), text)

	# Uppercase some known initialisms
	# ("bc" and "ad" are only matched after whitespace, so not at the very start of the string)
	text = regex.sub(r"(\s|^)(sos|md)(?:\b|$)", lambda result: result.group(1) + result.group(2).upper(), text, flags=regex.IGNORECASE)
	text = regex.sub(r"(\s)(bc|ad)(?:\b|$)", lambda result: result.group(1) + result.group(2).upper(), text, flags=regex.IGNORECASE)

	# Lowercase À (as in À La Carte) unless it's the first word
	text = regex.sub(r"(?<!^)\bÀ\b", "à", text)

	# Uppercase initialisms (one or more single letters each followed by a period, after whitespace)
	text = regex.sub(r"(\s)(([\p{Letter}]\.)+)", lambda result: result.group(1) + result.group(2).upper(), text)

	# Fix html entities
	text = text.replace("&Amp;", "&amp;")

	# Lowercase etc.
	text = text.replace("Etc.", "etc.")

	return text