/
sanitise.py
executable file
·65 lines (50 loc) · 1.93 KB
/
sanitise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python3
# Copyright 2011 Tom Vincent <http://www.tlvince.com/contact/>
"""Sanitise the given string(s) into (a subset of) ASCII."""
import argparse
import string
import unicodedata
import re
import random
def removeAccents(str):
    """Strip accents and other combining marks from the given text.

    The text is decomposed with Unicode NFKD normalisation, which splits an
    accented character into its base character followed by combining marks
    (and also expands compatibility characters such as ligatures). Every
    combining character is then dropped and the remainder is joined back up.

    See: http://stackoverflow.com/questions/517923/
    """
    decomposed = unicodedata.normalize("NFKD", str)
    # unicodedata.combining() is 0 for ordinary (non-combining) characters.
    base_chars = (ch for ch in decomposed if unicodedata.combining(ch) == 0)
    return "".join(base_chars)
def regexSanitise(str):
    """Perform detailed sanitising substitutions using regex.

    The substitutions below are applied in order, each one operating on the
    result of the previous one, and the final string is returned.
    """
    # Ordered (pattern, replacement) pairs. The patterns are raw strings so
    # that regex escapes (a backslash followed by a dot) are passed to ``re``
    # untouched instead of being treated as invalid string escapes.
    substitutions = (
        (r"&", "and"),  # Replace ampersand with a safe string
        (r"( |_)", "-"),  # See: http://webmasters.stackexchange.com/q/374
        # Flatten a series of two or more dots or dashes; a mixed run is
        # replaced by its last character (the final capture of the group).
        (r"(\.|-){2,}", r"\1"),
        (r"^-", ""),  # Remove a leading dash
        (r"(-$|\.$)", ""),  # Remove a trailing dash or dot
    )
    for pattern, replacement in substitutions:
        str = re.sub(pattern, replacement, str)
    return str
def sanitise(str):
    """Perform substitutions and return the string.

    The input is lowercased, stripped of accents, passed through the regex
    substitutions and finally reduced to the permitted character set. If
    nothing survives, a placeholder of the form ``untitled-`` followed by six
    distinct random letters/digits is returned instead.
    """
    # Permit only letters, digits, dash (separator) and dot (file extension)
    valid = string.ascii_lowercase + string.digits + "-."

    cleaned = regexSanitise(removeAccents(str.lower()))
    cleaned = "".join(ch for ch in cleaned if ch in valid)

    # Dropping invalid characters can push separators together or expose one
    # at either end (e.g. "foo - !" would otherwise become "foo-"), so the
    # separator clean-up has to run again on the filtered text.
    cleaned = regexSanitise(cleaned)

    if not cleaned:
        # valid[:-2] excludes the dash and dot, leaving letters and digits.
        cleaned = "untitled-" + "".join(random.sample(valid[:-2], 6))
    return cleaned
def parseArguments(args=None):
    """Parse the command-line arguments.

    Args:
        args: Optional list of argument strings to parse. When None (the
            default) argparse reads the arguments from ``sys.argv``.

    Returns:
        The argparse namespace; its ``strings`` attribute is the list of one
        or more strings to sanitise.
    """
    # __doc__ is None when docstrings are stripped (python -OO); fall back to
    # no description rather than failing on None.split().
    description = __doc__.split("\n")[0] if __doc__ else None
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("strings", nargs="+", help="the string(s) to sanitise")
    return parser.parse_args(args)
def main():
    """Start execution of sanitise.

    Parses the command line and prints the sanitised form of each given
    string on its own line.
    """
    args = parseArguments()
    # A plain loop: the printing is a side effect, so there is no reason to
    # build a list of the None values that print() returns.
    for s in args.strings:
        print(sanitise(s))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()