/
botcmd.py
112 lines (88 loc) · 2.43 KB
/
botcmd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import urllib
import re
import threading
import HTMLParser
# Command names this module responds to; execcmd() below handles the one
# listed here.  NOTE(review): presumably read by the bot's console
# dispatcher -- confirm against the loader.
concmd = ['/load_blacklist']
# Held while `blacklist` is being rebuilt (load_blacklist) or scanned (parse).
blacklist_lock = threading.Lock()
# Compiled regular expressions for URLs to ignore; None until
# load_blacklist() has run.
blacklist = None
# Turns HTML character references (e.g. "&amp;") into the characters they
# stand for; used by unhtmlize().
html_unescape = HTMLParser.HTMLParser().unescape
def load_blacklist():
    """Reload the URL blacklist from ``blacklist.txt``.

    Every non-empty line of the file is taken as a regular expression and
    compiled anchored between ``^`` and ``$``.  The complete list is built
    first and only then stored in the module-level ``blacklist`` while
    ``blacklist_lock`` is held.

    Raises:
        IOError: if ``blacklist.txt`` cannot be opened or read.
        re.error: if a line is not a valid regular expression.

    When either exception is raised the lock is not left held and the
    previous value of ``blacklist`` is kept.
    """
    global blacklist
    patterns = []
    # ``with`` closes the file even if a pattern fails to compile.
    with open("blacklist.txt", 'r') as f:
        for line in f:
            # Drop the line terminator.  '\r' is stripped as well so that a
            # file saved with CRLF endings does not produce patterns that can
            # never match a URL.
            line = line.rstrip('\r\n')
            if line != '':
                patterns.append(re.compile('^' + line + '$'))
    # Swap the finished list in under the lock; ``with`` guarantees the lock
    # is released again, so readers can never be blocked forever.
    with blacklist_lock:
        blacklist = patterns
def matchprotocol(string, protocol):
    """Return True when ``string`` begins with ``protocol``, else False."""
    return string.startswith(protocol)
def getdomain(string):
    """Return the part of a URL between ``://`` and the next ``/``.

    A URL without any path (for example ``http://example.com``) yields
    everything after ``://`` instead of raising.

    Raises:
        ValueError: if ``string`` does not contain ``://``.
    """
    rest = string[string.index('://') + len('://'):]
    # split() returns the whole remainder when there is no '/', which covers
    # bare-host URLs that str.index('/') would reject.
    return rest.split('/', 1)[0]
def geturls(message):
    """Collect the whitespace-separated words of ``message`` that are URLs.

    A word counts as a URL when it starts with ``http://`` or ``https://``.
    Words are returned in their original order.
    """
    schemes = ('http://', 'https://')
    # TODO: add support for other separators
    return [word for word in message.split()
            if any(matchprotocol(word, scheme) for scheme in schemes)]
def unhtmlize(string):
    """Flatten an HTML text fragment into a single line of plain text.

    Newlines and tabs become spaces, runs of spaces are collapsed to one
    space, and HTML character references are replaced by the characters they
    stand for.

    ``string`` is a UTF-8 encoded byte string and so is the result.  Bytes
    that are not valid UTF-8 are replaced with U+FFFD instead of raising
    UnicodeDecodeError, so a page in another encoding still yields a title.
    """
    flattened = string.replace('\n', ' ').replace('\t', ' ')
    # Collapse every run of two or more spaces in one pass.  (Replacing a
    # single space with a single space in a ``while ' ' in ...`` loop can
    # never make progress and would spin forever.)
    flattened = re.sub(' {2,}', ' ', flattened)
    return html_unescape(flattened.decode('utf-8', 'replace')).encode('utf-8')
def gettitle(f):
    """Read all of ``f`` and return the cleaned-up text of its title.

    The title is whatever follows the first literal ``<title>`` up to the
    next ``</title>``, or up to the end of the data when there is no closing
    tag.  The text is passed through unhtmlize().  Returns None when the
    data contains no ``<title>``.
    """
    _, opening, remainder = f.read().partition('<title>')
    if not opening:
        return None
    # partition() hands back the whole remainder when '</title>' is absent.
    raw_title = remainder.partition('</title>')[0]
    return unhtmlize(raw_title)
def sanitize(string):
    """Remove control characters (codes 0-31) and trim surrounding whitespace.

    This includes carriage returns and line feeds, which fall inside that
    range.
    """
    printable = ''.join(ch for ch in string if ord(ch) >= 32)
    return printable.strip()
def parse(args):
    """Announce the page title of every HTTP(S) URL found in a PRIVMSG.

    ``args`` is a ``(line, irc)`` pair, passed as a single tuple argument:

    * ``line`` -- one protocol line as a string.  It is split on single
      spaces; field 0 supplies the sender (text before ``!`` minus the first
      character), field 1 the command, field 2 the target and fields 3..
      the message (minus the first character of field 3).
      NOTE(review): this matches the usual
      ``:nick!user@host PRIVMSG target :text`` layout -- confirm with the
      caller.
    * ``irc`` -- object providing ``msg(target, text)``, used for replies.

    Replies go to the target when it starts with ``#``, otherwise to the
    sender.  Each reply is prefixed with a zero-width space, and incoming
    messages that start with one are ignored (presumably so bots using the
    same convention do not answer each other).

    Lines that are not PRIVMSG, or that have fewer than four fields, are
    ignored.  URLs matching the blacklist, URLs that cannot be opened, pages
    that are not ``text/html`` and pages without a ``<title>`` are skipped.
    """
    line, irc = args
    zwsp = '\xe2\x80\x8b'  # UTF-8 encoding of U+200B ZERO WIDTH SPACE
    line = line.replace('\x01', '').split(' ')
    # Check the shape first so short lines cannot raise IndexError below.
    if len(line) < 4 or line[1] != 'PRIVMSG':
        return
    nick = line[0].split('!')[0][1:]
    chan = line[2] if line[2].startswith('#') else nick
    message = ' '.join([line[3][1:]] + line[4:])
    if message.startswith(zwsp):
        return
    for url in geturls(message):
        # Hold the lock only while scanning; ``with`` always releases it.
        with blacklist_lock:
            blacklisted = any(pattern.match(url) for pattern in blacklist)
        if blacklisted:
            continue
        # NOTE(review): the URL comes straight from a chat message and
        # gettitle() reads the whole response; consider restricting targets
        # and capping the amount read.
        try:
            f = urllib.urlopen(url)
        except IOError:
            continue
        try:
            if f.info().gettype() == 'text/html':
                title = gettitle(f)
                # gettitle() returns None when the page has no <title>;
                # there is nothing to announce in that case.
                if title is not None:
                    domain = sanitize(getdomain(url))
                    irc.msg(chan, zwsp + '%s: %s' % (domain, sanitize(title)))
        finally:
            # Close the connection even if reading or replying raised.
            f.close()
def execcmd(cmd):
    """Carry out a command; only ``/load_blacklist`` is recognised.

    ``cmd`` is an indexable sequence whose first element is the command
    name.  Any other name is silently ignored.
    """
    name = cmd[0]
    if name != '/load_blacklist':
        return
    load_blacklist()
# Load the blacklist once when the module is first executed, so `blacklist`
# is a list before parse() reads it.
load_blacklist()