This repository has been archived by the owner on Nov 5, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
paste_grabber.py
executable file
·228 lines (198 loc) · 7.67 KB
/
paste_grabber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#!/usr/bin/env python
import itertools as it, operator as op, functools as ft
from glob import glob
from fnmatch import fnmatch
from hashlib import sha1
from io import open
import os, sys, re, logging
from twisted.internet import inotify, reactor, protocol, defer
from twisted.internet.utils import getProcessOutputAndValue
from twisted.web.client import downloadPage
from twisted.python.filepath import FilePath
from twisted.python import log
class PasteGrabber(object):
    '''Watches IRC log files (via inotify) for "!pq <link>" patchbot requests,
        downloads each linked patch into dst_path and commits/pushes it via git.
        NOTE: __init__ deliberately raises NotImplementedError (see message there) —
        the watch-set calculation below it is known-buggy upstream.'''

    @staticmethod
    def file_end_mark(path, size=200, pos=None, data=None):
        '''Return an end-of-file marker tuple (pos, size, data):
            byte offset of the end of file, length of the sampled tail
            and sha1 hexdigest of that tail.
            If "data" is given, it is used as the tail sample as-is;
            otherwise the last min(filesize, size) bytes are read from path.'''
        if not pos:
            with path.open() as src:
                if not data:
                    pos = None
                    while pos != src.tell(): # to ensure that file didn't grow in-between
                        pos = os.fstat(src.fileno()).st_size
                        src.seek(-min(pos, size), os.SEEK_END)
                        data = src.read()
                else:
                    # bugfix: os.fstat() takes an integer file descriptor, not a
                    # file object — passing "src" directly raised TypeError.
                    # This branch is the one hit by the handle_change() call site.
                    pos = os.fstat(src.fileno()).st_size
        size, data = len(data), sha1(data).hexdigest()
        return pos, size, data

    @staticmethod
    def file_end_check(path, pos, size=None, data=None):
        '''Check whether file at path still ends exactly as recorded by
            a (pos, size, data) marker from file_end_mark().
            Returns False if the size changed or the tail hash differs.'''
        if pos != path.getsize(): return False
        elif size and data:
            with path.open() as src:
                src.seek(-size, os.SEEK_END)
                if sha1(src.read()).hexdigest() != data: return False
        return True

    def __init__(self, path_mask, dst_path):
        '''path_mask: glob pattern of log files/dirs to watch.
            dst_path: git working-dir to download patches into.'''
        self.dst_path = FilePath(dst_path)
        # paths_pos: path -> (pos, size, data) end-marker of last-processed point
        # paths_watch: dir -> set of basename fnmatch-patterns to react to
        # paths_buff: path -> pending (incomplete) line buffer
        paths_pos = self.paths_pos = dict()
        paths_watch = self.paths_watch = dict()
        self.paths_buff = dict()

        self.notifier = inotify.INotify()
        self.notifier.startReading()

        # Deliberate guard left by the author — code below is unreachable.
        raise NotImplementedError('''paths_watch calculation code below is
known to have simple bug, which prohibits processing of any new paths matching
the specified masks.
Fixed version can be found in (and copy-pasted from) bordercamp-irc-bot
project "logtail" relay:
https://github.com/mk-fg/bordercamp-irc-bot/blob/master/bordercamp/relays/logtail.py
This error is here because I don't use this script atm and don't have time to
test the code after copy-pasting right now, mostly as a reminder.''')

        for path in it.imap(FilePath, glob(path_mask)):
            path_real = path.realpath()
            # Matched regular files are watched as a basename pattern in the dir
            if path_real.isfile():
                path_dir = path.parent().realpath()
                if path_dir not in paths_watch:
                    paths_watch[path_dir] = {path.basename()}
                else: paths_watch[path_dir].add(path.basename())
            # All files in the matched dirs are watched, non-recursively
            elif path_real.isdir():
                if path_real not in paths_watch: paths_watch[path_real] = {'*'}
                else: paths_watch[path_real].add('*')
                # NOTE(review): loop below computes path_child but never uses it —
                # looks like part of the known-incomplete code this class aborts on.
                for name in path_real.listdir():
                    path_child = path_real.child(name).realpath()
            # Specials of any kind are ignored
            else: log.debug('Skipping non-file/dir path: {}'.format(path_real))

        for path in paths_watch:
            log.debug('Adding watcher to path: {}'.format(path))
            self.notifier.watch( path,
                mask=inotify.IN_CREATE | inotify.IN_MODIFY,
                callbacks=[self.handle_change] )

    def handle_change(self, stuff, path, mask):
        '''inotify callback: filter events down to watched regular files,
            then read any new complete lines from the file and schedule
            handle_line() for each one. Partial trailing lines are kept
            in self.paths_buff until more data arrives.'''
        mask_str = inotify.humanReadableMask(mask)
        log.noise('Event: {} ({})'.format(path, mask_str))

        ## Filtering
        path_real = path.realpath()
        if not path_real.isfile():
            log.debug( 'Ignoring event for'
                ' non-regular file: {} (realpath: {})'.format(path, path_real) )
            return
        dir_key = path_real.parent().realpath()
        if dir_key not in self.paths_watch:
            log.warn( 'Ignoring event for file outside of watched'
                ' set of paths: {} (realpath: {})'.format(path, path_real) )
            return
        for pat in self.paths_watch[dir_key]:
            if fnmatch(bytes(path.basename()), pat): break
        else:
            log.noise( 'Non-matched path in one of'
                ' the watched dirs: {} (realpath: {})'.format(path, path_real) )
            return

        ## Get last position
        if self.paths_pos.get(path_real) is not None:
            pos, size, data = self.paths_pos[path_real]
            if self.file_end_check(path_real, pos, size=size, data=data):
                # End-marker still matches — spurious event, nothing new to read.
                log.debug(( 'Event (mask: {}) for unchanged'
                    ' path, ignoring: {}' ).format(mask_str, path))
                return
            if path_real.getsize() < pos:
                log.debug( 'Detected truncation'
                    ' of a path, rewinding: {}'.format(path) )
                pos = None
        else: pos = None

        ## Actual processing
        line = self.paths_buff.setdefault(path_real, '')
        with path_real.open('rb') as src:
            if pos:
                src.seek(pos)
                pos = None
            while True:
                buff = src.readline()
                if not buff: # eof
                    # Record where we stopped so spurious events can be skipped.
                    self.paths_pos[path_real] = self.file_end_mark(path_real, data=line)
                line += buff
                if line.endswith('\n'):
                    log.noise('New line (source: {}): {!r}'.format(path, line))
                    reactor.callLater(0, self.handle_line, line)
                    line = self.paths_buff[path_real] = ''
                else:
                    # Incomplete line (or eof) — stash it and stop reading.
                    line, self.paths_buff[path_real] = None, line
                    break

    @defer.inlineCallbacks
    def handle_line(self, line, repo_lock=defer.DeferredLock()):
        '''Process one complete log line: if it is a "!pq <link>" request,
            download the linked patch into dst_path and git add/commit/push it.
            repo_lock is a DELIBERATE mutable default — one shared lock
            serializing git operations across all concurrent calls.'''
        try:
            line = line.decode('utf-8', 'ignore').strip()
            match = re.search(r'(^|\s+)!pq\s+(?P<link>\S+)(\s+::\S+|$)', line)
            if not match:
                log.noise('Non-patchbot line, ignoring: {}'.format(line.encode('utf-8', 'ignore')))
                defer.returnValue(None)
            link = match.group('link').encode('ascii')
            if not re.search('https?://', link, re.IGNORECASE):
                log.warn('Incorrect non-http link, skipping: {}'.format(link))
                defer.returnValue(None)
        except UnicodeError as err:
            log.warn('Failed to recode line ({!r}): {}'.format(line, err))
            defer.returnValue(None)

        ## Grab the patch
        # Destination name is derived from the link hash, so the same paste
        # always maps to the same file.
        dst_base = '{}.patch'.format(sha1(link).hexdigest())
        dst_path = self.dst_path.child(dst_base)
        if dst_path.exists():
            log.debug( 'Patch already exists'
                ' (file: {}, link: {}), skipping'.format(dst_path, link) )
            defer.returnValue(None)
        # Not via tmpfile to prevent multiple downloads of the same paste
        try: yield downloadPage(link, dst_path.open('wb'), timeout=120)
        except:
            if dst_path.exists(): dst_path.remove()
            raise

        ## Commit into repo and push
        yield repo_lock.acquire()
        try:
            for cmd, check in [
                    (['add', dst_base], True),
                    # commit failures are tolerated (check=False), e.g. nothing to commit
                    (['commit', '-m', 'New patch: {}'.format(link)], False),
                    (['push'], True) ]:
                out, err, code = yield getProcessOutputAndValue(
                    '/usr/bin/git', cmd, path=self.dst_path.path )
                if check and code:
                    log.error('\n'.join([
                        'Failed to commit/push new patch into repo',
                        'Command: {}'.format(cmd), 'Exit code: {}'.format(code),
                        'Stdout:\n {}'.format('\n '.join(out.splitlines())),
                        'Stderr:\n {}'.format('\n '.join(err.splitlines())) ]))
                    break
            else: log.debug('Successfully pushed paste: {}'.format(link))
        finally: repo_lock.release()
if __name__ == '__main__':
    import argparse

    # Command-line interface.
    parser = argparse.ArgumentParser(
        description='Watch IRC logs from a specified path and download'
            ' all the zebrapig patchbot requests to a given dst_path.')
    parser.add_argument('path_mask',
        help='Glob pattern of IRC logs to watch (can be a dir or dir-glob).')
    parser.add_argument('dst_path',
        help='Dir to download all the patches to.')
    parser.add_argument('--debug',
        action='store_true', help='Verbose operation mode.')
    parser.add_argument('--noise',
        action='store_true', help='Even more verbose mode than --debug.')
    opts = parser.parse_args()

    # Register a custom NOISE level just below DEBUG, then pick verbosity
    # from the flags and bridge twisted logging into stdlib logging.
    logging.NOISE = logging.DEBUG - 1
    logging.addLevelName(logging.NOISE, 'NOISE')
    if opts.noise: level = logging.NOISE
    elif opts.debug: level = logging.DEBUG
    else: level = logging.WARNING
    logging.basicConfig(level=level)
    log.PythonLoggingObserver().start()

    # Attach per-level shortcuts (log.noise, log.debug, ..., log.warn) onto
    # the twisted log module, asserting none of the names are already taken.
    for spec in 'noise', 'debug', 'info', ('warning', 'warn'), 'error':
        name, alias = spec if isinstance(spec, tuple) else (spec, spec)
        assert not getattr(log, name, False)
        setattr(log, alias, ft.partial( log.msg,
            logLevel=logging.getLevelName(name.upper()) ))

    # Check permissions
    os.listdir(os.path.dirname(opts.path_mask))
    os.listdir(opts.dst_path)

    tailer = PasteGrabber(opts.path_mask, opts.dst_path)
    log.debug('Starting event loop')
    reactor.run()