-
Notifications
You must be signed in to change notification settings - Fork 0
/
url_shortener.py
534 lines (499 loc) · 21.2 KB
/
url_shortener.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
# Miscellaneous
import contextlib
import datetime
import functools
import os
import signal
import subprocess
import time
import traceback
# Multiprocessing
import multiprocessing
from queue import Empty as EmptyQueue
# Web server
import socket
import tornado.ioloop
import tornado.web
# URL parsing
import urllib.parse
import url_normalize
# Database
import sqlitedict
# The classes in this module implement the URL shortener web service.
# Below is a quick overview of these classes and their relations.
# - Service: This is the top-most class, storing the web application
# as a Tornado application + HTTP server.
# - MainHandler and RedirectHandler: These HTTP request handlers take care
# of the GET+POST requests on the root page, and the redirection (GET)
# requests on any subpage, respectively. They live on the Tornado
# application on the server object.
# - Database: Small class representing the two key-value stores.
# Lives on the service object.
# - Encoder: General class for implementing numeric encodings,
# i.e. transformations between integers and their str representation
# in another base. Used to map integers to alphanumeric strs
# (short URLs). Lives on the service object.
# - LruCache: Simple `last recently used` cache, relying on dicts bein
# insertion ordered hash maps (CPython 3.6+, Python 3.7+). This is used
# to cache communicating between server processes and the
# database processes (the normal functools.lru_cache can not be used
# with coroutines). Lives on the service object.
# Finally, this module defines a single function, start_service,
# which can be used to start the web service from another Python script.
# Alternatively, you may run this file as a script, in which case this
# function is called automatically.
# Last recently used cache, used to cache the
# database lookups on the server processes.
class LruCache:
def __init__(self, capacity=1024, data=None):
self.capacity = capacity
if data is None:
data = {}
self.data = dict(data)
def get(self, key):
try:
val = self[key]
except KeyError:
return None
return val
def __getitem__(self, key):
val = self.data[key]
# If here, lookup was succesfull.
# Reinsert the item, so that it appears at the top.
self.data.pop(key)
self.data[key] = val
return val
def __setitem__(self, key, val):
# Pop if key already exists,
# so that when inserted it appears at the end.
try:
self.data.pop(key)
except KeyError:
pass
# Pop first used if full
if len(self) == self.capacity:
self.data.pop(next(iter(self.data)))
self.data[key] = val
def __len__(self):
return len(self.data)
# Class handling numeric encoding of numbers in any base
class Encoder:
def __init__(self, alphabet):
self.alphabet = alphabet
self.base = len(self.alphabet)
def encode(self, number):
digits = []
while number:
number_new = number//self.base
remainder = number - number_new*self.base
number = number_new
digits.append(remainder)
if not digits:
digits = [0]
return ''.join(self.alphabet[digit] for digit in reversed(digits))
# Class representing the sqlite key-value database
class Database:
def __init__(self, filename):
# The database consists of two files, one for mapping long
# URLs to short URLs, and one for the reverse mapping.
if '.' in filename:
left, dot, right = filename.rpartition('.')
else:
left, dot, right = filename, '', ''
self.filenames = {
'long2short': f'{left}_l2s{dot}{right}',
'short2long': f'{left}_s2l{dot}{right}',
}
# Context manager for opening a table within the database
@contextlib.contextmanager
def open(self, mapping, *args, **kwargs):
with sqlitedict.SqliteDict(self.filenames[mapping], *args, **kwargs) as table:
# Yield control (and the table) back to the caller
yield table
# Commit changes to disk before closing file
table.commit()
# The main HTML request handler, responsible for handling
# GET and POST requests to the root page.
class MainHandler(tornado.web.RequestHandler):
# At initialization time, store a reference to the top-level
# service object, from which everything else is reachable.
def initialize(self, service):
self.service = service
# Method for printing a message at each request
def verbose_message(self, status):
date = datetime.datetime.now().strftime('%d/%b/%Y %H:%M:%S')
print(
f'[{self.request.remote_ip}'
f' - {date}'
f' - {self.request.method.ljust(4)}'
f' - {self.request.version}'
f' - {status}]'
)
# Method quite similar to the builtin send_error,
# though with a custom message.
async def send_error_emulate(self, status, message=''):
self.clear()
self.set_status(status)
self.finish(f'<html><body><b>{status}</b><br>{message}</body></html>')
# Method for generating HTML responses
@functools.lru_cache(1024)
def generate_html(self, url_long='', url_short=''):
"""Without arguments, this returns the HTML for the
default root page. With arguments, additional HTML is added,
showing the result of the query (url_short) as well as the
query itself (url_long).
You should never pass in only one of the URLs.
"""
result = []
if url_short:
result = [
f'Shortened URL:<br>'
f'<a id="url_short" href="{url_short}">{url_short}</a><br>',
f'<button onClick="copyToClipboard(\'url_short\')">Copy to clipboard</button>',
f'<script>',
f'function copyToClipboard(id) {{',
f' var from = document.getElementById(id);',
f' var range = document.createRange();',
f' window.getSelection().removeAllRanges();',
f' range.selectNode(from);',
f' window.getSelection().addRange(range);',
f' document.execCommand(\'copy\');',
f' window.getSelection().removeAllRanges();',
f'}}',
f'</script>',
]
# Return HTML as single str
html = '\n'.join([
# Concatenate default HTML response with potential result
f'<html>',
f'<body>',
f' <h2>URL shortener!</h2>',
f' Long URL goes here:<br>'
f' <form method="POST">',
f' <input type="text" value="{url_long}" name="url_long" size="64" /><br>',
f' <input type="submit" value="Submit" />',
f' </form>',
*result,
f'</body>',
f'</html>',
])
return html
# Method for handling GET requests,
# i.e. sending back the default root page.
async def get(self):
if self.service.verbosity > 1:
self.verbose_message(200)
await self.finish(self.generate_html())
# Method for handling POST requests,
# i.e. accepting a long URL and sending back
# the corresponding short URL.
async def post(self):
# Extract long URL
url_long = self.get_body_argument('url_long')
if not url_long:
# Empty url_long. Send back default HTML.
if self.service.verbosity > 1:
self.verbose_message(200)
self.finish(self.generate_html())
return
# Long URL received. Canonicalize it.
try:
url_long = url_normalize.url_normalize(url_long)
except UnicodeError:
await self.send_error_emulate(400, f'Failed to interpret URL "{url_long}"')
return
# Get short URL and send back response
url_short = await self.service.lookup(url_long, tablename='long2short')
# Assemble full version of short URL
url_short = f'{self.service.get_host_url(self.request)}/{url_short}'
if self.service.verbosity > 1:
self.verbose_message(200)
self.finish(self.generate_html(url_long, url_short))
# HTML request handler responsible for handling redirections,
# i.e. GET request to pages deeper than the root.
class RedirectHandler(MainHandler):
async def get(self, path):
# Browsers may ask for a favicon.ico. We do not have any.
if path == 'favicon.ico':
await self.send_error_emulate(404, 'No favicon available')
return
# The path should be a short URL. Look it up in the database.
url_long = await self.service.lookup(path, tablename='short2long')
if url_long:
# Long URL retrieved from database. Redirect.
status = 307
if self.service.verbosity > 1:
self.verbose_message(status)
self.redirect(url_long, status=status)
return
# Database lookup failed
await self.send_error_emulate(404, f'Destination not found: "{path}"')
# Class representing the web service.
# It has the Tornado server and application as attributes.
class Service:
def __init__(self, address=None, port=None, database=None, nprocs=None, verbosity=None):
# Store the supplied specifications.
# If not supplied, retrieve from environment.
# If not in environment, use sensible defaults.
if address is None:
address = os.environ.get('address', 'localhost')
if port is None:
port = int(os.environ.get('port', 8000))
if database is None:
database = os.environ.get('database', 'db.sqlite')
if nprocs is None:
nprocs = int(os.environ.get('nprocs', 4))
if verbosity is None:
verbosity = int(os.environ.get('verbosity', 1))
self.address = address
self.port = port
self.database_filename = database # Note the name change
self.nprocs = nprocs
self.verbosity = verbosity
if nprocs < 1:
raise ValueError('You must have at least nprocs = 1 server processes')
# The process ID of the process used to start the service.
# This is used in the stop() method.
self.master_pid = os.getpid()
# Instantiate Encoder instance, mapping non-negative integers
# to alphanumeric strs.
self.encoder = Encoder(''.join(
[chr(i) for i in range(48, 58)] # 0-9
+ [chr(i) for i in range(65, 91)] # A-Z
+ [chr(i) for i in range(97, 123)] # a-z
))
# Instantiate database and caches
self.database = Database(self.database_filename)
self.database_cache = {'long2short': LruCache(), 'short2long': LruCache()}
# Tuples of multiprocessing queues, for communicating
# back and forth between server processes and the
# database process, one queue for each server process.
# We can use either multiprocessing.Queue or
# multiprocessing.Manager.Queue(). For some reason,
# using a manager is faster. The manager creation itself
# however forks off a process, which does not play nice
# with the Tornado multiprocessing. Hence the ugly hack
# in the stop() method.
manager = multiprocessing.Manager()
self.queues_server2db = tuple(manager.Queue() for _ in range(self.nprocs))
self.queues_db2server = tuple(manager.Queue() for _ in range(self.nprocs))
# Integer ID for the server process.
# This attribute is local to each process.
self.server_process_id = None
# Method for starting the web server using multiprocessing.
# This should be called from a single process.
def start(self):
if self.verbosity > 0:
print(
f'Starting URL shortener web service, using\n'
f' address = {self.address}\n'
f' port = {self.port}\n'
f' database = {self.database_filename}\n'
f' nprocs = {self.nprocs}\n'
f' verbosity = {self.verbosity}\n'
,
end='',
flush=True,
)
# Spin up Tornado server
self.application = tornado.web.Application(
[
(r'^/$', MainHandler, dict(service=self)),
(r'/(.+)', RedirectHandler, dict(service=self)),
],
debug=False,
)
self.server = tornado.httpserver.HTTPServer(self.application)
self.server.listen(self.port, self.address)
# We want to let self.nprocs concurrent process work as
# server processs, while 1 process takes care of the database.
# In order for the multiprocessing module to play nicely with
# the multiprocessing of Tornado, the database process should
# be forked from a server process. We thus fork off
# self.nprocs server processes, each of which is given an
# integer ID. The one with an ID of zero will be responsible
# for forking off and joining the database process.
# Note that this zero ID process is different from the
# master process; the original Python process.
queue = multiprocessing.Queue()
for i in range(self.nprocs):
queue.put(i)
self.server.start(self.nprocs)
self.server_process_id = queue.get()
if self.server_process_id == 0:
self.database_process = multiprocessing.Process(target=self.handle_database)
self.database_process.start()
# Start of event loop
tornado.ioloop.IOLoop.current().start()
# Method for stopping the running service.
# This should be called by all processes spawned by the server.
def stop(self):
# Let the master process do the printing
if self.verbosity > 0 and os.getpid() == self.master_pid:
print('\nStopping URL shortener web service', flush=True)
# The zero ID process joins back the database process
if self.server_process_id == 0:
self.database_process.join()
self.database_process.close()
# Stop server and event loop
self.server.stop()
ioloop = tornado.ioloop.IOLoop.instance()
ioloop.add_callback(ioloop.stop)
# In a perfect world, all forked off subprocesses should now
# be joined. However, our use of multiprocessing from both
# Tornado and Python somehow leaves child processes still
# running, which will be wrongly joined at Python exit,
# leading to uncatchable exceptions. It is no big deal,
# but the large tracebacks are annoying.
# As a very hacky solution, we send all processes to sleep
# except the master process, i.e. the one originally
# responsible for launching the service, and thus a common
# ancestor to all other running processes. This process then
# gets the process ID of all its descendants (through a hacky
# call to the system ps command), and then asks the system to
# kill these.
if os.getpid() != self.master_pid:
time.sleep(1)
return # Ought to be killed before reaching here
pids = sorted(
int(line.strip().split()[0].strip())
for line in subprocess.check_output(
f'ps -o pid,cmd -g $(ps -o sid= -p {self.master_pid})', shell=True,
).decode().split('\n')
if line.strip() and 'python' in line
)
for pid in pids:
if pid > self.master_pid:
try:
os.kill(pid, signal.SIGTERM)
os.kill(pid, signal.SIGKILL)
except ProcessLookupError:
pass
# Method for getting the URL of the host.
# This is really already known information,
# except when 'localhost' is used for the address.
def get_host_url(self, request, cache=[]):
if cache:
# One-element cache for performance
return cache[0]
# Construct URL including protocol
hostname = urllib.parse.urlparse(
f'{request.protocol}://{request.host}'
).hostname
address = socket.gethostbyname(hostname)
url = f'{request.protocol}://{address}'
# Append port
if ':' in request.host:
url += ':' + request.host.split(':')[-1]
cache.append(url)
return self.get_host_url(request)
# Method returning the short URL matching the integer n
def short_url_from_int(self, n):
return self.encoder.encode(n)
# Method in which the database process is trapped while the service
# is running. It communicates with the server processes using
# multiprocessing queues.
def handle_database(self):
# Open both database files simultaneously.
# These wll be held open until the service shuts down.
with self.database.open('long2short') as long2short, \
self.database.open('short2long') as short2long:
# Commit changes to both files periodically
t_between_commits = 1 # In seconds
t_last_commit = time.time()
# As this function is run through multiprocessing.Process,
# exception handling in outer functions do not apply.
try:
while True:
for queue_server2db, queue_db2server in zip(
self.queues_server2db, self.queues_db2server,
):
try:
tablename, url = queue_server2db.get(block=False)
# Database request received
if tablename == 'long2short':
# Get short URL from long URL. If not
# present in the database, add it.
url_result = long2short.get(url)
if url_result is None:
url_result = long2short[url] = self.short_url_from_int(len(long2short))
# Also add reverse mapping
short2long[url_result] = url
elif tablename == 'short2long':
# Get long URL from short URL.
# Nevermind if not present in database.
url_result = short2long.get(url)
queue_db2server.put(url_result)
except EmptyQueue:
pass
# If it is time, do commit
t_now = time.time()
if t_now - t_last_commit > t_between_commits:
short2long.commit()
long2short.commit()
t_last_commit = t_now
except KeyboardInterrupt:
pass
except Exception:
traceback.print_exc()
# Property methods for easily getting the queue
# belonging to the running server process.
@property
def queue_server2db(self):
return self.queues_server2db[self.server_process_id]
@property
def queue_db2server(self):
return self.queues_db2server[self.server_process_id]
# Method for looking up URLs in the database
async def lookup(self, url, tablename):
# Look for result in cache
cache = self.database_cache[tablename]
result = cache.get(url)
if result is not None:
return result
# Not found in cache. Query the database process.
item = (tablename, url)
self.queue_server2db.put(item)
result = self.queue_db2server.get()
cache[url] = result
return result
# The only top level function,
# used to run the web service.
def start_service(address=None, port=None, database=None, nprocs=None, verbosity=None,
*, block=True):
def start():
service = Service(
address=address,
port=port,
database=database,
nprocs=nprocs,
verbosity=verbosity,
)
# Encapsulate running service, ensuring proper shutdown
try:
service.start()
except KeyboardInterrupt:
pass
except Exception:
traceback.print_exc()
finally:
service.stop()
if block:
# Start the service directly.
# This will block until the service is stopped.
start()
else:
# Start the service in a new process.
# A function is returned, which when called stops the service.
process = multiprocessing.Process(target=start)
process.start()
time.sleep(1) # Let service initialize completely
return lambda sig=signal.SIGINT: os.kill(process.pid, sig)
# Run web service if invoked as a script,
# using environment or default specifications.
if __name__ == '__main__':
# Ensure installation of signal handling for SIGINT (Ctrl+C)
# (needed if launched from bash as a background process).
signal.signal(signal.SIGINT, signal.default_int_handler)
# Start service in default (blocking) mode
start_service()