-
Notifications
You must be signed in to change notification settings - Fork 0
/
storage.py
396 lines (336 loc) · 14.8 KB
/
storage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
from twisted.internet import reactor, protocol, defer
from twisted.enterprise import adbapi
from twisted.protocols.memcache import MemCacheProtocol, DEFAULT_PORT
from twisted.python import log
import traceback, urllib, time
import cache, http, mail, mc
class DataStore:
    "Element store that answers from a local cache and fetches misses from the backends"

    # Element type names (not referenced by the methods visible in this file)
    prefetch_dependent_elements = ['favorite', 'subscription', 'unread']
    prefetch_types = ['session']

    # Status codes
    # Responses with these statuses are never written to the cache
    uncacheable_status = [500, 502, 503, 504, 304, 307]
    # Requests using these methods are never cached or served from the cache
    uncacheable_methods = ['POST', 'PUT', 'DELETE']
    # Responses with these statuses may be cached for a short, fixed 30 seconds
    short_status = [404]
def __init__(self, config):
    """Set up the memcache, database, HTTP and cache backends from config.

    config is a dict-like object. This method reads the keys
    'backend_memcache' (comma-separated server list), 'backend_dbname',
    'backend_dbhost', 'backend_dbuser', 'backend_dbpass',
    'backend_webserver' ('host' or 'host:port') and 'cache_type'.

    A failure to set up the memcache or database backend is reported
    (log / mail) but does not abort construction; the matching attribute
    (self.proto / self.db) is left as None in that case.
    """
    self.config = config

    # Memcache Backend
    # Defined up front so the attribute exists even when setup fails.
    self.proto = None
    servers = config.get('backend_memcache').split(',')
    log.msg('Creating connections to backend_memcache servers %s...' % ','.join(servers))
    try:
        self.proto = mc.Mc(servers, pool_size=5)
        log.msg('backend_memcache OK')
    except Exception:
        log.msg('ERROR: Failed to connect to backend_memcache')
        log.msg(traceback.format_exc())

    # Database Backend
    self.db = None
    try:
        self.db = adbapi.ConnectionPool("pyPgSQL.PgSQL",
            database=config['backend_dbname'],
            host=config['backend_dbhost'],
            user=config['backend_dbuser'],
            password=config['backend_dbpass'],
            cp_noisy=True,
            cp_reconnect=True,
            cp_min=5,
            cp_max=20,
        )
        log.msg("Connected to db.")
    except ImportError:
        mail.error("Could not import PyPgSQL!\n%s" % traceback.format_exc())
    except Exception:
        mail.error("Unable to connect to backend database.\n%s" % traceback.format_exc())

    # HTTP Backend
    # Accept 'host:port'; anything that does not parse that way is used
    # verbatim as the host name with the default port 80.
    webserver = self.config['backend_webserver']
    try:
        host, port = webserver.split(':')
        self.backend_host, self.backend_port = host, int(port)
    except (ValueError, AttributeError):
        self.backend_host = webserver
        self.backend_port = 80

    # Cache Backend
    log.msg('Initializing cache...')
    cache_type = config['cache_type'].capitalize() + 'Cache'
    self.cache = getattr(cache, cache_type)(config)

    # Memorize variants of a uri
    self.uri_lookup = {}

    # Request pileup queue
    self.pending_requests = {}
# Init status
def dbConnected(self, db):
    "Log the successful connection and keep the database handle on self.db"
    log.msg('Database connection success.')
    self.db = db
# Main methods
def get(self, keys, request, force=False):
    """Return a deferred for a dict mapping each requested key to its value.

    keys may be a single key or a list of keys. With force set, the cache
    is not consulted and every key is handed to handleMisses as a miss.
    Failures anywhere in the chain are routed to getError.
    """
    key_list = keys if isinstance(keys, list) else [keys]
    if force:
        # A None value marks a miss, so every key is fetched again.
        result = self.handleMisses(dict.fromkeys(key_list), request)
    else:
        result = defer.maybeDeferred(self.cache.get, key_list)
        result.addCallback(self.handleMisses, request)
    result.addErrback(self.getError)
    return result
def delete(self, keys):
    "Delete a single key or a list of keys from the cache"
    self.cache.delete(keys if isinstance(keys, list) else [keys])
def flush(self):
    "Flush entire cache"
    backend = self.cache
    backend.flush()
def handleMisses(self, dictionary, request):
    """Process hits, check for validity, and fetch misses / invalids.

    dictionary maps element keys to cached values, with None meaning a
    miss. Each non-None value is checked with the matching valid_<type>
    method; misses and invalid values are re-fetched through fetch_<type>.
    Returns a deferred: either one that has already fired with dictionary
    (nothing to fetch) or a DeferredList whose callback, returnElements,
    merges the fetched values into dictionary.
    """
    pending = []
    missed_keys = []
    hit_keys = []

    for key, value in dictionary.items():
        if value is None:
            needs_fetch = True
        else:
            validator = getattr(self, 'valid_' + self.elementType(key))
            needs_fetch = not validator(request, self.elementId(key), value)
            if needs_fetch:
                log.msg('INVALID [%s]' % key)

        if not needs_fetch:
            hit_keys.append(key)
            continue

        fetcher = getattr(self, 'fetch_' + self.elementType(key))
        fetching = defer.maybeDeferred(fetcher, request, self.elementId(key))
        fetching.addErrback(self.fetchError, key)
        pending.append(fetching)
        missed_keys.append(key)

    if hit_keys:
        log.msg('HIT %s' % ', '.join(hit_keys))

    if not pending:
        return defer.succeed(dictionary)

    # Wait for all items to be fetched
    log.msg('MISS %s' % ', '.join(missed_keys))
    all_fetched = defer.DeferredList(pending)
    all_fetched.addCallback(self.returnElements, dictionary, missed_keys)
    return all_fetched
def returnElements(self, results, dictionary, missing_elements):
    """Merge freshly fetched values into dictionary and return it.

    results is a list of pairs whose second item is the fetched value
    (handleMisses passes the output of a DeferredList here); a single
    non-list result is treated as a one-item list. missing_elements holds
    the keys in the same order as results. dictionary is updated in place.
    """
    if not isinstance(results, list):
        results = [results]
    # Pair keys with results positionally instead of popping from the
    # front of the list, which was quadratic and emptied the caller's list.
    for key, outcome in zip(missing_elements, results):
        dictionary[key] = outcome[1]
    return dictionary
def fetchError(self, result, key):
    "Log a failed fetch_<type> call and substitute an empty dict as its value"
    element_type = self.elementType(key)
    element_id = self.elementId(key)
    log.msg('Error calling fetch_%s for key %s' % (element_type, element_id))
    message = result.getErrorMessage()
    log.msg(message.replace('\n', ' '))
    return {}
def getError(self, dictionary):
    """Errback for get(): log the failure and its traceback.

    Despite its name, dictionary is whatever the deferred chain passes to
    an errback (get() registers this method with addErrback). Returns
    None, so the failure is not propagated further.
    """
    log.msg('uh oh! %s' % dictionary)
    # No exception is being handled when an errback runs, so
    # traceback.print_exc() has nothing to print; ask the failure object
    # for its own traceback when it offers one.
    print_traceback = getattr(dictionary, 'printTraceback', None)
    if print_traceback is not None:
        print_traceback()
    else:
        traceback.print_exc()
# Hashing
def elementHash(self, request, element_type, element_id = None):
    "Hash function for elements: dispatches to hash_<element_type> (lower-cased)"
    hasher_name = 'hash_' + element_type.lower()
    hasher = getattr(self, hasher_name)
    return hasher(request, element_id)
# Element types must not contain '_' because it separates the type from the id in a key
def elementType(self, key):
    "Return the part of key before its first '_' (the whole key if there is none)"
    type_part, _, _ = key.partition('_')
    return type_part
def elementId(self, key):
    "Return the part of key after its first '_' ('' if there is none)"
    _, _, id_part = key.partition('_')
    return id_part
# Expirations
def hash_expiration(self, request, id):
    "Build the expiration key from the request uri with trailing '?' removed"
    uri = request.uri.rstrip("?")
    return 'expiration_' + uri
def fetch_expiration(self, request, id):
    "Expirations have nothing to fetch; the value is always False"
    return False
def valid_expiration(self, request, id, value):
    "A cached expiration value is always considered valid"
    return True
# Page
def hash_page(self, request, id=None, cookies=None):
    """Build the cache key for a page request.

    The key is 'page_' + host + uri, where host is the 'x-real-host'
    header, else the 'host' header, else '', and uri has any '#fragment'
    and trailing '?' removed. Optional suffixes, each introduced by '//':

    - the first language of the accept-language header (or the configured
      'hash_lang_default'), lower-cased, when config 'hash_lang_header'
      is set;
    - 'name=value' pairs, comma-joined, for each name in cookies that has
      a value on the request.

    id is accepted for signature parity with the other hash_* methods and
    is not used.
    """
    # Hash the request key
    host = request.getHeader('x-real-host') or request.getHeader('host') or ''
    key = 'page_' + host + request.uri.split('#')[0].rstrip("?")

    # Internationalization salt
    if self.config.get('hash_lang_header'):
        header = request.getHeader('accept-language') or self.config.get('hash_lang_default', 'en-us')
        if header:
            try:
                lang = header.replace(' ', '').split(';')[0].split(',')[0].lower()
            except Exception:
                # Best effort: a malformed header just leaves the salt off.
                traceback.print_exc()
            else:
                key += '//' + lang

    # Grab the cookies we care about from the request
    found_cookies = []
    for cookie in cookies or ():
        # Callers build the list with ''.split(','), which yields [''] when
        # the header is absent; an empty name is never a real cookie.
        if not cookie:
            continue
        val = request.getCookie(cookie)
        if val:
            found_cookies.append('%s=%s' % (cookie, val))

    # Update key based on cookies we care about
    if found_cookies:
        key += '//' + ','.join(found_cookies)
    return key
def fetch_page(self, request, id, ignoreResult=False):
    """Request a page from the HTTP backend.

    Returns the deferred of the backend request, chained through
    extract_page (success) and page_failed (failure), and records it in
    self.pending_requests under the request's page key. When ignoreResult
    is set and a request for the same key is already pending, no new
    request is made and True is returned instead.
    """
    # Prevent identical request pileup
    pending_key = self.hash_page(request)
    if ignoreResult and pending_key in self.pending_requests:
        log.msg('PENDING: Request is already pending for %s' % request.uri)
        return True

    # Tell backend that we are Twice and strip cache-control headers
    twice_header = self.config.get('twice_header')
    request.setHeader(twice_header, 'true')
    request.removeHeader('cache-control')

    # Make the request
    sender = http.HTTPRequestSender(request)
    sender.noisy = False
    reactor.connectTCP(self.backend_host, self.backend_port, sender)

    # Defer the result
    fetching = sender.deferred.addCallback(self.extract_page, request)
    fetching = fetching.addErrback(self.page_failed, request)
    self.pending_requests[pending_key] = fetching
    return fetching
def valid_page(self, request, id, value):
    """Determine whether the page can be served from the cache.

    value is the dict stored by extract_page; this method reads its
    'rendered_on', 'cache_control' and 'response' entries. Returns False
    for uncacheable request methods and for pages older than three times
    their cache_control. Pages older than cache_control but within that
    limit are still served (True) while a background refetch is started.
    """
    now = time.time()
    method = request.method.upper()

    # Do not serve cached versions of pages if the method is not cacheable
    if method in self.uncacheable_methods:
        log.msg('PASS-THROUGH [%s]' % method)
        return False

    rendered_on = value['rendered_on']
    lifetime = value['cache_control']

    # Force refetch of very stale (3x cache_control value) pages
    if now > rendered_on + lifetime * 3:
        log.msg('STALE-HARD [%s]' % id)
        return False

    # Valid page
    if not now > rendered_on + lifetime:
        return True

    # Serve semi-stale pages but refresh in the background
    log.msg('STALE-SOFT [%s]' % id)
    response = value['response']
    cookie_names = sorted((response.getHeader(self.config.get('cookies_header')) or '').split(','))
    key = self.hash_page(request, cookies = cookie_names)
    cache_control = response.getCacheControlHeader(self.config.get('cache_header')) or 0
    status = response.status
    if status in self.uncacheable_status:
        log.msg('NO-CACHE (Status is %s) [%s]' % (status, key))
        cacheable = False
    elif status in self.short_status:
        log.msg('SHORT-CACHE (Status is %s) [%s]' % (status, key))
        cacheable = True
        cache_control = 30
    elif cache_control and cache_control > 0:
        log.msg('CACHE [%s] (for %ss)' % (key, cache_control))
        cacheable = True
    else:
        log.msg('NO-CACHE (No cache data) [%s]' % key)
        cacheable = False

    # Extend the valid cache length by 30s so we can fetch it
    value['rendered_on'] += 30
    if cacheable:
        self.cache.set({key : value}, 60) # Give 60s to refresh the page

    # Now fetch a fresh copy in the background
    self.fetch_page(request, id, ignoreResult=True)
    return True
def page_failed(self, response, request):
    """Errback for fetch_page: release the pending entry and log the failure.

    response is the failure passed down the errback chain (fetch_page
    registers this method with addErrback). Returns '' as the page value.
    """
    # Release pending lock
    key = self.hash_page(request)
    self.pending_requests.pop(key, None)
    # Strip only trailing '?', as everywhere else in this class; the former
    # rstrip("?h") also removed trailing 'h' characters from the logged uri.
    log.msg('ERROR: Could not retrieve [%s]' % request.uri.rstrip("?"))
    response.printBriefTraceback()
    # TODO: Return something meaningful!
    return ''
def extract_page(self, response, request):
    """Callback for fetch_page: wrap the backend response and maybe cache it.

    Returns a dict with 'dependencies', 'response', 'rendered_on' and
    'cache_control'. When the response is cacheable, its cookies attribute
    is cleared and the dict is stored for cache_control plus one day.
    """
    # Release pending lock
    pending_key = self.hash_page(request)
    if pending_key in self.pending_requests:
        del self.pending_requests[pending_key]

    # Extract uniqueness info
    cookie_names = sorted((response.getHeader(self.config.get('cookies_header')) or '').split(','))
    key = self.hash_page(request, cookies = cookie_names)

    # Store uri variant
    uri = request.uri.rstrip("?")
    variants = self.uri_lookup.setdefault(uri, [])
    if key not in variants:
        log.msg('Added new variant for %s: %s' % (uri, key))
        variants.append(key)

    if request.method.upper() in self.uncacheable_methods:
        # Override for non GET's
        log.msg('NO-CACHE (Method is %s) [%s]' % (request.method, key))
        cacheable = False
        cache_control = 0
    else:
        # Cache logic
        cache_control = response.getCacheControlHeader(self.config.get('cache_header')) or 0
        status = response.status
        if status in self.uncacheable_status:
            log.msg('NO-CACHE (Status is %s) [%s]' % (status, key))
            cacheable = False
        elif cache_control and cache_control > 0:
            log.msg('CACHE [%s] (for %ss)' % (key, cache_control))
            cacheable = True
        elif status in self.short_status:
            log.msg('SHORT-CACHE (Status is %s) [%s]' % (status, key))
            cacheable = True
            cache_control = 30
        else:
            log.msg('NO-CACHE (No cache data) [%s]' % key)
            cacheable = False

    # Actual return value
    value = {
        'dependencies' : [],
        'response' : response,
        'rendered_on' : time.time(),
        'cache_control' : cache_control
    }
    if cacheable:
        response.cookies = []
        self.cache.set({key : value}, cache_control + 86400) # Cache for an extra day
    return value
# Memcache
def hash_memcache(self, request, id):
    "Cache key for a memcache element: the id prefixed with 'memcache_'"
    prefix = 'memcache_'
    return prefix + id
def fetch_memcache(self, request, id):
    "Get id from the memcache backend and pass the result to extract_memcache"
    #log.msg('Looking up memcache %s' % id)
    lookup = self.proto.get(id)
    return lookup.addCallback(self.extract_memcache, request, id)
def extract_memcache(self, result, request, id):
    "Store a memcache result in the local cache for 30 seconds and return it"
    # Un-comment for the twisted memcached library
    #value = result and result[1]
    cache_key = self.hash_memcache(request, id)
    self.cache.set({cache_key: result}, 30) # 30 seconds
    return result
def valid_memcache(self, request, id, value):
    "A cached memcache value is always considered valid"
    return True
def incr_memcache(self, key):
    "Increment key on the memcache backend and return the backend's result"
    #log.msg('Incrementing memcache %s' % key)
    backend = self.proto
    return backend.increment(key)
def decr_memcache(self, key):
    "Decrement key on the memcache backend and return the backend's result"
    #log.msg('Decrementing memcache %s' % key)
    backend = self.proto
    return backend.decrement(key)
def delete_memcache(self, key):
    """Delete key from the memcache backend and drop its locally cached copy.

    Returns whatever the backend's delete returns.
    """
    # extract_memcache stores the local copy under hash_memcache(request, id),
    # i.e. with the 'memcache_' prefix, so that is the entry to remove; the
    # raw key never names a local cache entry. The key is passed as a list,
    # the same way delete() calls the cache.
    self.cache.delete([self.hash_memcache(None, key)])
    return self.proto.delete(key)
def set_memcache(self, key, val):
    "Set key to val on the memcache backend and return the backend's result"
    #log.msg('Setting memcache %s' % key)
    backend = self.proto
    return backend.set(key, val)
# geo location
def hash_geo(self, request, id):
    "Cache key for geo elements; always 'ip', the same key hash_ip returns"
    return 'ip'
# ip address
def hash_ip(self, request, id):
    "Cache key for ip elements; always the constant 'ip'"
    return 'ip'
# User session
def hash_session(self, request, id):
    """Cache key for the request's session, or '' when it has no session cookie.

    The id argument is ignored; the session id is always read from the
    request.
    """
    session_id = self._read_session(request)
    if not session_id:
        return ''
    return 'session_' + session_id
def fetch_session(self, request, id):
    """Look up the request's session in the database.

    The id argument is ignored; the session id is read from the request.
    Returns the deferred of the database interaction, chained through
    extract_session.
    """
    session_id = self._read_session(request)
    lookup = self.db.runInteraction(self._session, session_id)
    return lookup.addCallback(self.extract_session, request, session_id)
def extract_session(self, result, request, id):
    """Turn the session query result into a dict and cache it for 24 hours.

    result is the sequence of rows returned by _session; the first row's
    keys() and values() become the session dict. An empty result gives {}.
    """
    output = {}
    if len(result):
        row = result[0]
        output = dict(zip(row.keys(), row.values()))
    cache_key = self.hash_session(request, id)
    self.cache.set({cache_key : output}, 86400) # 24 hours
    return output
def valid_session(self, request, id, value):
    "A cached session value is always considered valid"
    return True
def _read_session(self, request):
return urllib.unquote(request.getCookie(self.config['session_cookie']) or '')
def _session(self, txn, id):
    """Database interaction: return all users rows whose persistent_cookie is id.

    txn is the transaction/cursor object handed in by runInteraction
    (see fetch_session). Returns the result of txn.fetchall().
    """
    log.msg('Looking up session %s' % id)
    # id comes straight from a request cookie (see _read_session), so it
    # must be bound as a parameter rather than interpolated into the SQL
    # text, which allowed SQL injection.
    users_query = "select users.* from users where persistent_cookie = %s"
    #log.msg('Running query: %s' % users_query)
    txn.execute(users_query, (id,))
    return txn.fetchall()