/
main.py
executable file
·236 lines (212 loc) · 9.01 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
from google.appengine.dist import use_library
use_library('django', '1.2')
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util, template
from google.appengine.api import taskqueue
from google.appengine.api.urlfetch import fetch
from google.appengine.api import memcache
import logging
import zlib
import magic
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from StringIO import StringIO
import string
import re
import difflib
import model
import os
def queue_fetch(doc):
    """Enqueue a task that will fetch ``doc`` later.

    A GET task for ``/tasks/fetchdoc`` is added to the task queue with the
    document's ``url_hash`` as its only parameter.
    """
    doc_hash = doc.url_hash
    logging.info('Queuing document %s' % doc_hash)
    task_params = {'url_hash': doc_hash}
    taskqueue.add(url='/tasks/fetchdoc', params=task_params, method='GET')
class MainHandler(webapp.RequestHandler):
    """Serve the landing page and accept registrations of new document URLs."""

    def get(self):
        """Write the rendered ``templates/main.html`` page to the response."""
        template_path = os.path.join(os.path.dirname(__file__), 'templates/main.html')
        page = template.render(template_path, {})
        self.response.out.write(page)

    def post(self):
        """Handle a new registration.

        Stores a ``Document`` for the submitted ``url``, responds with the
        ``templates/newurl.html`` page linking to its count page, and queues
        a fetch of the document.
        """
        url = self.request.get('url')
        logging.info('Adding %s url' % url)
        url_hash = model.gethash(url)

        # Persist a new entry for this URL.
        doc = model.Document(url=url, url_hash=url_hash)
        doc.put()

        template_path = os.path.join(os.path.dirname(__file__), 'templates/newurl.html')
        context = {'url': 'count?doc=%s' % url_hash}
        self.response.out.write(template.render(template_path, context))

        # Add a task to fetch this document immediately.
        queue_fetch(doc)
class HourlyFetchHandler(webapp.RequestHandler):
    """Queue a fetch task for every stored document."""

    def get(self):
        """Iterate over all ``Document`` entities and queue each one."""
        logging.info('Hourly fetch')
        all_docs = model.Document.all()
        for registered in all_docs:
            queue_fetch(registered)
def get_pdf_text(content):
    """Return the text extracted from the raw content of a PDF file.

    ``content`` is the PDF data as a string; the result is whatever pdfminer's
    ``TextConverter`` wrote, using the utf-8 codec.
    """
    pdf_stream = StringIO(content)
    text_sink = StringIO()
    resources = PDFResourceManager(caching=False)
    converter = TextConverter(resources, text_sink, codec='utf-8', laparams=LAParams())
    # NOTE(review): an empty page set with maxpages=0 presumably means
    # "process every page" - confirm against the bundled pdfminer version.
    process_pdf(resources, converter, pdf_stream, set(), maxpages=0, password='',
                check_extractable=True, caching=False)
    extracted = text_sink.getvalue()
    # Release the in-memory buffers as early as possible.
    text_sink.close()
    pdf_stream.close()
    return extracted
def get_text(content):
    """Extract cleaned-up text from a file (currently handles PDFs and plain text).

    The content type is detected with ``magic.whatis``. Content reported as
    ``'PDF document'`` is run through ``get_pdf_text``; anything else is
    assumed to already be text.

    Returns the text with surrounding whitespace stripped and every run of
    non-word characters replaced by a single space, ready to be split into
    words.
    """
    detected_type = magic.whatis(content)
    logging.info('Magic type %s' % detected_type)
    if detected_type == 'PDF document':
        text = get_pdf_text(content)
    else:
        # Anything that is not a PDF is assumed to be text.
        text = content
    # Clean up the text a bit: split on runs of non-word characters and
    # rejoin the pieces with single spaces.
    pieces = re.split(r'\W+', text.strip())
    return ' '.join(pieces)
def get_word_stats(newtext, oldtext=None):
    """Calculate the word count of the new text and how much it changed.

    Both texts are expected to have been cleaned up already (see get_text),
    so words are simply separated by whitespace.

    Args:
        newtext: the current text of the document.
        oldtext: the previous text of the document, or None if there is no
            previous version to compare against.

    Returns:
        A tuple ``(abs_wc, change_wc)``: the number of words in ``newtext``
        and the number of changed words relative to ``oldtext`` (0 when
        ``oldtext`` is None).
    """
    # First the easy bit - absolute word count.
    new_words = newtext.split()
    abs_wc = len(new_words)
    if oldtext is None:
        return abs_wc, 0

    # Need to calculate how much the text has changed, word by word.
    old_words = oldtext.split()
    change_wc = 0
    last_change = None
    for entry in difflib.Differ().compare(new_words, old_words):
        # Only '+ ' and '- ' entries mark a difference; unchanged words and
        # '? ' hint lines are skipped.
        if entry[:2] not in ('+ ', '- '):
            continue
        word = entry[2:]
        if last_change is None or not difflib.get_close_matches(word, [last_change]):
            # This is a new change.
            last_change = word
            change_wc += 1
        else:
            # The word closely matches the preceding change, so it is the
            # other half of the same edit: don't double count it.
            last_change = None
    return abs_wc, change_wc
def update_doc_stats(doc):
    """Fetch ``doc`` and store a new WordRecord with its word statistics.

    If the fetched content hashes to the same value as the stored version, the
    most recent record's absolute word count is copied with a change count of
    zero. Otherwise the text is extracted, compared with the stored version,
    and the document's stored version and hash are updated.
    """
    content = fetch(doc.url).content
    content_hash = model.gethash(content)
    logging.info('Acquired document hash:%s' % content_hash)

    unchanged = (content_hash == doc.last_version_hash)
    if unchanged:
        # No change to the document: carry the latest word count forward.
        logging.info('No change to document')
        newest_first = model.WordRecord.all().filter('doc =', doc).order('-timestamp')
        previous = newest_first.get()
        carried = model.WordRecord(doc=doc, abs_wordcount=previous.abs_wordcount,
                                   change_wordcount=0)
        carried.put()
    else:
        # The document has changed, so we need to run a comparison.
        # The old version is stored compressed; there is none on first fetch.
        stored = doc.last_version
        previous_text = zlib.decompress(stored) if stored else None

        # Extract text out of the new version.
        logging.info('Document change. Calculating new stats')
        current_text = get_text(content)
        abs_wc, change = get_word_stats(current_text, previous_text)

        # Add record.
        fresh = model.WordRecord(doc=doc, abs_wordcount=abs_wc, change_wordcount=change)
        fresh.put()

        # Update our version of the document.
        doc.last_version_hash = content_hash
        doc.last_version = zlib.compress(current_text)
        doc.put()

    # Junk the cache for this document.
    RenderCount(doc.url_hash).delcache()
    logging.info('Updated stats')
class FetchDocHandler(webapp.RequestHandler):
    """Update the statistics of the document named by ``url_hash``."""

    def get(self):
        """Look up the document by its ``url_hash`` parameter and refresh it."""
        requested_hash = self.request.get('url_hash')
        logging.info('Fetching %s' % requested_hash)
        lookup = model.Document.all().filter('url_hash =', requested_hash)
        doc = lookup.get()
        # The hash must belong to a stored document.
        assert doc
        logging.info(doc)
        update_doc_stats(doc)
class CachedPageRender(object):
    """Base class for rendering pages and caching the output in memcache.

    Subclasses need to set ``self.key`` during ``__init__`` (for example by
    calling ``create_versioned_key``) and implement ``uncached_render``.
    """

    # Rendered pages are kept in memcache for one day unless junked earlier.
    CACHE_SECONDS = 24 * 60 * 60

    def render(self, handler):
        """Return the rendered page as a string, using memcache when possible.

        On a cache miss the page is produced by ``uncached_render(handler)``
        and stored under ``self.key`` before being returned.
        """
        logging.info('Rendering %s', self.key)
        rendering = memcache.get(self.key)
        if rendering is None:
            logging.info('Cache miss for %s', self.key)
            rendering = self.uncached_render(handler)
            memcache.set(self.key, rendering, time=self.CACHE_SECONDS)
        else:
            logging.info('Cache hit for %s', self.key)
        return rendering

    def uncached_render(self, handler):
        """Produce the page without consulting the cache; subclasses override this."""
        raise NotImplementedError('Subclasses must implement uncached_render')

    def delcache(self):
        """Remove this page's cached rendering from memcache."""
        memcache.delete(self.key)
        logging.info('Junking cache %s', self.key)

    def create_versioned_key(self, base_key):
        """Set ``self.key`` from ``base_key``, the subclass and the deployment version.

        Including ``CURRENT_VERSION_ID`` in the key means a new deployment
        never reads renderings cached by an older one.
        """
        version = os.environ['CURRENT_VERSION_ID']
        self.key = 'CachedPageRender_v' + version + str(self.__class__) + '_' + base_key
        logging.info('Versioned key %s', self.key)
class RenderCount(CachedPageRender):
    """Render (and cache) the page showing the statistics of one document."""

    def __init__(self, url_hash):
        """Remember ``url_hash`` and derive the cache key from it."""
        self.url_hash = url_hash
        self.create_versioned_key(url_hash)

    def uncached_render(self, handler):
        """Render ``templates/display.html`` with the document's word records."""
        logging.info('Rendering count doc %s' % self.url_hash)
        doc_query = model.Document.all().filter('url_hash =', self.url_hash)
        doc = doc_query.get()
        assert doc
        # Find the corresponding data entries for the document.
        context = {'record_list': model.WordRecord.all().filter('doc =', doc)}
        template_path = os.path.join(os.path.dirname(__file__), 'templates/display.html')
        return template.render(template_path, context)
class DisplayHandler(webapp.RequestHandler):
    """Serve the statistics page of the document named by the ``doc`` parameter."""

    def get(self):
        """Write the (possibly cached) statistics page to the response."""
        url_hash = self.request.get('doc')
        # Cache-Control directives are comma-separated; a semicolon here would
        # make the header malformed and caches could ignore max-age.
        self.response.headers["Cache-Control"] = 'public, max-age=1000'
        self.response.out.write(RenderCount(url_hash).render(self))
def main():
    """Build the WSGI application from the URL routes and run it."""
    routes = [
        ('/', MainHandler),
        ('/tasks/hourlyfetch', HourlyFetchHandler),
        ('/tasks/fetchdoc', FetchDocHandler),
        ('/count.*', DisplayHandler),
    ]
    application = webapp.WSGIApplication(routes, debug=True)
    util.run_wsgi_app(application)


if __name__ == '__main__':
    main()