-
Notifications
You must be signed in to change notification settings - Fork 0
/
qt_framework.py
executable file
·247 lines (195 loc) · 7.93 KB
/
qt_framework.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#! /usr/bin/env python
# -*- coding: utf8 -*-
#
# Description: Framework for crawling and scraping webpages with JQuery
# Author: Richard Penman (richard@sitescraper.net)
#
import sys
import os
import urllib2
from PyQt4.QtGui import QApplication, QDesktopServices
from PyQt4.QtCore import QString, QUrl, QTimer, QEventLoop, QIODevice
from PyQt4.QtWebKit import QWebView, QWebPage
from PyQt4.QtNetwork import QNetworkAccessManager, QNetworkProxy, QNetworkRequest, QNetworkReply, QNetworkDiskCache
from webscraping import common, pdict, settings
# local Tor HTTP proxy endpoint (privoxy/polipo style, default port 8118)
TOR_PROXY = QNetworkProxy(QNetworkProxy.HttpProxy, '127.0.0.1', 8118)

# developer notes / roadmap for this module
"""
TODO
right click find xpath:
http://doc.qt.nokia.com/4.6/webkit-domtraversal.html
http://doc.qt.nokia.com/4.6/webkit-simpleselector.html
textbox for jquery input
http://www.rkblog.rk.edu.pl/w/p/webkit-pyqt-rendering-web-pages/
threaded multiple URLs
timeout
interface with cache to expand and not use pdict
make scrape function sequential after dentist data
http://www.pyside.org/docs/pyside/PySide/QtCore/QEventLoop.html?highlight=qeventloop
add progress bar for loading page
implement watir API
"""
class NetworkAccessManager(QNetworkAccessManager):
"""Subclass QNetworkAccessManager for finer control network operations
"""
def __init__(self, proxy, allowed_extensions, cache_size=100, cache_dir='.webkit_cache'):
"""
proxy is a QNetworkProxy
allowed_extensions is a list of extensions to allow
cache_size is the maximum size of the cache (MB)
"""
QNetworkAccessManager.__init__(self)
# initialize the manager cache
cache = QNetworkDiskCache()
#QDesktopServices.storageLocation(QDesktopServices.CacheLocation)
cache.setCacheDirectory(cache_dir)
cache.setMaximumCacheSize(cache_size * 1024 * 1024) # need to convert cache value to bytes
self.setCache(cache)
# allowed content extensions
self.banned_extensions = common.MEDIA_EXTENSIONS
for ext in allowed_extensions:
if ext in self.banned_extensions:
self.banned_extensions.remove(ext)
# and proxy
if proxy:
self.setProxy(proxy)
def createRequest(self, operation, request, data):
if operation == self.GetOperation:
if self.is_forbidden(request):
# deny GET request for banned media type by setting dummy URL
request.setUrl(QUrl(QString('forbidden://localhost/')))
else:
print request.url().toString()
else:
pass
#print 'POST'
#print request.url().toString()
#data.open(QIODevice.ReadOnly)
#print data.readAll()
#print data.peek(100000000000)
#data.seek(0)
#data.close()
request.setAttribute(QNetworkRequest.CacheLoadControlAttribute, QNetworkRequest.PreferCache)
reply = QNetworkAccessManager.createRequest(self, operation, request, data)
#reply.finished.connect(self.catch_finished)
reply.error.connect(self.catch_error)
return reply
def is_forbidden(self, request):
"""Returns whether this request is permitted by checking URL extension
XXX head request for mime?
"""
return common.get_extension(str(request.url().toString())) in self.banned_extensions
def catch_error(self, eid):
if eid not in (301, ):
print 'Error:', eid, self.sender().url().toString()
class WebPage(QWebPage):
"""Override QWebPage to set User-Agent and JavaScript messages
"""
def __init__(self, user_agent):
QWebPage.__init__(self)
self.user_agent = user_agent
def userAgentForUrl(self, url):
return self.user_agent
def javaScriptAlert(self, frame, message):
"""Override default JavaScript alert popup and print results
"""
print 'Alert:', message
def javaScriptConsoleMessage(self, message, line_number, source_id):
"""Print JavaScript console messages
"""
print 'Console:', message, line_number, source_id
class JQueryBrowser(QWebView):
"""Render webpages using webkit
"""
def __init__(self, base_url=None, gui=False, user_agent=None, proxy=None, allowed_extensions=['.html', '.css', '.js'], timeout=20, cache_file=None, debug=False):
"""
base_url is the domain that will be crawled
gui is whether to show webkit window or run headless
user_agent is used to set the user-agent when downloading content
proxy is the proxy to download through
allowed_extensions are the media types to allow
timeout is the maximum amount of seconds to wait for a request
"""
self.app = QApplication(sys.argv) # must instantiate first
QWebView.__init__(self)
webpage = WebPage(user_agent or settings.user_agents[0])
manager = NetworkAccessManager(proxy, allowed_extensions)
manager.finished.connect(self.finished)
webpage.setNetworkAccessManager(manager)
self.setPage(webpage)
self.setHtml('<html><head></head><body>No content loaded</body></html>', QUrl('http://localhost'))
self.timeout = timeout
self.cache = pdict.PersistentDict(cache_file or settings.cache_file) # cache to store webpages
self.base_url = base_url
self.debug = debug
self.jquery_lib = None
QTimer.singleShot(0, self.crawl) # start crawling when all events processed
if gui: self.show()
self.app.exec_() # start GUI thread
def current_url(self):
"""Return current URL
"""
return str(self.url().toString())
def current_html(self):
"""Return current rendered HTML
"""
return unicode(self.page().mainFrame().toHtml())
def error(self):
print 'timed out'
self.start()
def get(self, url=None, script=None, key=None):
"""Load given url in webkit and return html when loaded
"""
self.base_url = self.base_url or url # set base URL if not set
html = self.cache.get(key)
if html:
if self.debug: print 'load cache', key
self.setHtml(html, QUrl(self.base_url))
elif url:
self.load(QUrl(url))
elif script:
self.js(script)
loop = QEventLoop()
timer = QTimer()
timer.setSingleShot(True)
timer.timeout.connect(loop.quit)
self.loadFinished.connect(loop.quit)
timer.start(self.timeout * 1000)
loop.exec_() # delay here until download finished or timeout
if timer.isActive():
# downloaded successfully
timer.stop()
html = self.current_html()
if key:
self.cache[key] = html
self.inject_jquery()
else:
# didn't download in time
print 'Download timeout'
html = ''
return html
def jsget(self, script, key=None):
return self.get(script=script, key=key)
def js(self, script):
"""Shortcut to execute javascript on current document
"""
self.page().mainFrame().evaluateJavaScript(script)
def inject_jquery(self):
"""Inject jquery library into this webpage for easier manipulation
"""
# XXX embed header in document, use cache
if self.jquery_lib is None:
url = 'http://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js'
self.jquery_lib = urllib2.urlopen(url).read()
self.js(self.jquery_lib)
def crawl(self):
"""Override this method in subclass
"""
self.get('http://code.google.com/p/webscraping/')
self.get('http://code.google.com/p/sitescraper/')
QTimer.singleShot(5000, self.app.quit)
def finished(self, reply):
"""Override this in subclasses to process downloaded urls
"""
pass
if __name__ == '__main__':
    # demo: run the default crawl() with the webkit window visible
    JQueryBrowser(gui=True)