class CSConverter(QObject):
    """Process-wide CoffeeScript-to-JavaScript compiler hosted in a QWebPage.

    The class is a singleton: every ``CSConverter()`` call yields the same
    object, and the CoffeeScript compiler is loaded into the page only once.
    """

    _instance = None
    # Set to True on the class once the single instance has been initialised.
    _initialized = False

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super(CSConverter, cls).__new__(cls, *args, **kwargs)
        return cls._instance

    def __init__(self):
        # Python calls __init__ after every CSConverter() expression, even
        # though __new__ hands back the cached instance.  Without this guard
        # each call would create another QWebPage and re-evaluate the whole
        # compiler script.
        if CSConverter._initialized:
            return

        super(CSConverter, self).__init__(QApplication.instance())

        self.m_webPage = QWebPage(self)

        with QPyFile(':/resources/coffee-script.js') as f:
            script = f.readAll()

        self.m_webPage.mainFrame().evaluateJavaScript(script)
        # Expose this object to the page as ``converter`` so the JavaScript
        # side can read the ``source`` property set in convert().
        self.m_webPage.mainFrame().addToJavaScriptWindowObject(
            'converter', self)

        CSConverter._initialized = True

    def convert(self, script):
        """Compile CoffeeScript *script* inside the web page.

        Returns the value of the evaluated JavaScript, which is a two-element
        array: ``[true, <compiled code>]`` on success or
        ``[false, <error message>]`` when the compiler throws.
        """
        self.setProperty('source', script)

        result = self.m_webPage.mainFrame().evaluateJavaScript('''try { [true, this.CoffeeScript.compile(converter.source)]; } catch (error) { [false, error.message]; } ''')
        return result
class CSConverter(QObject):
    """Compile CoffeeScript to JavaScript with a compiler hosted in a QWebPage."""

    def __init__(self, parent=None):
        """Load the bundled CoffeeScript compiler into a private web page.

        Exits the process when the compiler resource cannot be opened.
        """
        QObject.__init__(self, parent)
        self.m_webPage = QWebPage(self)

        resource = QFile(':/resources/coffee-script.js')
        opened = resource.open(QFile.ReadOnly)
        if not opened:
            sys.exit('CoffeeScript compiler is not available!')
        compiler_source = str(resource.readAll())
        resource.close()

        frame = self.m_webPage.mainFrame()
        frame.evaluateJavaScript(compiler_source)
        # Make this object reachable from JavaScript as ``converter``.
        frame.addToJavaScriptWindowObject('converter', self)

    def convert(self, script):
        """Return the JavaScript compiled from *script*.

        On a compiler error the message is reported through ``qWarning`` and
        an empty string is returned.
        """
        self.setProperty('source', script)

        outcome = self.m_webPage.mainFrame().evaluateJavaScript('''try { [true, this.CoffeeScript.compile(converter.source)]; } catch (error) { [false, error.message]; }''')
        succeeded, payload = outcome[0], outcome[1]
        if succeeded is False:
            qWarning(payload)
            return ''
        return payload
class CSConverter(QObject):
    """Singleton CoffeeScript-to-JavaScript compiler hosted in a QWebPage.

    ``CSConverter()`` always returns the same object; the compiler script is
    read and evaluated only the first time.
    """

    _instance = None
    # Set to True on the class once the single instance has been initialised.
    _initialized = False

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super(CSConverter, cls).__new__(cls, *args, **kwargs)
        return cls._instance

    def __init__(self):
        # __init__ is invoked on every CSConverter() call because __new__
        # returns an instance of the class; skip the expensive set-up when
        # the shared instance is already ready.
        if CSConverter._initialized:
            return

        QObject.__init__(self, QApplication.instance())
        self.m_webPage = QWebPage(self)

        converter = QFile(':/resources/coffee-script.js')
        if not converter.open(QFile.ReadOnly):
            sys.exit('CoffeeScript compiler is not available!')
        script = str(converter.readAll())
        converter.close()

        self.m_webPage.mainFrame().evaluateJavaScript(script)
        # Expose this object to the page as ``converter`` so the JavaScript
        # side can read the ``source`` property set in convert().
        self.m_webPage.mainFrame().addToJavaScriptWindowObject('converter', self)

        CSConverter._initialized = True

    def convert(self, script):
        """Compile CoffeeScript *script* inside the web page.

        Returns the value of the evaluated JavaScript, which is a two-element
        array: ``[true, <compiled code>]`` on success or
        ``[false, <error message>]`` when the compiler throws.
        """
        self.setProperty('source', script)

        result = self.m_webPage.mainFrame().evaluateJavaScript('''try { [true, this.CoffeeScript.compile(converter.source)]; } catch (error) { [false, error.message]; } ''')
        return result
class CSConverter(QObject):
    """CoffeeScript compiler wrapper built on an off-screen QWebPage."""

    def __init__(self, parent=None):
        """Read the compiler resource and evaluate it in the page's main frame.

        Terminates the process if the resource cannot be opened.
        """
        QObject.__init__(self, parent)
        self.m_webPage = QWebPage(self)

        compiler_file = QFile(":/resources/coffee-script.js")
        if not compiler_file.open(QFile.ReadOnly):
            sys.exit("CoffeeScript compiler is not available!")

        compiler_js = str(compiler_file.readAll())
        compiler_file.close()

        main_frame = self.m_webPage.mainFrame()
        main_frame.evaluateJavaScript(compiler_js)
        # The page addresses this object as ``converter``.
        main_frame.addToJavaScriptWindowObject("converter", self)

    def convert(self, script):
        """Compile *script* and return the raw evaluation result.

        The evaluated JavaScript produces ``[true, <compiled code>]`` on
        success and ``[false, <error message>]`` when compilation throws.
        """
        self.setProperty("source", script)
        return self.m_webPage.mainFrame().evaluateJavaScript(
            """try { [true, this.CoffeeScript.compile(converter.source)]; } catch (error) { [false, error.message]; } """
        )
class CSConverter(QObject):
    """Singleton CoffeeScript-to-JavaScript compiler hosted in a QWebPage.

    Repeated ``CSConverter()`` calls return the same object and do not reload
    the compiler script.
    """

    _instance = None
    # Set to True on the class once the single instance has been initialised.
    _initialized = False

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super(CSConverter, cls).__new__(cls, *args, **kwargs)
        return cls._instance

    def __init__(self):
        # Because __new__ returns an instance of this class, Python runs
        # __init__ on every construction; only the first run may set up the
        # web page, otherwise the compiler is re-evaluated each time.
        if CSConverter._initialized:
            return

        super(CSConverter, self).__init__(QApplication.instance())

        self.m_webPage = QWebPage(self)

        with QPyFile(':/resources/coffee-script.js') as f:
            self.m_webPage.mainFrame().evaluateJavaScript(f.readAll())

        # Expose this object to the page as ``converter`` so the JavaScript
        # side can read the ``source`` property set in convert().
        self.m_webPage.mainFrame().addToJavaScriptWindowObject('converter', self)

        CSConverter._initialized = True

    def convert(self, script):
        """Compile CoffeeScript *script* inside the web page.

        Returns the value of the evaluated JavaScript, which is a two-element
        array: ``[true, <compiled code>]`` on success or
        ``[false, <error message>]`` when the compiler throws.
        """
        self.setProperty('source', script)

        result = self.m_webPage.mainFrame().evaluateJavaScript('''try { [true, this.CoffeeScript.compile(converter.source)]; } catch (error) { [false, error.message]; } ''')
        return result
class CSConverter(QObject):
    """CoffeeScript compiler wrapper; use ``CSConverter.instance()`` to share one."""

    def __init__(self, parent):
        """Evaluate the bundled compiler in a web page owned by this object.

        Terminates the process if the compiler resource cannot be opened.
        """
        QObject.__init__(self, parent)
        self.m_webPage = QWebPage(self)

        source_file = QFile(":/resources/coffee-script.js")
        if not source_file.open(QFile.ReadOnly):
            sys.exit("CoffeeScript compiler is not available!")
        compiler_code = str(source_file.readAll())
        source_file.close()

        frame = self.m_webPage.mainFrame()
        frame.evaluateJavaScript(compiler_code)
        # JavaScript reaches this object through the name ``converter``.
        frame.addToJavaScriptWindowObject("converter", self)

    def convert(self, script):
        """Compile *script* and return the raw evaluation result.

        The evaluated JavaScript yields ``[true, <compiled code>]`` on success
        and ``[false, <error message>]`` when compilation throws.
        """
        self.setProperty("source", script)
        evaluated = self.m_webPage.mainFrame().evaluateJavaScript(
            """try { [true, this.CoffeeScript.compile(converter.source)]; } catch (error) { [false, error.message]; } """
        )
        return evaluated

    @staticmethod
    def instance():
        """Return the shared converter, creating it on first use."""
        global CSConverterInstance

        # We need only one instance of the CSConverter for our whole life
        if CSConverterInstance is not None:
            return CSConverterInstance

        CSConverterInstance = CSConverter(QCoreApplication.instance())
        return CSConverterInstance
def importBookmarks(self):
    """
    Public method to import bookmarks.

    Asks the user for an XBEL or Netscape HTML bookmarks file, reads it
    and adds the result as a folder named "Imported <date>" to the
    bookmarks menu. A warning box is shown if the file cannot be opened or
    the XBEL reader reports an error.
    """
    supportedFormats = QStringList() \
        << self.trUtf8("XBEL bookmarks").append(" (*.xbel *.xml)") \
        << self.trUtf8("HTML Netscape bookmarks").append(" (*.html *.htm)")

    fileName = KQFileDialog.getOpenFileName(
        None,
        self.trUtf8("Import Bookmarks"),
        QString(),
        supportedFormats.join(";;"),
        None)
    if fileName.isEmpty():
        return

    reader = XbelReader()
    importRootNode = None
    # The file dialog offers both *.html and *.htm, so both must take the
    # HTML path; anything else is handed to the XBEL reader directly.
    if fileName.endsWith(".html") or fileName.endsWith(".htm"):
        inFile = QFile(fileName)
        # open() reports failure through its return value. (openMode is a
        # method, so comparing the attribute itself to NotOpen never matched.)
        if not inFile.open(QIODevice.ReadOnly):
            KQMessageBox.warning(None,
                self.trUtf8("Import Bookmarks"),
                self.trUtf8("""Error opening bookmarks file <b>%1</b>.""")
                    .arg(fileName))
            return

        html = QString(inFile.readAll())
        inFile.close()

        # Load the HTML into an off-screen page and let the extraction
        # script produce the data that is then parsed by the XBEL reader.
        webpage = QWebPage()
        webpage.mainFrame().setHtml(html)
        result = webpage.mainFrame().evaluateJavaScript(extract_js).toByteArray()
        buffer_ = QBuffer(result)
        buffer_.open(QIODevice.ReadOnly)
        importRootNode = reader.read(buffer_)
    else:
        importRootNode = reader.read(fileName)

    if reader.error() != QXmlStreamReader.NoError:
        KQMessageBox.warning(None,
            self.trUtf8("Import Bookmarks"),
            self.trUtf8("""Error when importing bookmarks on line %1, column %2:\n"""
                        """%3""")
                .arg(reader.lineNumber())
                .arg(reader.columnNumber())
                .arg(reader.errorString()))
        return

    importRootNode.setType(BookmarkNode.Folder)
    importRootNode.title = self.trUtf8("Imported %1")\
        .arg(QDate.currentDate().toString(Qt.SystemLocaleShortDate))
    self.addBookmark(self.menu(), importRootNode)
class html2png():
    """Render a web page into a numbered series of PNG files.

    Constructing the object starts loading ``source``; when loading finishes
    the page is painted into one image, cut into slices and written next to
    ``target``. The process exits from the load-finished handler.
    """

    # get URL and pixel width as parameters. The width is only approximate
    def __init__(self, source, target, width):
        self.width = int(width)
        self.target = target
        #self.app = QApplication(sys.argv)
        # Restore the default SIGINT handler.
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        self.qwPage = QWebPage()
        size = QSize()
        size.setWidth(int(width))
        self.qwPage.setViewportSize(size)
        self.qwPage.connect(self.qwPage, SIGNAL("loadFinished(bool)"), self.onLoadFinished)
        self.qwPage.mainFrame().load(QUrl(source))

    # do not call this function. it is called via a signal
    def onLoadFinished(self, result):
        """Paint the loaded page, split it into slices and save each slice.

        Exits with status 1 when the load failed and with status 0 once all
        slices have been written.
        """
        if not result:
            sys.exit(1)

        # Set the size of the (virtual) browser window
        self.qwPage.setViewportSize(self.qwPage.mainFrame().contentsSize())

        # Paint this frame into an image
        image = QImage(self.qwPage.viewportSize(), QImage.Format_RGB32)
        painter = QPainter(image)
        self.qwPage.mainFrame().render(painter)
        painter.end()

        # 1.4142 ~ sqrt(2); presumably the ISO A-series page ratio - TODO confirm
        targetHeight = self.width * 1.4142
        # math.ceil() returns a float on Python 2, but range() needs an int.
        numSplits = int(math.ceil(image.height() / targetHeight))
        for x in range(numSplits):
            start = x * targetHeight
            # QImage.copy() takes integer coordinates, so the float slice
            # geometry is truncated explicitly.
            copy = image.copy(0, int(start), image.width(), int(targetHeight - 1))
            # target[:-4] drops the last four characters of the target name
            # (presumably a ".png" suffix) before the slice index is added.
            self.saveOptPNG(copy, self.target[:-4] + "." + str(x) + ".png")
        sys.exit(0)

    #optimize QImage PNG with PIL and save
    def saveOptPNG(self, img, path):
        """Re-encode *img* through PIL and write it to *path* as PNG."""
        buffer = QBuffer()
        buffer.open(QIODevice.ReadWrite)
        img.save(buffer, "PNG")

        # Hand the PNG bytes produced by Qt over to PIL via an in-memory file.
        strio = BytesIO()
        strio.write(buffer.data())
        buffer.close()
        strio.seek(0)
        pil_im = Image.open(strio)
        pil_im.save(path, "PNG", optimize=False, compress_level=9)
def render_html(path_to_html, width=590, height=750, as_xhtml=True):
    """Load an HTML file into an off-screen QWebPage and return its renderer.

    The page gets a white background, a ``width`` x ``height`` viewport and no
    scroll bars. A local event loop runs until the ``HTMLRenderer`` connected
    to ``loadFinished`` has done its work.

    When ``as_xhtml`` is true the file is fed to the page as
    ``application/xhtml+xml``; if the renderer then reports a ``ParserError``
    the function retries once with a plain load.

    Returns the ``HTMLRenderer``, or ``None`` when Qt may not be used.
    """
    from PyQt4.QtWebKit import QWebPage
    from PyQt4.Qt import QEventLoop, QPalette, Qt, QUrl, QSize
    from calibre.gui2 import is_ok_to_use_qt
    if not is_ok_to_use_qt():
        return None
    path_to_html = os.path.abspath(path_to_html)
    with CurrentDir(os.path.dirname(path_to_html)):
        page = QWebPage()
        pal = page.palette()
        pal.setBrush(QPalette.Background, Qt.white)
        page.setPalette(pal)
        page.setViewportSize(QSize(width, height))
        page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
        page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
        loop = QEventLoop()
        renderer = HTMLRenderer(page, loop)
        page.loadFinished.connect(renderer, type=Qt.QueuedConnection)
        if as_xhtml:
            # Read through a context manager so the file handle is closed
            # promptly instead of being left to the garbage collector.
            with open(path_to_html, 'rb') as src:
                raw = src.read()
            page.mainFrame().setContent(raw, 'application/xhtml+xml',
                                        QUrl.fromLocalFile(path_to_html))
        else:
            page.mainFrame().load(QUrl.fromLocalFile(path_to_html))
        loop.exec_()
    # Drop the renderer's references to the Qt objects before deleting them.
    renderer.loop = renderer.page = None
    page.loadFinished.disconnect()
    del page
    del loop
    if isinstance(renderer.exception, ParserError) and as_xhtml:
        return render_html(path_to_html, width=width, height=height,
                           as_xhtml=False)
    return renderer
def render_html(path_to_html, width=590, height=750, as_xhtml=True):
    """Render the HTML file at *path_to_html* in an off-screen QWebPage.

    A local event loop is run until the ``HTMLRenderer`` connected to the
    page's ``loadFinished`` signal has finished. With ``as_xhtml`` the file
    content is set as ``application/xhtml+xml``; should the renderer record a
    ``ParserError`` the call is repeated once with ``as_xhtml=False``.

    Returns the ``HTMLRenderer`` instance, or ``None`` if Qt is not usable.
    """
    from PyQt4.QtWebKit import QWebPage
    from PyQt4.Qt import QEventLoop, QPalette, Qt, QUrl, QSize
    from calibre.gui2 import is_ok_to_use_qt
    if not is_ok_to_use_qt():
        return None
    path_to_html = os.path.abspath(path_to_html)
    with CurrentDir(os.path.dirname(path_to_html)):
        page = QWebPage()
        pal = page.palette()
        pal.setBrush(QPalette.Background, Qt.white)
        page.setPalette(pal)
        page.setViewportSize(QSize(width, height))
        page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
        page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
        loop = QEventLoop()
        renderer = HTMLRenderer(page, loop)
        page.loadFinished.connect(renderer, type=Qt.QueuedConnection)
        if as_xhtml:
            # Close the file deterministically; the bare open().read() left
            # the handle open until garbage collection.
            with open(path_to_html, 'rb') as html_file:
                content = html_file.read()
            page.mainFrame().setContent(content, 'application/xhtml+xml',
                                        QUrl.fromLocalFile(path_to_html))
        else:
            page.mainFrame().load(QUrl.fromLocalFile(path_to_html))
        loop.exec_()
    # Release the renderer's references before the Qt objects are deleted.
    renderer.loop = renderer.page = None
    page.loadFinished.disconnect()
    del page
    del loop
    if isinstance(renderer.exception, ParserError) and as_xhtml:
        return render_html(path_to_html, width=width, height=height,
                           as_xhtml=False)
    return renderer
def start(self):
    '''
    Starts the conversion process.

    Loads the document into an off-screen page, pumps the event loop until
    the page is loaded and the math is typeset, then hands the resulting
    body HTML to an export thread. Stops early when cancelled.
    '''
    def on_progress(progress):
        self._loadProgress = progress

    def on_finished(isLoaded):
        self._loaded = True

    webpage = QWebPage()
    webpage.loadProgress.connect(on_progress)
    webpage.loadFinished.connect(on_finished)

    base_url = QUrl.fromLocalFile(misc.temp_path('import'))
    frame = webpage.mainFrame()
    frame.setHtml(self._document.getMainPage(mathOutput='svg'), base_url)

    # Keep the GUI responsive while the page loads.
    while not (self._loaded or self._canceled):
        qApp.processEvents()

    self.ui.label.setText('Typesetting math equations...')

    if self._canceled:
        return

    # Wait for the MathJax to typeset
    while not (self._mathTypeset or self._canceled):
        qApp.processEvents()
        progress_value = frame.evaluateJavaScript(
            misc.js_command('GetMathTypesetProgress', []))
        progress = int(progress_value.toInt()[0])
        self.ui.progressBar.setValue(progress)
        typeset_value = frame.evaluateJavaScript(
            misc.js_command('IsMathTypeset', []))
        self._mathTypeset = typeset_value.toBool()

    if self._canceled:
        return

    # If I haven't canceled yet, let's convert the document in a
    # separate thread
    body_html = unicode(frame.evaluateJavaScript(
        misc.js_command('GetBodyHTML', [])).toString())
    self._thread = ExportToHtmlThread(
        body_html, self._configuration, self._assigner, self._filePath)
    self._thread.onProgress.connect(self.ui.progressBar.setValue)
    self._thread.onProgressLabel.connect(self.ui.label.setText)
    self._thread.finished.connect(self._threadFinished)
    self.ui.cancelButton.clicked.connect(self._thread.quit)
    self._thread.start()
class CSConverter(QObject):
    """Compile CoffeeScript to JavaScript through a compiler run in a QWebPage."""

    def __init__(self, parent=None):
        """Evaluate the bundled compiler in a private page.

        Exits the process when the compiler resource cannot be opened.
        """
        QObject.__init__(self, parent)
        self.m_webPage = QWebPage(self)

        compiler_file = QFile(':/resources/coffee-script.js')
        if not compiler_file.open(QFile.ReadOnly):
            sys.exit('CoffeeScript compiler is not available!')
        compiler_js = str(compiler_file.readAll())
        compiler_file.close()

        page_frame = self.m_webPage.mainFrame()
        page_frame.evaluateJavaScript(compiler_js)
        # The page refers to this object as ``converter``.
        page_frame.addToJavaScriptWindowObject('converter', self)

    def convert(self, script):
        """Return the compiled JavaScript, or ``''`` when the result is falsy."""
        self.setProperty('source', script)
        compiled = self.m_webPage.mainFrame().evaluateJavaScript('this.CoffeeScript.compile(converter.source)')
        if compiled:
            return compiled
        return ''
class CSConverter(QObject):
    """Compile CoffeeScript to JavaScript through a compiler run in a QWebPage."""

    def __init__(self, parent=None):
        """Read the bundled compiler and evaluate it in a private page."""
        QObject.__init__(self, parent)
        self.m_webPage = QWebPage(self)

        converter = QFile(":/resources/coffee-script.js")
        converter.open(QFile.ReadOnly)

        script = str(converter.readAll())
        converter.close()

        self.m_webPage.mainFrame().evaluateJavaScript(script)
        # Expose this object to the page as ``converter`` so the JavaScript
        # side can read the ``source`` property set in convert().
        self.m_webPage.mainFrame().addToJavaScriptWindowObject("converter", self)

    def convert(self, script):
        """Return the JavaScript compiled from *script*, or ``""``.

        An empty string is returned when the evaluation yields nothing
        usable, e.g. ``None`` after the compiler threw inside the page.
        """
        self.setProperty("source", script)

        result = self.m_webPage.mainFrame().evaluateJavaScript("this.CoffeeScript.compile(converter.source)")
        # A truthiness test covers both the empty string and None; calling
        # len() on a None result raised TypeError instead of returning "".
        if result:
            return result
        return ""
def __init__(self, parent, jsonFile):
    """Set default settings as dynamic properties, then apply *jsonFile*.

    The file content is substituted into the bundled configurator script,
    which is evaluated in a web page that can reach this object as
    ``config``. A warning is emitted and nothing is applied when the content
    does not start with ``{`` and end with ``}``.
    """
    QObject.__init__(self, parent)

    with codecs.open(jsonFile, encoding='utf-8') as fd:
        config_text = fd.read()

    self.settings = {
        'auth': {'mapping': 'auth', 'default': None},
        'cookies': {'mapping': 'cookies', 'default': None},
        'diskCache': {'mapping': 'disk_cache', 'default': False},
        'ignoreSslErrors': {'mapping': 'ignore_ssl_errors', 'default': False},
        'loadImages': {'mapping': 'load_images', 'default': True},
        'loadPlugins': {'mapping': 'load_plugins', 'default': False},
        'localAccessRemote': {'mapping': 'local_access_remote', 'default': False},
        'outputEncoding': {'mapping': 'output_encoding', 'default': 'System'},
        'proxy': {'mapping': 'proxy', 'default': None},
        'scriptEncoding': {'mapping': 'script_encoding', 'default': 'utf-8'},
        'verbose': {'mapping': 'verbose', 'default': False}
    }

    # generate dynamic properties
    for name, spec in self.settings.items():
        self.setProperty(name, spec['default'])

    # now it's time to parse our JSON file
    looks_like_object = (config_text.lstrip().startswith('{')
                         and config_text.rstrip().endswith('}'))
    if not looks_like_object:
        qWarning('Config file MUST be in JSON format!')
        return

    script_file = QFile(':/configurator.js')
    if not script_file.open(QFile.ReadOnly):
        sys.exit('Unable to load JSON configurator!')
    configurator = str(script_file.readAll())
    script_file.close()
    if not configurator:
        sys.exit('Unable to set-up JSON configurator!')

    frame = QWebPage(self).mainFrame()
    # add config object
    frame.addToJavaScriptWindowObject('config', self)
    # apply settings
    frame.evaluateJavaScript(configurator.replace('%1', config_text))
class CSConverter(QObject):
    """Compile CoffeeScript to JavaScript through a compiler run in a QWebPage."""

    def __init__(self, parent = None):
        """Read the bundled compiler as UTF-8 and evaluate it in a private page."""
        QObject.__init__(self, parent)
        self.m_webPage = QWebPage(self)

        compiler_file = QFile(':/resources/coffee-script.js')
        compiler_file.open(QFile.ReadOnly)
        compiler_js = QString.fromUtf8(compiler_file.readAll())
        compiler_file.close()

        frame = self.m_webPage.mainFrame()
        frame.evaluateJavaScript(compiler_js)
        # The page refers to this object as ``converter``.
        frame.addToJavaScriptWindowObject('converter', self)

    def convert(self, script):
        """Return the compiled code, or an empty QString if the result is not a string."""
        self.setProperty('source', script)
        outcome = self.m_webPage.mainFrame().evaluateJavaScript('this.CoffeeScript.compile(converter.source)')
        if outcome.type() != QVariant.String:
            return QString()
        return outcome.toString()
def documentation_load_finished(self, ok):
    # Handler that rebuilds self.doc_contents from the URL currently shown in
    # self.ui.webView. The ``ok`` argument is not used.
    #
    # NOTE(review): the nesting of the last two statements (anchor scroll and
    # tmp.html write) is assumed to be function level - confirm against the
    # original layout.
    url = self.ui.webView.url()
    url_path = str(url.path())
    url_fragment = str(url.fragment()).replace('library.zip.appfuncs.', 'appfuncs.')
    #ExeStdOut(url_path, url_fragment)
    #print ("url", url_path)
    #print ("fragment", url_fragment)
    if url_path.endswith("docs/appfuncs.html"):
        #print (self.controller.function_docs, os.path.isfile(self.controller.function_docs))
        #with open(self.controller.function_docs) as fid:
        #    doc_html = str(fid.read(), 'utf-8')
        doc_html = self.html_source()
        # A trailing ".run" is dropped before the fragment is used as an id.
        if url_fragment.endswith(".run"):
            url_fragment = url_fragment[:-4]
        start_tag = '<dt id="%s">' % url_fragment
        end_tag = '<dt id='
        # Second-to-last dotted component of the fragment.
        scriptFunction_name = url_fragment.split(".")[-2]
        try:
            # Cut out the text from this entry's <dt> up to the next <dt id=.
            method_html = start_tag + doc_html.split(start_tag)[1].split(end_tag)[0]
            # Textual clean-ups applied to the extracted snippet.
            for a, b in [('<tt class="descname">run</tt>', '<tt class="descname">%s</tt>' % scriptFunction_name),
                         ('href="_modules', 'href="source.html?#_modules'),
                         #('href="#appfuncs', 'href="doc.html?#appfuncs'),
                         #('</th>', '</th></tr>\n<tr>'),
                         ("Parameters :", "Parameters:"),
                         ("Return :", "Return:"),
                         ]:
                method_html = method_html.replace(a, b)
        except IndexError:
            # start_tag does not occur in the page source.
            method_html = "Documentation not found"
        self.doc_contents = method_html
    elif url_path.endswith("docs/source.html"):
        # The fragment is expected to hold exactly one "#": <path>#<anchor>.
        source_html_path, self.anchor = str(url.fragment()).split("#")
        with open("%s/%s" % (os.path.dirname(self.controller.function_docs), source_html_path)) as fid:
            source_html = to_str(fid.read())
        source_html = source_html.replace('<a class="viewcode-back" href="../../../ScriptFunctions.html', '<a class="viewcode-back" href="appfuncs.html')
        from PyQt4.QtWebKit import QWebPage
        # Parse the page off-screen to extract the inner markup of div.body.
        p = QWebPage()
        f = p.mainFrame()
        f.setHtml(source_html)
        self.doc_contents = f.findFirstElement('div.body').toInnerXml()
    # NOTE(review): self.anchor is only assigned in the source.html branch
    # above; presumably it is initialised elsewhere - confirm.
    if self.anchor:
        self.ui.webView.page().mainFrame().scrollToAnchor(self.anchor)
    with open(self.doc_path + "tmp.html", 'w') as fid:
        fid.write(self.html_source())
def __init__(self, parent, jsonFile):
    """Register default settings as dynamic properties and apply *jsonFile*.

    The settings table is passed to the ``ConfigInit`` action before the
    properties are created. The file content is then substituted into the
    bundled configurator script and evaluated in a page that can reach this
    object as ``config``. If the content does not start with ``{`` and end
    with ``}``, a warning is emitted and nothing is applied.
    """
    super(Config, self).__init__(parent)

    with codecs.open(jsonFile, encoding='utf-8') as fd:
        raw_config = fd.read()

    self.settings = {
        'cookiesFile': {'mapping': 'cookies_file', 'default': None},
        'debug': {'mapping': 'debug', 'default': None},
        'diskCache': {'mapping': 'disk_cache', 'default': False},
        'ignoreSslErrors': {'mapping': 'ignore_ssl_errors', 'default': False},
        'loadImages': {'mapping': 'load_images', 'default': True},
        'loadPlugins': {'mapping': 'load_plugins', 'default': False},
        'localToRemoteUrlAccessEnabled': {'mapping': 'local_to_remote_url_access', 'default': False},
        'maxDiskCacheSize': {'mapping': 'max_disk_cache_size', 'default': -1},
        'outputEncoding': {'mapping': 'output_encoding', 'default': 'System'},
        'proxy': {'mapping': 'proxy', 'default': None},
        'scriptEncoding': {'mapping': 'script_encoding', 'default': 'utf-8'},
        'verbose': {'mapping': 'verbose', 'default': False}
    }

    do_action('ConfigInit', self.settings)

    # generate dynamic properties
    for key in self.settings:
        default_value = self.settings[key]['default']
        self.setProperty(key, default_value)

    # now it's time to parse our JSON file
    starts_ok = raw_config.lstrip().startswith('{')
    if not starts_ok or not raw_config.rstrip().endswith('}'):
        qWarning('Config file MUST be in JSON format!')
        return

    with QPyFile(':/configurator.js') as f:
        configurator = f.readAll()

    main_frame = QWebPage(self).mainFrame()
    # add config object
    main_frame.addToJavaScriptWindowObject('config', self)
    # apply settings
    main_frame.evaluateJavaScript(configurator.replace('%1', raw_config))
class Thumbnailer(QObject):
    """Load a URL off-screen and save the rendered page as ``thumbnail.png``."""

    # Emitted after the image file has been written.
    finished = pyqtSignal()

    def __init__(self, parent=None):
        super(Thumbnailer, self).__init__(parent)
        page = QWebPage(self)
        self.webPage = page
        self.mainFrame = page.mainFrame()
        # Render as soon as the page reports that loading has finished.
        page.loadFinished.connect(self.render)

    def load(self, url):
        """Start loading *url* in the page's main frame."""
        self.webPage.mainFrame().load(QUrl(url))

    def render(self):
        """Paint the whole page into an image, save it and emit ``finished``."""
        frame = self.webPage.mainFrame()
        # Grow the viewport to the full content size so nothing is clipped.
        self.webPage.setViewportSize(frame.contentsSize())

        image = QImage(self.webPage.viewportSize(), QImage.Format_ARGB32)
        painter = QPainter()
        painter.begin(image)
        frame.render(painter)
        painter.end()

        image.save('thumbnail.png')
        self.finished.emit()
def formatQAAsImage(html, type, cid, mid, fact, tags, cm, deck):
    """Render one side of a card to an image and return an ``<img>`` tag for it.

    ``html`` is wrapped in the card markup and styled with the deck CSS, then
    painted from an off-screen page. The image is stored through
    ``saveImage`` and the returned link points at the stored path.
    """
    # build up the html
    side = type[0]
    card_div = '''<div class="card%s" id="cm%s%s">%s</div>''' % (
        side, side, hexifyID(cm.id), html)

    # 0 -> center, 1 -> left, anything else -> right
    alignment = getattr(cm, type + 'Align')
    if alignment == 0:
        align = "center"
    elif alignment == 1:
        align = "left"
    else:
        align = "right"

    table_open = "<center><table width=95%%><tr><td align=%s>" % align
    html = table_open + card_div + "</td></tr></table></center>"
    body = "<body><br><center>%s</center></body>" % (html)
    background = "body { background-color: #fff; }\n"
    html = "<style>\n" + background + deck.rebuildCSS() + "</style>\n" + body

    # create the web page object
    page = QWebPage()
    page.mainFrame().setHtml(html)
    # size everything all nice
    page = fitContentsInPage(page)

    image = QImage(page.viewportSize(), QImage.Format_ARGB32_Premultiplied)
    painter = QPainter(image)
    page.mainFrame().render(painter)
    painter.end()

    path = saveImage(image, deck)
    return u"<img src=\"%s\">" % (path)
class HTMLTableRenderer(QObject):
    """Render an HTML table to one or more PNG slices in a temp directory."""

    def __init__(self, html, base_dir, width, height, dpi, factor):
        '''
        `width, height`: page width and height in pixels
        `base_dir`: The directory in which the HTML file that contains the table resides
        '''
        QObject.__init__(self)

        self.app = None
        self.width, self.height, self.dpi = width, height, dpi
        self.base_dir = base_dir
        self.images = []
        self.tdir = tempfile.mkdtemp(prefix='calibre_render_table')
        self.loop = QEventLoop()

        self.page = QWebPage()
        self.page.loadFinished.connect(self.render_html)
        frame = self.page.mainFrame()
        frame.setTextSizeMultiplier(factor)
        base_url = QUrl('file:' + os.path.abspath(self.base_dir))
        frame.setHtml(html, base_url)

    def render_html(self, ok):
        """Slice the rendered page into images; always quits the application.

        Each slice is saved in ``self.tdir`` and recorded in ``self.images``
        as ``(path, width, height)``.
        """
        try:
            if not ok:
                return

            frame = self.page.mainFrame()
            cwidth = frame.contentsSize().width()
            cheight = frame.contentsSize().height()
            self.page.setViewportSize(QSize(cwidth, cheight))

            # Content wider than the page is scaled down later, so a slice
            # may be proportionally taller before scaling.
            if cwidth > self.width:
                factor = float(self.width) / cwidth
            else:
                factor = 1
            cutoff_height = int(self.height / factor) - 3

            image = QImage(self.page.viewportSize(), QImage.Format_ARGB32)
            dots_per_meter = self.dpi * (100 / 2.54)
            image.setDotsPerMeterX(dots_per_meter)
            image.setDotsPerMeterY(dots_per_meter)
            painter = QPainter(image)
            frame.render(painter)
            painter.end()

            cheight = image.height()
            cwidth = image.width()
            too_wide = cwidth > self.width
            # Consecutive slices overlap by 20 pixels.
            step = cutoff_height - 20
            pos = 0
            while pos < cheight:
                slice_height = min(cheight - pos, cutoff_height)
                img = image.copy(0, pos, cwidth, slice_height)
                pos += step
                if too_wide:
                    img = img.scaledToWidth(self.width, Qt.SmoothTransform)
                f = os.path.join(self.tdir, '%d.png' % pos)
                img.save(f)
                self.images.append((f, img.width(), img.height()))
        finally:
            QApplication.quit()
class HTMLTableRenderer(QObject):
    """Paint an HTML table off-screen and store it as PNG slices."""

    def __init__(self, html, base_dir, width, height, dpi, factor):
        '''
        `width, height`: page width and height in pixels
        `base_dir`: The directory in which the HTML file that contains the table resides
        '''
        QObject.__init__(self)
        self.app = None
        self.width = width
        self.height = height
        self.dpi = dpi
        self.base_dir = base_dir
        self.images = []
        self.tdir = tempfile.mkdtemp(prefix='calibre_render_table')
        self.loop = QEventLoop()
        self.page = QWebPage()
        self.connect(self.page, SIGNAL('loadFinished(bool)'), self.render_html)
        main_frame = self.page.mainFrame()
        main_frame.setTextSizeMultiplier(factor)
        main_frame.setHtml(html, QUrl('file:'+os.path.abspath(self.base_dir)))

    def render_html(self, ok):
        """Cut the rendered page into images and quit the application.

        Slices are written to ``self.tdir``; ``self.images`` receives one
        ``(path, width, height)`` tuple per slice. The application is quit
        even when loading failed or an error occurred.
        """
        try:
            if ok:
                contents = self.page.mainFrame().contentsSize()
                cwidth, cheight = contents.width(), self.page.mainFrame().contentsSize().height()
                self.page.setViewportSize(QSize(cwidth, cheight))

                # Wider-than-page content gets scaled to the page width below.
                factor = 1
                if cwidth > self.width:
                    factor = float(self.width)/cwidth
                cutoff_height = int(self.height/factor)-3

                image = QImage(self.page.viewportSize(), QImage.Format_ARGB32)
                image.setDotsPerMeterX(self.dpi*(100/2.54))
                image.setDotsPerMeterY(self.dpi*(100/2.54))
                painter = QPainter(image)
                self.page.mainFrame().render(painter)
                painter.end()

                cheight, cwidth = image.height(), image.width()
                pos = 0
                while pos < cheight:
                    piece = image.copy(0, pos, cwidth,
                                       min(cheight-pos, cutoff_height))
                    # Advance by slightly less than a slice so slices overlap.
                    pos += cutoff_height-20
                    if cwidth > self.width:
                        piece = piece.scaledToWidth(self.width, Qt.SmoothTransform)
                    target = os.path.join(self.tdir, '%d.png'%pos)
                    piece.save(target)
                    self.images.append((target, piece.width(), piece.height()))
        finally:
            QApplication.quit()
def reply_finished(self, datareply):
    """Parse a finished network reply and store the result in ``self._reply``.

    For ``"definition"`` the reply becomes ``(html, form_titles)``; for
    ``"synonyme"`` / ``"antonyme"`` it becomes the list of matching cell texts.
    """
    page = QWebPage()
    page.mainFrame().setContent(datareply.readAll())
    webpage = page.mainFrame().documentElement()

    if self.wordclass == "definition":
        content_boxes = webpage.findAll("div#contentbox")

        # Selector of the headword element that is cut out of the content.
        selector = ""
        if not self.combo2_index:
            selector = "div.tlf_cvedette"
        if 1 <= self.combo2_index <= 3:
            selector = "span.tlf_cvedette"
        string_to_remove = webpage.findAll(selector).first().toInnerXml()
        final_page = content_boxes.first().toInnerXml()
        resultf = final_page.replace(string_to_remove, '')

        links = webpage.findFirst('div#vtoolbar').findAll("a[href]")
        self.formtype = []
        for index in range(len(links)):
            title = unicode(links.at(index).toPlainText())
            # Delete digits in definition title
            self.formtype.append(''.join(c for c in title if not c.isdigit()))
        self._reply = resultf, self.formtype

    if self.wordclass in ("synonyme", "antonyme"):
        self._reply = []
        cells = webpage.findAll("td." + self.wordclass[:4] + "_format")
        tag = []
        for index in range(len(cells)):
            tag.append(cells.at(index).firstChild().toPlainText())
        self._reply = tag
def __init__(self):
    """Create the web page, wire its load signals and apply the settings."""
    self.application = app

    page = QWebPage()
    page.setForwardUnsupportedContent(True)
    page.loadFinished.connect(self._on_load_finished)
    page.loadStarted.connect(self._on_load_started)
    self.webpage = page
    self.webframe = page.mainFrame()

    # Request and load bookkeeping.
    self.headers = []
    self._load_timeout = -1
    self._load_success = False

    self.setSettings()
def run(self):
    """Create one configured QWebPage per input and start loading it.

    Each page shares ``self.am``, reports progress through this object's
    ``loadProgress`` / ``loadFinished`` / ``loadStarted`` handlers and is
    appended to ``self.pages``. An input of ``'-'`` means "read the document
    from standard input"; it is spooled to a temporary ``.html`` file first.
    """
    for in_url in self.in_:
        webpage = QWebPage()
        webpage.setNetworkAccessManager(self.am)

        webpage.connect(webpage, SIGNAL('loadProgress(int)'), self.loadProgress)
        webpage.connect(webpage, SIGNAL('loadFinished(bool)'), self.loadFinished)
        webpage.connect(webpage, SIGNAL('loadStarted()'), self.loadStarted)

        webpage.settings().setAttribute(QWebSettings.JavaEnabled, self.enable_plugins)
        webpage.settings().setAttribute(QWebSettings.JavascriptEnabled, not self.disable_javascript)
        webpage.settings().setAttribute(QWebSettings.JavascriptCanOpenWindows, False)
        webpage.settings().setAttribute(QWebSettings.JavascriptCanAccessClipboard, False)
        webpage.settings().setAttribute(QWebSettings.PrintElementBackgrounds, self.background)
        webpage.settings().setAttribute(QWebSettings.PluginsEnabled, self.enable_plugins)
        if self.user_style_sheet:
            webpage.settings().setUserStyleSheetUrl(self.guessUrlFromString(self.user_style_sheet))

        url = in_url
        if url == '-':
            tmp = tempfile.NamedTemporaryFile(prefix='tmp', suffix='.html')
            tmp.write(sys.stdin.read())
            # The page reads the file by name, so buffered data must reach
            # the disk before the load starts.
            tmp.flush()
            # NamedTemporaryFile deletes the file when the object is closed
            # or collected. Loading is asynchronous, so tie the file's
            # lifetime to the page instead of to this loop iteration.
            webpage._stdin_tmp_file = tmp
            url = tmp.name
        url = self.guessUrlFromString(url)

        webpage.mainFrame().load(url)
        self.pages.append(webpage)
def webpageScreenshot(self, html):
    """Take a screenshot of a given html document and return it as a QImage.

    The document is rendered in an off-screen page whose viewport has the
    size of this widget (``self.size()``).
    """
    # see http://www.blogs.uni-osnabrueck.de/rotapken/2008/12/03/create-screenshots-of-a-web-page-using-python-and-qtwebkit/
    size = self.size()
    # size = self.collectionView.page().viewportSize()  # seems to be wrongly initialized sometimes...
    webpage = QWebPage()
    webpage.setLinkDelegationPolicy(QWebPage.DelegateAllLinks)
    webpage.setViewportSize(size)
    webpage.mainFrame().setHtml(html)

    # need to wait for the different elements to have loaded completely
    # sys.platform is "linux2" on Python 2 but plain "linux" on Python 3.3+,
    # so test the prefix rather than the exact string.
    if sys.platform.startswith("linux"):
        while QApplication.hasPendingEvents():
            QApplication.processEvents()
    else:
        QApplication.processEvents()

    image = QImage(size, QImage.Format_ARGB32)
    painter = QPainter(image)
    webpage.mainFrame().render(painter)
    painter.end()
    return image
def __init__(self):
    """Set up logging, the web page with its load signals, and the settings."""
    self.application = app
    self.logger = Logger.getLogger()

    page = QWebPage()
    page.setForwardUnsupportedContent(True)
    page.loadFinished.connect(self._on_load_finished)
    page.loadStarted.connect(self._on_load_started)
    self.webpage = page
    self.webframe = page.mainFrame()

    # Request and load bookkeeping.
    self.headers = []
    self._load_timeout = -1
    self._load_success = False

    self.setSettings()
class CSConverter(QObject):
    """Compile CoffeeScript to JavaScript with a compiler hosted in a QWebPage."""

    def __init__(self, parent=None):
        """Evaluate the bundled compiler in a page owned by this object.

        Exits the process if the compiler resource cannot be opened.
        """
        QObject.__init__(self, parent)
        self.m_webPage = QWebPage(self)

        resource = QFile(':/resources/coffee-script.js')
        if not resource.open(QFile.ReadOnly):
            sys.exit('CoffeeScript compiler is not available!')
        compiler_text = str(resource.readAll())
        resource.close()

        web_frame = self.m_webPage.mainFrame()
        web_frame.evaluateJavaScript(compiler_text)
        # The page refers to this object as ``converter``.
        web_frame.addToJavaScriptWindowObject('converter', self)

    def convert(self, script):
        """Compile *script* and return the raw evaluation result.

        The evaluated JavaScript yields ``[true, <compiled code>]`` on success
        and ``[false, <error message>]`` when compilation throws.
        """
        self.setProperty('source', script)
        return self.m_webPage.mainFrame().evaluateJavaScript('''try { [true, this.CoffeeScript.compile(converter.source)]; } catch (error) { [false, error.message]; } ''')
class Sandbox(object):
    """Execute javascript in webkit browser."""

    def __init__(self, parent=None):
        """Create the application, a web page and load an empty URL into it."""
        self.app = QApplication(['dummy'])
        self.webpage = QWebPage()
        self.webframe = self.webpage.mainFrame()
        self.webframe.load(QUrl(''))

    def execute(self, script):
        """Evaluate *script*; return its result as ``str``, or None if falsy."""
        value = self.webframe.evaluateJavaScript(script)
        if not value:
            return None
        return str(value.toString())

    def close(self):
        """Ask the application to exit."""
        self.app.exit()
class Sandbox(object):
    """Execute javascript in webkit browser."""

    def __init__(self, parent=None):
        """Start a QApplication and prepare a frame that scripts can run in."""
        application = QApplication(['dummy'])
        page = QWebPage()
        frame = page.mainFrame()

        self.app = application
        self.webpage = page
        self.webframe = frame
        frame.load(QUrl(''))

    def execute(self, script):
        """Run *script* in the frame and return the result converted to ``str``.

        Returns None when the evaluation result is falsy.
        """
        outcome = self.webframe.evaluateJavaScript(script)
        return str(outcome.toString()) if outcome else None

    def close(self):
        """Ask the application to exit."""
        self.app.exit()
class Crawler:
    # Drives the whole pipeline (see main()): prepares the run directories,
    # loads or scrapes the species list, and downloads ortholog data for
    # every species pair that is not cached yet. Python 2 code (print
    # statements, tuple parameters).
    #
    # NOTE: the attributes below are class-level; the dict and list objects
    # are shared by all instances unless rebound on an instance.
    geneToOrthologs = {}
    geneToSpecies = {}
    geneSequences = {}
    geneFamilies = None  # A list of sets containing the proteins in that family
    allSpecies = None
    species1Names = None
    species2Names = None
    speciesPairs = []
    malformedXMLFiles = []

    def main(self):
        # Create the run directory and its sub-directories when missing, run
        # every pipeline stage in order, then exit the process with status 0.
        if not os.path.isdir(run_name):
            os.mkdir(run_name)
        if not os.path.isdir(run_name + '/clustalin'):
            os.mkdir(run_name + '/clustalin')
        if not os.path.isdir(run_name + '/clustalout'):
            os.mkdir(run_name + '/clustalout')
        if not os.path.isdir(run_name + '/roundup'):
            os.mkdir(run_name + '/roundup')
        if not os.path.isdir(run_name + '/mktest_out'):
            os.mkdir(run_name + '/mktest_out')
        self.load_species_names_list()
        self.fetch_uncached_orthologs()
        self.load_gene_list()
        self.find_gene_families()
        # self.output_gene_families()
        self.fetch_gene_sequences()
        self.align_families()
        self.mktest_families()
        exit(0)

    ############################################# load_species_name_list #############################################
    def load_species_names_list(self):
        # Fill allSpecies / species1Names / species2Names, either from the
        # cached species_names.json in the run directory or by loading the
        # roundup retrieve page and waiting for process_organism_list().
        if os.path.isfile('%s/species_names.json' % run_name):
            print "Loading cached species names..."
            sn = cjson.decode(open('%s/species_names.json' % run_name).read())
            self.allSpecies = sn['allSpecies']
            self.species1Names = sn['species1Names']
            self.species2Names = sn['species2Names']
        else:
            print "Fetching species names..."
            self.webpage = QWebPage()
            self.webpage.loadFinished.connect(self.process_organism_list)
            self.webpage.mainFrame().load(
                QUrl('http://roundup.hms.harvard.edu/retrieve/'))
            # Pump Qt events until the loadFinished handler has replaced
            # allSpecies with a list.
            while self.allSpecies == None:
                time.sleep(.05)
                appInstance.processEvents()

    def process_organism_list(self, bool):
        # loadFinished handler: collect the 'value' attribute of the children
        # of the select#id_genome_choices element, split the names into the
        # two species groups and cache all three lists as JSON.
        # The load-status argument is not inspected.
        organisms_query = 'select#id_genome_choices'
        organisms_element = self.webpage.mainFrame().findAllElements(
            organisms_query).at(0)
        elmt = organisms_element.firstChild()
        self.allSpecies = []
        while True:
            # The loop stops on reaching the last child, so that child's
            # value is not appended. NOTE(review): confirm this is intended.
            if elmt == organisms_element.lastChild():
                break
            self.allSpecies.append(str(elmt.attribute('value')))
            elmt = elmt.nextSibling()
        self.species1Names = filter(is_species_1, self.allSpecies)
        self.species2Names = filter(is_species_2, self.allSpecies)
        s_cnt, s1_cnt, s2_cnt = len(self.allSpecies), len(
            self.species1Names), len(self.species2Names)
        print "Found %i species, %i of type 1 and %i of type 2." % (
            s_cnt, s1_cnt, s2_cnt)
        savedict = {
            'allSpecies': self.allSpecies,
            'species1Names': self.species1Names,
            'species2Names': self.species2Names
        }
        open('%s/species_names.json' % run_name,
             'w').write(cjson.encode(savedict))

    ############################################# fetch_uncached_orthologs #############################################
    def fetch_uncached_orthologs(self):
        # Build the list of species pairs (bridges plus all 2-combinations
        # within each group) and download every pair whose XML file is not
        # yet present under <run_name>/roundup, using a green pool of size 5.
        self.downloader_pool = eventlet.greenpool.GreenPool(size=5)
        self.pairs_to_download = []
        bridge_pairs = bridges(self.species1Names, self.species2Names)
        print "Bridges:\n\t%s" % ('\n\t'.join(
            itertools.starmap(self.cache_name, bridge_pairs)))
        # n * (n - 1) / 2 pairs within each group; only used in the message.
        combs1 = len(self.species1Names) * (len(self.species1Names) - 1) / 2
        combs2 = len(self.species2Names) * (len(self.species2Names) - 1) / 2
        self.speciesPairs.extend(bridge_pairs)
        self.speciesPairs.extend(itertools.combinations(self.species1Names, 2))
        self.speciesPairs.extend(itertools.combinations(self.species2Names, 2))
        print "That's %i combinations of species1, %i of species2, %i bridges." % (
            combs1, combs2, len(bridge_pairs))
        numPairs = len(self.speciesPairs)
        for i in xrange(numPairs):
            l, r = self.speciesPairs[i]
            # Progress line every 20 pairs. "\x1B[1F" is an ANSI escape
            # sequence; presumably it moves the cursor back up so the next
            # progress line overwrites this one - TODO confirm.
            if i % 20 == 0:
                print "%i%% (%i/%i)\x1B[1F" % (int(
                    i * 100.0 / numPairs), i, numPairs)
            if not os.path.isfile('%s/roundup/%s.xml' %
                                  (run_name, self.cache_name(l, r))):
                self.pairs_to_download.append((l, r))
        num_to_dl = len(self.pairs_to_download)
        print "Fetching %i uncached combinations of species..." % num_to_dl
        pdp = self.downloader_pool.imap(self.fetch_pair,
                                        self.pairs_to_download)
        i = 0
        for response in pdp:
            i += 1
            cachename = self.cache_name(*response)
            print "%i%% (%i/%i): %s\x1B[1F" % (int(
                i * 100.0 / num_to_dl), i, num_to_dl, cachename)

    def cache_name(self, lSpecies, rSpecies):
        # Return "<lSpecies>---<rSpecies>" with every character dropped that
        # is not a letter, a digit or one of "-_.() ".
        name = lSpecies + '---' + rSpecies
        valid_chrs = '-_.() %s%s' % (string.ascii_letters, string.digits)
        filename = ''.join(c for c in name if c in valid_chrs)
        return filename

    def fetch_pair(self, (lSpecies, rSpecies)):
        # Call attempt_fetch_pair() for the pair, retrying without limit
        # while it raises urllib2.URLError, and return the pair.
        while True:
            try:
                self.attempt_fetch_pair((lSpecies, rSpecies))
                break
            except urllib2.URLError as e:
                print "Error fetching (%s,%s): %s" % (lSpecies, rSpecies, e)
        return (lSpecies, rSpecies)
class Browser:
    """
    Stateful programmatic web browser class based upon QtWebKit.

    >>> browser = Browser()
    >>> browser.load("http://www.wordreference.com")
    >>> browser.runjs("console.log('I can run Javascript!')")
    >>> browser.runjs("_jQuery('div').css('border', 'solid red')") # and jQuery!
    >>> browser.select("#esen")
    >>> browser.fill("input[name=enit]", "hola")
    >>> browser.click("input[name=b]", wait_load=True)
    >>> print browser.url, len(browser.html)
    >>> browser.close()
    """
    ignore_ssl_errors = True
    """@ivar: If True, ignore SSL certificate errors."""
    user_agent = None
    """@ivar: User agent for requests (see QWebPage::userAgentForUrl for details)"""
    jslib = "_jQuery"
    """@ivar: Library name for jQuery library injected by default to pages."""
    download_directory = "."
    """@ivar: Directory where downloaded files will be stored."""
    debug_stream = sys.stderr
    """@ivar: File-like stream where debug output will be written."""
    debug_level = ERROR
    """@ivar: Debug verbose level (L{ERROR}, L{WARNING}, L{INFO} or L{DEBUG})."""
    event_looptime = 0.01
    """@ivar: Event loop dispatcher loop delay (seconds)."""

    _javascript_files = ["jquery.min.js", "jquery.simulate.js"]

    _javascript_directories = [
        os.path.join(os.path.dirname(__file__), "../javascript"),
        os.path.join(sys.prefix, "share/spynner/javascript"),
    ]

    def __init__(self, qappargs=None, debug_level=None):
        """
        Init a Browser instance.

        @param qappargs: Arguments for QApplication constructor.
        @param debug_level: Debug level logging (L{ERROR} by default)
        @raise SpynnerError: If none of the known javascript directories
                             exists.
        """
        self.application = QApplication(qappargs or [])
        """PyQt4.QtGui.Qapplication object."""
        if debug_level is not None:
            self.debug_level = debug_level
        self.webpage = QWebPage()
        """PyQt4.QtWebKit.QWebPage object."""
        self.webpage.userAgentForUrl = self._user_agent_for_url
        self.webframe = self.webpage.mainFrame()
        """PyQt4.QtWebKit.QWebFrame main webframe object."""
        self.webview = None
        """PyQt4.QtWebKit.QWebView object."""
        self._url_filter = None
        self._html_parser = None

        # Javascript
        directory = _first(self._javascript_directories, os.path.isdir)
        if not directory:
            raise SpynnerError("Cannot find javascript directory: %s" %
                               self._javascript_directories)
        # Read every library file with an explicit close so no handle is
        # left open for the lifetime of the process.
        sources = []
        for fn in self._javascript_files:
            with open(os.path.join(directory, fn)) as jsfile:
                sources.append(jsfile.read())
        self.javascript = "".join(sources)

        self.webpage.javaScriptAlert = self._javascript_alert
        self.webpage.javaScriptConsoleMessage = self._javascript_console_message
        self.webpage.javaScriptConfirm = self._javascript_confirm
        self.webpage.javaScriptPrompt = self._javascript_prompt
        # Every optional callback starts as None so the handlers that test
        # them never hit an AttributeError before a setter has been called.
        self._javascript_confirm_callback = None
        self._javascript_prompt_callback = None
        self._http_authentication_callback = None
        self._download_reply_status = None

        # Network Access Manager and cookies
        self.manager = QNetworkAccessManager()
        """PyQt4.QtNetwork.QTNetworkAccessManager object."""
        self.manager.createRequest = self._manager_create_request
        self.webpage.setNetworkAccessManager(self.manager)
        self.cookiesjar = _ExtendedNetworkCookieJar()
        """PyQt4.QtNetwork.QNetworkCookieJar object."""
        self.manager.setCookieJar(self.cookiesjar)
        self.manager.connect(
            self.manager,
            SIGNAL("sslErrors(QNetworkReply *, const QList<QSslError> &)"),
            self._on_manager_ssl_errors)
        self.manager.connect(self.manager,
                             SIGNAL('finished(QNetworkReply *)'),
                             self._on_reply)
        self.manager.connect(
            self.manager,
            SIGNAL('authenticationRequired(QNetworkReply *, QAuthenticator *)'),
            self._on_authentication_required)
        self._operation_names = dict(
            (getattr(QNetworkAccessManager, s + "Operation"), s.lower())
            for s in ("Get", "Head", "Post", "Put"))

        # Webpage slots
        self._load_status = None
        self._replies = 0
        self.webpage.setForwardUnsupportedContent(True)
        self.webpage.connect(self.webpage,
                             SIGNAL('unsupportedContent(QNetworkReply *)'),
                             self._on_unsupported_content)
        self.webpage.connect(self.webpage, SIGNAL('loadFinished(bool)'),
                             self._on_load_finished)
        self.webpage.connect(self.webpage, SIGNAL("loadStarted()"),
                             self._on_load_started)

    def _events_loop(self, wait=None):
        """Process pending Qt events once, then sleep C{wait} seconds.

        @param wait: Seconds to sleep (L{event_looptime} when None).
        """
        if wait is None:
            wait = self.event_looptime
        self.application.processEvents()
        time.sleep(wait)

    def _on_load_started(self):
        """Slot for loadStarted(): reset the load status."""
        self._load_status = None
        self._debug(INFO, "Page load started")

    def _on_manager_ssl_errors(self, reply, errors):
        """Slot for sslErrors(): ignore or just report, per L{ignore_ssl_errors}."""
        url = unicode(reply.url().toString())
        if self.ignore_ssl_errors:
            self._debug(WARNING, "SSL certificate error ignored: %s" % url)
            reply.ignoreSslErrors()
        else:
            self._debug(WARNING, "SSL certificate error: %s" % url)

    def _on_authentication_required(self, reply, authenticator):
        """Slot for authenticationRequired(): ask the user callback for
        credentials and hand them to the authenticator."""
        url = unicode(reply.url().toString())
        realm = unicode(authenticator.realm())
        # The level argument was missing here, so the message text was
        # being used as the level and nothing was ever logged.
        self._debug(INFO, "HTTP auth required: %s (realm: %s)" % (url, realm))
        if not self._http_authentication_callback:
            self._debug(WARNING, "HTTP auth required, but no callback defined")
            return
        credentials = self._http_authentication_callback(url, realm)
        if credentials:
            user, password = credentials
            self._debug(INFO, "callback returned HTTP credentials: %s/%s" %
                        (user, "*" * len(password)))
            authenticator.setUser(user)
            authenticator.setPassword(password)
        else:
            self._debug(WARNING, "HTTP auth callback returned no credentials")

    def _manager_create_request(self, operation, request, data):
        """Replacement for QNetworkAccessManager.createRequest: log the
        request and apply the URL filter (rejected URLs are redirected to
        about:blank) before delegating to the real implementation."""
        url = unicode(request.url().toString())
        # Only Get/Head/Post/Put are registered; any other operation used
        # to raise KeyError from inside the network stack.
        operation_name = self._operation_names.get(operation, "unknown")
        self._debug(INFO, "Request: %s %s" % (operation_name.upper(), url))
        for h in request.rawHeaderList():
            self._debug(DEBUG, " %s: %s" % (h, request.rawHeader(h)))
        if self._url_filter:
            # Only an explicit False rejects; None (no return) lets it pass.
            if self._url_filter(operation_name, url) is False:
                self._debug(INFO, "URL filtered: %s" % url)
                request.setUrl(QUrl("about:blank"))
            else:
                self._debug(DEBUG, "URL not filtered: %s" % url)
        reply = QNetworkAccessManager.createRequest(self.manager, operation,
                                                    request, data)
        return reply

    def _on_reply(self, reply):
        """Slot for finished(): count the reply (see L{click}) and log it."""
        self._replies += 1
        url = unicode(reply.url().toString())
        if reply.error():
            self._debug(WARNING, "Reply error: %s - %d (%s)" %
                        (url, reply.error(), reply.errorString()))
        else:
            self._debug(INFO, "Reply successful: %s" % url)
        for header in reply.rawHeaderList():
            self._debug(DEBUG, " %s: %s" % (header, reply.rawHeader(header)))

    def _on_unsupported_content(self, reply, outfd=None):
        """Slot for unsupportedContent(): download what cannot be rendered."""
        if not reply.error():
            self._start_download(reply, outfd)
        else:
            self._debug(ERROR, "Error on unsupported content: %s" %
                        reply.errorString())

    def _javascript_alert(self, webframe, message):
        """Log a Javascript alert; show the dialog only if a webview exists."""
        self._debug(INFO, "Javascript alert: %s" % message)
        if self.webview:
            QWebPage.javaScriptAlert(self.webpage, webframe, message)

    def _javascript_console_message(self, message, line, sourceid):
        """Log a Javascript console message."""
        if line:
            self._debug(INFO, "Javascript console (%s:%d): %s" %
                        (sourceid, line, message))
        else:
            self._debug(INFO, "Javascript console: %s" % message)

    def _javascript_confirm(self, webframe, message):
        """Answer a Javascript confirm through the user callback, falling
        back to the default QWebPage behaviour when none is set."""
        smessage = unicode(message)
        url = webframe.url()
        self._debug(INFO, "Javascript confirm (webframe url = %s): %s" %
                    (url, smessage))
        if self._javascript_confirm_callback:
            value = self._javascript_confirm_callback(url, smessage)
            self._debug(INFO, "Javascript confirm callback returned %s" % value)
            return value
        return QWebPage.javaScriptConfirm(self.webpage, webframe, message)

    def _javascript_prompt(self, webframe, message, defaultvalue, result):
        """Answer a Javascript prompt through the user callback, falling
        back to the default QWebPage behaviour when none is set."""
        url = webframe.url()
        smessage = unicode(message)
        self._debug(INFO, "Javascript prompt (webframe url = %s): %s" %
                    (url, smessage))
        if self._javascript_prompt_callback:
            value = self._javascript_prompt_callback(url, smessage,
                                                     defaultvalue)
            self._debug(INFO, "Javascript prompt callback returned: %s" % value)
            if value in (False, None):
                # The callback cancelled the prompt.
                return False
            result.clear()
            result.append(value)
            return True
        return QWebPage.javaScriptPrompt(self.webpage, webframe, message,
                                         defaultvalue, result)

    def _on_webview_destroyed(self, window):
        """Slot for destroyed(): forget the webview."""
        self.webview = None

    def _on_load_finished(self, successful):
        """Slot for loadFinished(bool): record and log the load status."""
        self._load_status = successful
        status = {True: "successful", False: "error"}[successful]
        self._debug(INFO, "Page load finished (%d bytes): %s (%s)" %
                    (len(self.html), self.url, status))

    def _get_filepath_for_url(self, url):
        """Return the local path for a download of C{url} (host + URL path
        under L{download_directory}), creating parent directories."""
        urlinfo = urlparse.urlsplit(url)
        path = os.path.join(self.download_directory,
                            urlinfo.netloc + urlinfo.path)
        if not os.path.isdir(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        return path

    def _start_download(self, reply, outfd):
        """Stream the body of C{reply} into C{outfd}.

        @param reply: QNetworkReply to read from.
        @param outfd: File-like stream to write to. If None, a file derived
                      from the URL is opened (and closed again when the
                      reply finishes).
        """
        url = unicode(reply.url().toString())
        # Remember whether the stream is ours: a caller-supplied stream
        # must stay open, one we open here must not leak.
        owns_outfd = outfd is None
        if owns_outfd:
            path = self._get_filepath_for_url(url)
            outfd = open(path, "wb")

        def _on_ready_read():
            data = reply.readAll()
            if not hasattr(reply, "downloaded_nbytes"):
                reply.downloaded_nbytes = 0
            reply.downloaded_nbytes += len(data)
            outfd.write(data)
            self._debug(DEBUG, "Read from download stream (%d bytes): %s"
                        % (len(data), url))

        def _on_network_error(code=None):
            self._debug(ERROR, "Network error on download: %s" % url)

        def _on_finished():
            if owns_outfd:
                outfd.close()
            self._debug(INFO, "Download finished: %s" % url)

        reply.connect(reply, SIGNAL("readyRead()"), _on_ready_read)
        # QNetworkReply reports failures through error(NetworkError); the
        # previous "NetworkError()" signature does not exist.
        reply.connect(reply, SIGNAL("error(QNetworkReply::NetworkError)"),
                      _on_network_error)
        reply.connect(reply, SIGNAL("finished()"), _on_finished)
        self._debug(INFO, "Start download: %s" % url)

    def _wait_load(self, timeout=None):
        """Run the event loop until the current load finishes.

        On a successful load the jQuery libraries are injected under the
        L{jslib} alias and the viewport is resized to the page contents.

        @param timeout: Seconds to wait (None or 0: wait forever).
        @return: Load status (boolean).
        @raise SpynnerTimeout: If timeout is reached.
        """
        self._events_loop(0.0)
        if self._load_status is not None:
            # The load already completed; consume and return its status.
            load_status = self._load_status
            self._load_status = None
            return load_status
        itime = time.time()
        while self._load_status is None:
            if timeout and time.time() - itime > timeout:
                raise SpynnerTimeout("Timeout reached: %d seconds" % timeout)
            self._events_loop()
        self._events_loop(0.0)
        if self._load_status:
            jscode = "var %s = jQuery.noConflict();" % self.jslib
            self.runjs(self.javascript + jscode, debug=False)
            self.webpage.setViewportSize(
                self.webpage.mainFrame().contentsSize())
        load_status = self._load_status
        self._load_status = None
        return load_status

    def _debug(self, level, *args):
        """Forward C{args} to the module-level debug writer when C{level}
        is enabled by L{debug_level}."""
        if level <= self.debug_level:
            kwargs = dict(outfd=self.debug_stream)
            _debug(*args, **kwargs)

    def _user_agent_for_url(self, url):
        """Return L{user_agent} if set, else the QWebPage default."""
        if self.user_agent:
            return self.user_agent
        return QWebPage.userAgentForUrl(self.webpage, url)

    def _runjs_on_jquery(self, name, code):
        """Run a jQuery expression and fail if it matched no element.

        @raise SpynnerJavascriptError: If the expression yields no elements
                                       or does not evaluate to a number.
        """
        code2 = "result = %s; result.length" % code
        # QVariant.toInt() returns an (int, ok) pair; comparing that pair
        # with 1 directly never detected an empty match.
        count, ok = self.runjs(code2).toInt()
        if not ok or count < 1:
            raise SpynnerJavascriptError("error on %s: %s" % (name, code))

    def _get_html(self):
        return unicode(self.webframe.toHtml())

    def _get_soup(self):
        if not self._html_parser:
            raise SpynnerError("Cannot get soup with no HTML parser defined")
        return self._html_parser(self.html)

    def _get_url(self):
        return unicode(self.webframe.url().toString())

    # Properties

    url = property(_get_url)
    """Current URL."""

    html = property(_get_html)
    """Rendered HTML in current page."""

    soup = property(_get_soup)
    """HTML soup (see L{set_html_parser})."""

    #{ Basic interaction with browser

    def load(self, url):
        """Load a web page and return status (a boolean)."""
        self.webframe.load(QUrl(url))
        return self._wait_load()

    def click(self, selector, wait_load=False, wait_requests=None,
              timeout=None):
        """
        Click any clickable element in page.

        @param selector: jQuery selector.
        @param wait_load: If True, it will wait until a new page is loaded.
        @param timeout: Seconds to wait for the page to load before
                        raising an exception.
        @param wait_requests: How many requests to wait before returning.
                              Useful for AJAX requests.

        By default this method will not wait for a page to load.
        If you are clicking a link or submit button, you must call this
        method with C{wait_load=True} or, alternatively, call
        L{wait_load} afterwards. However, the recommended way it to use
        L{click_link}.

        When a non-HTML file is clicked this method will download it. The
        file is automatically saved keeping the original structure (as
        wget --recursive does). For example, a file with URL
        I{http://server.org/dir1/dir2/file.ext} will be saved to
        L{download_directory}/I{server.org/dir1/dir2/file.ext}.
        """
        jscode = "%s('%s').simulate('click')" % (self.jslib, selector)
        self._replies = 0
        self._runjs_on_jquery("click", jscode)
        if wait_requests:
            # NOTE: the timeout only applies to the page load below, not
            # to this wait for network replies.
            while self._replies < wait_requests:
                self._events_loop()
            self._events_loop(0.0)
        if wait_load:
            return self._wait_load(timeout)

    def click_link(self, selector, timeout=None):
        """Click a link and wait for the page to load."""
        return self.click(selector, wait_load=True, timeout=timeout)

    def click_ajax(self, selector, wait_requests=1, timeout=None):
        """Click a AJAX link and wait for the request to finish."""
        return self.click(selector, wait_requests=wait_requests,
                          timeout=timeout)

    def wait_load(self, timeout=None):
        """
        Wait until the page is loaded.

        @param timeout: Time to wait (seconds) for the page load to complete.
        @return: Boolean state
        @raise SpynnerTimeout: If timeout is reached.
        """
        return self._wait_load(timeout)

    def wait(self, waittime):
        """
        Wait some time.

        @param waittime: Time to wait (seconds).

        This is an active wait, the events loop will be run, so it
        may be useful to wait for synchronous Javascript events that
        change the DOM.
        """
        itime = time.time()
        while time.time() - itime < waittime:
            self._events_loop()

    def close(self):
        """Close Browser instance and release resources."""
        if self.webview:
            self.destroy_webview()
        if self.webpage:
            # Drop the references but keep the attributes, so a second
            # close() is a no-op instead of an AttributeError.
            self.webframe = None
            self.webpage = None

    @classmethod
    def configure_proxy(cls, hostname, port, user=None, password=None,
                        proxy_type=QNetworkProxy.HttpProxy):
        """
        Configure network proxy layer.

        @param hostname: Proxy hostname.
        @param port: Proxy port.
        @param user: Proxy username (optional).
        @param password: Proxy password (optional). Credentials are only
                         applied when both user and password are given.
        @param proxy_type: see QNetworkProxy.ProxyType. Default: HttpProxy.
        """
        proxy = QNetworkProxy()
        proxy.setType(proxy_type)
        proxy.setHostName(hostname)
        proxy.setPort(port)
        if user and password is not None:
            proxy.setUser(user)
            proxy.setPassword(password)
        QNetworkProxy.setApplicationProxy(proxy)

    #}

    #{ Webview

    def create_webview(self, show=False):
        """Create a QWebView object and insert current QWebPage."""
        if self.webview:
            raise SpynnerError("Cannot create webview (already initialized)")
        self.webview = QWebView()
        self.webview.setPage(self.webpage)
        window = self.webview.window()
        window.setAttribute(Qt.WA_DeleteOnClose)
        window.connect(window, SIGNAL('destroyed(QObject *)'),
                       self._on_webview_destroyed)
        if show:
            self.show()

    def destroy_webview(self):
        """Destroy current QWebView."""
        if not self.webview:
            raise SpynnerError("Cannot destroy webview (not initialized)")
        # Rebind instead of `del`: other methods test `self.webview`, so
        # the attribute has to keep existing after the view is gone.
        self.webview = None

    def show(self):
        """Show webview browser."""
        if not self.webview:
            raise SpynnerError("Webview is not initialized")
        self.webview.show()

    def hide(self):
        """Hide webview browser."""
        if not self.webview:
            raise SpynnerError("Webview is not initialized")
        self.webview.hide()

    def browse(self):
        """Let the user browse the current page (infinite loop)."""
        if not self.webview:
            raise SpynnerError("Webview is not initialized")
        self.show()
        while self.webview:
            self._events_loop()

    #}

    #{ Form manipulation

    def fill(self, selector, value):
        """Fill an input text with a string value using a jQuery selector."""
        # NOTE(review): only single quotes are escaped; backslashes and
        # newlines in value reach the Javascript string literal as-is.
        escaped_value = value.replace("'", "\\'")
        jscode = "%s('%s').val('%s')" % (self.jslib, selector, escaped_value)
        self._runjs_on_jquery("fill", jscode)

    def check(self, selector):
        """Check an input checkbox using a jQuery selector."""
        jscode = "%s('%s').attr('checked', true)" % (self.jslib, selector)
        self._runjs_on_jquery("check", jscode)

    def uncheck(self, selector):
        """Uncheck input checkbox using a jQuery selector"""
        jscode = "%s('%s').attr('checked', false)" % (self.jslib, selector)
        self._runjs_on_jquery("uncheck", jscode)

    def choose(self, selector):
        """Choose a radio input using a jQuery selector."""
        jscode = "%s('%s').simulate('click')" % (self.jslib, selector)
        self._runjs_on_jquery("choose", jscode)

    def select(self, selector):
        """Choose a option in a select using a jQuery selector."""
        jscode = "%s('%s').attr('selected', 'selected')" % (self.jslib,
                                                            selector)
        self._runjs_on_jquery("select", jscode)

    submit = click_link

    #}

    #{ Javascript

    def runjs(self, jscode, debug=True):
        """
        Inject Javascript code into the current context of page.

        @param jscode: Javascript code to injected.
        @param debug: Set to False to disable debug output for this injection.

        You can call Jquery even if the original page does not include it
        as Spynner injects the library for every loaded page. You must
        use C{_jQuery(...)} instead of of C{jQuery} or the common {$(...)}
        shortcut.

        @note: You can change the _jQuery alias (see L{jslib}).
        """
        if debug:
            self._debug(DEBUG, "Run Javascript code: %s" % jscode)
        r = self.webpage.mainFrame().evaluateJavaScript(jscode)
        if not r.isValid():
            # NOTE(review): an invalid result triggers a second evaluation,
            # so code with side effects may run twice.
            r = self.webpage.mainFrame().evaluateJavaScript(jscode)
        return r

    def set_javascript_confirm_callback(self, callback):
        """
        Set function callback for Javascript confirm pop-ups.

        By default Javascript confirmations are not answered. If the webpage
        you are working pops Javascript confirmations, be sure to set a
        callback for them.

        Calback signature: C{javascript_confirm_callback(url, message)}

            - url: Url where the popup was launched.
            - param message: String message.

        The callback should return a boolean (True meaning 'yes', False
        meaning 'no')
        """
        self._javascript_confirm_callback = callback

    def set_javascript_prompt_callback(self, callback):
        """
        Set function callback for Javascript prompt.

        By default Javascript prompts are not answered. If the webpage
        you are working pops Javascript prompts, be sure to set a callback
        for them.

        Callback signature:
        C{javascript_prompt_callback(url, message, defaultvalue)}

            - url: Url where the popup prompt was launched.
            - message: String message.
            - defaultvalue: Default value for prompt answer

        The callback should return a string with the answer or None to
        cancel the prompt.
        """
        self._javascript_prompt_callback = callback

    #}

    #{ Cookies

    def get_cookies(self):
        """Return string containing the current cookies in Mozilla format."""
        return self.cookiesjar.mozillaCookies()

    def set_cookies(self, string_cookies):
        """Set cookies from a string with Mozilla-format cookies."""
        return self.cookiesjar.setMozillaCookies(string_cookies)

    #}

    #{ Download files

    def download(self, url, outfd=None):
        """
        Download a given URL using current cookies.

        @param url: URL or path to download
        @param outfd: Output file-like stream. If None, return data string.
        @return: Bytes downloaded (None if something went wrong) when
                 C{outfd} is given; the downloaded data otherwise.
        @raise SpynnerError: If the request fails immediately.
        @note: If url is a path, the current base URL will be pre-appended.
        """
        def _on_reply(reply):
            self._download_reply_status = not bool(reply.error())

        self._download_reply_status = None
        if not urlparse.urlsplit(url).scheme:
            url = urlparse.urljoin(self.url, url)
        request = QNetworkRequest(QUrl(url))
        # Create a new manager to process this download
        manager = QNetworkAccessManager()
        # Cookies are attached when the request is created, so the shared
        # jar has to be installed before get(). setCookieJar() reparents
        # the jar to the temporary manager; hand it back to the permanent
        # one so it is not destroyed together with `manager`.
        jar = self.manager.cookieJar()
        manager.setCookieJar(jar)
        jar.setParent(self.manager)
        manager.connect(manager, SIGNAL('finished(QNetworkReply *)'),
                        _on_reply)
        reply = manager.get(request)
        if reply.error():
            raise SpynnerError("Download error: %s" % reply.errorString())
        reply.downloaded_nbytes = 0
        # Test identity, not truthiness: a valid stream may be falsy.
        outfd_set = outfd is not None
        if not outfd_set:
            outfd = StringIO()
        self._start_download(reply, outfd)
        while self._download_reply_status is None:
            self._events_loop()
        if outfd_set:
            return (reply.downloaded_nbytes if not reply.error() else None)
        else:
            return outfd.getvalue()

    #}

    #{ HTML and tag soup parsing

    def set_html_parser(self, parser):
        """
        Set HTML parser used to generate the HTML L{soup}.

        @param parser: Callback called to generate the soup.

        When a HTML parser is set for a Browser, the property L{soup}
        returns the parsed HTML.
        """
        self._html_parser = parser

    def html_contains(self, regexp):
        """Return True if current HTML contains a given regular expression."""
        return bool(re.search(regexp, self.html))

    #}

    #{ HTTP Authentication

    def set_http_authentication_callback(self, callback):
        """
        Set HTTP authentication request callback.

        The callback must have this signature:

        C{http_authentication_callback(url, realm)}:
            - C{url}: URL where the requested was made.
            - C{realm}: Realm requiring authentication.

        The callback should return a pair of string containing
        (user, password) or None if you don't want to answer.
        """
        self._http_authentication_callback = callback

    #}

    #{ Miscellaneous

    def snapshot(self, box=None, format=QImage.Format_ARGB32):
        """
        Take an image snapshot of the current frame.

        @param box: 4-element tuple containing box to capture (x1, y1, x2, y2).
                    If None, capture the whole page.
        @param format: QImage format (see QImage::Format_*).
        @return: A QImage image.

        Typical usage:

        >>> browser.load(url)
        >>> browser.snapshot().save("webpage.png")
        """
        if box:
            x1, y1, x2, y2 = box
            w, h = (x2 - x1), (y2 - y1)
            # Render up to the bottom-right corner, then crop the box.
            image0 = QImage(QSize(x2, y2), format)
            painter = QPainter(image0)
            self.webpage.mainFrame().render(painter)
            painter.end()
            image = image0.copy(x1, y1, w, h)
        else:
            image = QImage(self.webpage.viewportSize(), format)
            painter = QPainter(image)
            self.webpage.mainFrame().render(painter)
            painter.end()
        return image

    def get_url_from_path(self, path):
        """Return the URL for a given path using the current URL as base."""
        return urlparse.urljoin(self.url, path)

    def set_url_filter(self, url_filter):
        """
        Set function callback to filter URL.

        By default all requested elements of a page are loaded. That includes
        stylesheets, images and many other elements that you may not need at
        all. Use this method to define the callback that will be called every
        time a new request is made. The callback must have this signature:

        C{my_url_filter(operation, url)}:
            - C{operation}: string with HTTP operation: C{get}, C{head},
                            C{post} or C{put}.
            - C{url}: requested item URL.

        It should return C{True} (proceed) or C{False} (reject).
        """
        self._url_filter = url_filter
def save_webpage_screenshot(url, width, height, file_name=None):
    """Capture a PNG screenshot of a web page rendered off-screen.

    The page is loaded in a QWebPage and the capture is taken when the
    page issues a print request (the printRequested signal); this
    function runs the Qt event loop until then.

    url       -- address of the page to load.
    width     -- viewport width in pixels (converted with int()).
    height    -- viewport height in pixels (converted with int()).
    file_name -- if given, the image is saved to file_name + ".png";
                 if None, the PNG data is returned as a string.

    Returns the PNG data when file_name is None (None if the event loop
    ended without a capture); returns None otherwise.

    When the defining module is run as a script (__name__ == "__main__"),
    the print-request handler writes the PNG data to stdout if no
    file_name was given and then exits the process.

    Example:

    save_webpage_screenshot(
        "http://www.example.com",
        width=1024,
        height=768,
        file_name="example"
    )
    """
    app = QApplication(sys.argv)
    # Restore default SIGINT handling so Ctrl-C ends the Qt event loop.
    signal.signal(signal.SIGINT, signal.SIG_DFL)

    webpage = QWebPage()

    # set page dimensions
    webpage.setViewportSize(QSize(int(width), int(height)))

    # display errors otherwise debugging is very difficult
    def print_error(message, lineNumber, sourceID):
        print("\n%(sourceID)s line %(lineNumber)i: \n %(message)s" % {
            'sourceID': sourceID,
            'lineNumber': lineNumber,
            'message': message,
        })
    webpage.javaScriptConsoleMessage = print_error

    # Always defined, so the handler and the return below can rely on it
    # regardless of the output mode.
    result = []

    # register print request handler
    def onPrintRequested(virtual_browser_window):
        # Paint this frame into an image
        image = QImage(webpage.viewportSize(), QImage.Format_ARGB32)
        painter = QPainter(image)
        virtual_browser_window.render(painter)
        painter.end()

        if file_name is not None:
            image.save(file_name + ".png")
        else:
            byte_array = QByteArray()
            png_buffer = QBuffer(byte_array)
            png_buffer.open(QIODevice.WriteOnly)
            image.save(png_buffer, format="PNG")
            png_buffer.close()
            result.append(str(byte_array))

        if __name__ == "__main__":
            if file_name is None:
                sys.stdout.write(result[0])
            sys.exit(0)
        else:
            # Library use: leave the event loop so the caller gets control.
            app.quit()

    webpage.printRequested.connect(onPrintRequested)

    # load the page and wait for a print request
    webpage.mainFrame().load(QUrl(url))
    app.exec_()

    if file_name is None:
        return result[0] if result else None
class Evaluator(object):
    """Evaluate page Javascript in an off-screen QWebPage to recover an
    obfuscated captcha form and extract its hash and image URL."""

    # Characters stripped from the deobfuscated form by _filter_markup.
    _replacechars = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЪЭЮЯабвгдеёжзийклмнопрстуфхцчшщьъэюя'

    def __init__(self):
        self.app = QApplication([])
        self.page = QWebPage()
        self.frame = self.page.mainFrame()

    def __del__(self):
        # Release frame, page and application in that order; tolerate an
        # instance whose __init__ did not run to completion.
        for name in ('frame', 'page', 'app'):
            self.__dict__.pop(name, None)

    def set_html(self, html):
        """Replace the frame's document with the given HTML."""
        self.frame.setHtml(html)

    def reset_html(self):
        """Load an empty HTML document into the frame."""
        self.set_html('<html><head></head><body></body></html>')

    def eval_js(self, script):
        """Evaluate a script in the frame.

        Returns the result if it is a str, otherwise None.
        """
        res = self.frame.evaluateJavaScript(str(script))
        return res if isinstance(res, str) else None

    def get_html(self):
        """Return the frame's current HTML as a str."""
        return str(self.frame.toHtml())

    def find_script(self, page):
        """Return the text node of page that contains the deobfuscation
        call, or '' if there is none.

        An empty string (rather than str(None) == 'None') is returned for
        a miss so that callers can test the result for truth.
        """
        soup = BeautifulSoup(page)
        body = soup.body
        if body is None:
            return ''
        script = body.find(text=re.compile(r'dеobfuscate_html\(\)'))
        if script is None:
            return ''
        return str(script)

    def deobfuscate_form(self, script):
        """Run script in a blank document and return the resulting HTML."""
        self.reset_html()
        self.eval_js(script)
        self.eval_js('captcha_div2.innerHTML=dеobfuscate_html();')
        return self.get_html()

    def _filter_markup(self, form):
        """Return form with every character of _replacechars removed."""
        unwanted = frozenset(self._replacechars)
        return ''.join(ch for ch in form if ch not in unwanted)

    def find_capair(self, domain, form):
        '''Finds cahash on deobfuscated page.

        Returns [cahash value, captcha image src]; raises
        exc.PermanentError if either element is missing.
        '''
        soup = BeautifulSoup(form)
        body = soup.body
        cahash = None
        if body is not None:
            cahash = body.find(attrs={'name': re.compile('cahash')})
        if not cahash:
            raise exc.PermanentError('cahash not found in form')
        # The domain is matched literally: its dots must not act as regex
        # wildcards.
        src_pattern = re.compile(
            ''.join((r'\/a\d\.', re.escape(domain), r'\/i\/captcha\/')))
        caimg = body.find(attrs={'src': src_pattern})
        if not caimg:
            raise exc.PermanentError('caimg not found in form')
        return [str(cahash.get('value')), str(caimg.get('src'))]

    def solve_capage(self, domain, page):
        """Extract the [cahash, image src] pair from a captcha page.

        domain may be bytes (decoded as UTF-8) or str. Raises
        exc.PermanentError if the page holds no obfuscated form or the
        form lacks the expected elements.
        """
        s = self.find_script(page)
        if not s:
            raise exc.PermanentError('Obfuscated html not found in page')
        if isinstance(domain, bytes):
            domain = domain.decode('utf-8')
        form = self._filter_markup(self.deobfuscate_form(s))
        return self.find_capair(domain, form)
class Browser:
    """
    Stateful programmatic web browser class based upon QtWebKit.

    >>> browser = Browser()
    >>> browser.load("http://www.wordreference.com")
    >>> browser.runjs("console.log('I can run Javascript!')")
    >>> browser.runjs("_jQuery('div').css('border', 'solid red')") # and jQuery!
    >>> browser.select("#esen")
    >>> browser.fill("input[name=enit]", "hola")
    >>> browser.click("input[name=b]", wait_load=True)
    >>> print browser.url, len(browser.html)
    >>> browser.close()
    """
    ignore_ssl_errors = True
    """@ivar: If True, ignore SSL certificate errors."""
    user_agent = None
    """@ivar: User agent for requests (see QWebPage::userAgentForUrl for details)"""
    jslib = "jq"
    """@ivar: Library name for jQuery library injected by default to pages."""
    download_directory = "."
    """@ivar: Directory where downloaded files will be stored."""
    debug_stream = sys.stderr
    """@ivar: File-like stream where debug output will be written."""
    debug_level = ERROR
    """@ivar: Debug verbose level (L{ERROR}, L{WARNING}, L{INFO} or L{DEBUG})."""
    event_looptime = 0.01
    """@ivar: Event loop dispatcher loop delay (seconds)."""

    errorCode = None
    errorMessage = None

    _javascript_files = ["jquery.min.js", "jquery.simulate.js"]

    _javascript_directories = [
        os.path.join(os.path.dirname(__file__), "../javascript"),
        os.path.join(sys.prefix, "share/spynner/javascript"),
    ]

    def __init__(self, qappargs=None, debug_level=None):
        """
        Init a Browser instance.

        @param qappargs: Arguments for QApplication constructor.
        @param debug_level: Debug level logging (L{ERROR} by default)
        @raise SpynnerError: If no javascript directory can be found.
        """
        self.application = QApplication(qappargs or [])
        """PyQt4.QtGui.Qapplication object."""
        if debug_level is not None:
            self.debug_level = debug_level
        self.webpage = QWebPage()
        """PyQt4.QtWebKit.QWebPage object."""
        self.webpage.userAgentForUrl = self._user_agent_for_url
        self.webframe = self.webpage.mainFrame()
        """PyQt4.QtWebKit.QWebFrame main webframe object."""
        self.webview = None
        """PyQt4.QtWebKit.QWebView object."""
        self._url_filter = None
        self._html_parser = None

        # Javascript
        directory = _first(self._javascript_directories, os.path.isdir)
        if not directory:
            raise SpynnerError("Cannot find javascript directory: %s" %
                               self._javascript_directories)
        # Read every library file and close it again right away.
        sources = []
        for fn in self._javascript_files:
            with open(os.path.join(directory, fn)) as jsfile:
                sources.append(jsfile.read())
        self.javascript = "".join(sources)

        self.webpage.javaScriptAlert = self._javascript_alert
        self.webpage.javaScriptConsoleMessage = self._javascript_console_message
        self.webpage.javaScriptConfirm = self._javascript_confirm
        self.webpage.javaScriptPrompt = self._javascript_prompt
        self._javascript_confirm_callback = None
        # Misnamed attribute from the original code; kept so that nothing
        # reading it breaks.
        self._javascript_confirm_prompt = None
        # These two callbacks are read by the slots below, so they must
        # exist even when the matching set_*_callback() was never called.
        self._javascript_prompt_callback = None
        self._http_authentication_callback = None

        # Network Access Manager and cookies
        self.manager = QNetworkAccessManager()
        """PyQt4.QtNetwork.QTNetworkAccessManager object."""
        self.manager.createRequest = self._manager_create_request
        self.webpage.setNetworkAccessManager(self.manager)
        self.cookiesjar = _ExtendedNetworkCookieJar()
        """PyQt4.QtNetwork.QNetworkCookieJar object."""
        self.manager.setCookieJar(self.cookiesjar)
        self.manager.connect(self.manager,
            SIGNAL("sslErrors(QNetworkReply *, const QList<QSslError> &)"),
            self._on_manager_ssl_errors)
        self.manager.connect(self.manager,
            SIGNAL('finished(QNetworkReply *)'),
            self._on_reply)
        self.manager.connect(self.manager,
            SIGNAL('authenticationRequired(QNetworkReply *, QAuthenticator *)'),
            self._on_authentication_required)
        self._operation_names = dict(
            (getattr(QNetworkAccessManager, s + "Operation"), s.lower())
            for s in ("Get", "Head", "Post", "Put"))

        # Webpage slots
        self._load_status = None
        self._replies = 0
        self.webpage.setForwardUnsupportedContent(True)
        self.webpage.connect(self.webpage,
            SIGNAL('unsupportedContent(QNetworkReply *)'),
            self._on_unsupported_content)
        self.webpage.connect(self.webpage,
            SIGNAL('loadFinished(bool)'),
            self._on_load_finished)
        self.webpage.connect(self.webpage,
            SIGNAL("loadStarted()"),
            self._on_load_started)

    def _events_loop(self, wait=None):
        """Process pending Qt events, then sleep C{wait} seconds."""
        if wait is None:
            wait = self.event_looptime
        self.application.processEvents()
        time.sleep(wait)

    def _on_load_started(self):
        """Slot for loadStarted(): reset the load status."""
        self._load_status = None
        self._debug(INFO, "Page load started")

    def _on_manager_ssl_errors(self, reply, errors):
        """Slot for sslErrors(): ignore them if L{ignore_ssl_errors} is set."""
        url = unicode(reply.url().toString())
        if self.ignore_ssl_errors:
            self._debug(WARNING, "SSL certificate error ignored: %s" % url)
            reply.ignoreSslErrors()
        else:
            self._debug(WARNING, "SSL certificate error: %s" % url)

    def _on_authentication_required(self, reply, authenticator):
        """Slot for authenticationRequired(): ask the user callback."""
        url = unicode(reply.url().toString())
        realm = unicode(authenticator.realm())
        # The original call omitted the level argument, so the message was
        # passed as the level and nothing was logged.
        self._debug(INFO, "HTTP auth required: %s (realm: %s)" % (url, realm))
        if not self._http_authentication_callback:
            self._debug(WARNING, "HTTP auth required, but no callback defined")
            return
        credentials = self._http_authentication_callback(url, realm)
        if credentials:
            user, password = credentials
            self._debug(INFO, "callback returned HTTP credentials: %s/%s" %
                        (user, "*"*len(password)))
            authenticator.setUser(user)
            authenticator.setPassword(password)
        else:
            self._debug(WARNING, "HTTP auth callback returned no credentials")

    def _manager_create_request(self, operation, request, data):
        """Replacement for createRequest(): log and optionally filter URLs."""
        url = unicode(request.url().toString())
        operation_name = self._operation_names[operation].upper()
        self._debug(INFO, "Request: %s %s" % (operation_name, url))
        for h in request.rawHeaderList():
            self._debug(DEBUG, "  %s: %s" % (h, request.rawHeader(h)))
        if self._url_filter:
            if self._url_filter(self._operation_names[operation], url) is False:
                self._debug(INFO, "URL filtered: %s" % url)
                # Rejected requests are redirected to a blank page.
                request.setUrl(QUrl("about:blank"))
            else:
                self._debug(DEBUG, "URL not filtered: %s" % url)
        reply = QNetworkAccessManager.createRequest(self.manager,
            operation, request, data)
        return reply

    def _on_reply(self, reply):
        """Slot for finished(): count the reply and record its status."""
        self._replies += 1
        self._reply_url = unicode(reply.url().toString())
        self._reply_status = not bool(reply.error())
        if reply.error():
            self._debug(WARNING, "Reply error: %s - %d (%s)" %
                        (self._reply_url, reply.error(), reply.errorString()))
            self.errorCode = reply.error()
            self.errorMessage = reply.errorString()
        else:
            self._debug(INFO, "Reply successful: %s" % self._reply_url)
        for header in reply.rawHeaderList():
            self._debug(DEBUG, "  %s: %s" % (header, reply.rawHeader(header)))

    def _on_unsupported_content(self, reply, outfd=None):
        """Slot for unsupportedContent(): download the reply to disk."""
        if not reply.error():
            self._start_download(reply, outfd)
        else:
            self._debug(ERROR, "Error on unsupported content: %s" %
                        reply.errorString())

    def _javascript_alert(self, webframe, message):
        """Log a Javascript alert; show it only when a webview exists."""
        self._debug(INFO, "Javascript alert: %s" % message)
        if self.webview:
            QWebPage.javaScriptAlert(self.webpage, webframe, message)

    def _javascript_console_message(self, message, line, sourceid):
        """Log a Javascript console message."""
        if line:
            self._debug(INFO, "Javascript console (%s:%d): %s" %
                        (sourceid, line, message))
        else:
            self._debug(INFO, "Javascript console: %s" % message)

    def _javascript_confirm(self, webframe, message):
        """Answer a Javascript confirm through the callback, if any."""
        smessage = unicode(message)
        url = webframe.url()
        self._debug(INFO, "Javascript confirm (webframe url = %s): %s" %
                    (url, smessage))
        if self._javascript_confirm_callback:
            value = self._javascript_confirm_callback(url, smessage)
            self._debug(INFO, "Javascript confirm callback returned %s" % value)
            return value
        return QWebPage.javaScriptConfirm(self.webpage, webframe, message)

    def _javascript_prompt(self, webframe, message, defaultvalue, result):
        """Answer a Javascript prompt through the callback, if any."""
        url = webframe.url()
        smessage = unicode(message)
        self._debug(INFO, "Javascript prompt (webframe url = %s): %s" %
                    (url, smessage))
        if self._javascript_prompt_callback:
            value = self._javascript_prompt_callback(url, smessage, defaultvalue)
            self._debug(INFO, "Javascript prompt callback returned: %s" % value)
            if value in (False, None):
                return False
            result.clear()
            result.append(value)
            return True
        return QWebPage.javaScriptPrompt(self.webpage, webframe, message,
                                         defaultvalue, result)

    def _on_webview_destroyed(self, window):
        """Slot for destroyed(): forget the webview."""
        self.webview = None

    def _on_load_finished(self, successful):
        """Slot for loadFinished(): store the load status."""
        self._load_status = successful
        status = {True: "successful", False: "error"}[successful]
        self._debug(INFO, "Page load finished (%d bytes): %s (%s)" %
                    (len(self.html), self.url, status))

    def _get_filepath_for_url(self, url):
        """Map a URL to a path under L{download_directory}, creating dirs."""
        urlinfo = urlparse.urlsplit(url)
        path = os.path.join(self.download_directory,
                            urlinfo.netloc + urlinfo.path)
        if not os.path.isdir(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        return path

    def _start_download(self, reply, outfd):
        """
        Stream the data of C{reply} into C{outfd}.

        When C{outfd} is None a file is opened under L{download_directory};
        that file is closed again once the reply has finished. A stream
        passed in by the caller is left open.
        """
        def _on_ready_read():
            data = reply.readAll()
            reply.downloaded_nbytes += len(data)
            outfd.write(data)
            self._debug(DEBUG, "Read from download stream (%d bytes): %s"
                        % (len(data), url))

        def _on_network_error(*args):
            # Accepts (and ignores) the error code sent with the signal.
            self._debug(ERROR, "Network error on download: %s" % url)

        def _on_finished():
            self._debug(INFO, "Download finished: %s" % url)
            if owns_outfd:
                outfd.close()

        url = unicode(reply.url().toString())
        owns_outfd = outfd is None
        if owns_outfd:
            path = self._get_filepath_for_url(url)
            outfd = open(path, "wb")
        # Only download() sets the counter beforehand; the unsupported
        # content path reaches this point without it.
        if not hasattr(reply, "downloaded_nbytes"):
            reply.downloaded_nbytes = 0
        reply.connect(reply, SIGNAL("readyRead()"), _on_ready_read)
        reply.connect(reply, SIGNAL("error(QNetworkReply::NetworkError)"),
                      _on_network_error)
        reply.connect(reply, SIGNAL("finished()"), _on_finished)
        self._debug(INFO, "Start download: %s" % url)

    def _wait_load(self, timeout=None):
        """
        Run the event loop until the current page load has finished.

        On success the Javascript libraries are injected and the viewport
        is resized to the page contents.

        @return: Load status (boolean).
        @raise SpynnerTimeout: If C{timeout} seconds elapse first.
        """
        self._events_loop(0.0)
        if self._load_status is not None:
            load_status = self._load_status
            self._load_status = None
            return load_status
        itime = time.time()
        while self._load_status is None:
            if timeout and time.time() - itime > timeout:
                raise SpynnerTimeout("Timeout reached: %d seconds" % timeout)
            self._events_loop()
        self._events_loop(0.0)
        if self._load_status:
            jscode = "var %s = jQuery.noConflict();" % self.jslib
            self.runjs(self.javascript + jscode, debug=False)
            self.webpage.setViewportSize(self.webpage.mainFrame().contentsSize())
        load_status = self._load_status
        self._load_status = None
        return load_status

    def _debug(self, level, *args):
        """Forward C{args} to the module logger if C{level} is enabled."""
        if level <= self.debug_level:
            kwargs = dict(outfd=self.debug_stream)
            _debug(*args, **kwargs)

    def _user_agent_for_url(self, url):
        """Return L{user_agent} when set, else the QWebPage default."""
        if self.user_agent:
            return self.user_agent
        return QWebPage.userAgentForUrl(self.webpage, url)

    def get_js_obj_length(self, res):
        """
        Return the C{length} field of a Javascript result.

        @return: The length as an integer, or False when C{res} is not a
                 map or has no C{length} field.
        """
        if res.type() != res.Map:
            return False
        resmap = res.toMap()
        lenfield = QString(u'length')
        if lenfield not in resmap:
            return False
        return resmap[lenfield].toInt()[0]

    def jslen(self, selector):
        """Return the number of elements matched by a jQuery selector."""
        res = self.runjs("%s('%s')" % (self.jslib, selector))
        return self.get_js_obj_length(res)

    def _runjs_on_jquery(self, name, code):
        """Run jQuery code; raise if the result matched no element."""
        res = self.runjs(code)
        if self.get_js_obj_length(res) < 1:
            raise SpynnerJavascriptError("error on %s: %s" % (name, code))

    def _get_html(self):
        return unicode(self.webframe.toHtml())

    def _get_soup(self):
        if not self._html_parser:
            raise SpynnerError("Cannot get soup with no HTML parser defined")
        return self._html_parser(self.html)

    def _get_url(self):
        return unicode(self.webframe.url().toString())

    # Properties

    url = property(_get_url)
    """Current URL."""

    html = property(_get_html)
    """Rendered HTML in current page."""

    #soup = property(_get_soup)
    soup = None #change to none so that changes are retained through multiple calls
    """HTML soup (see L{set_html_parser})."""

    #{ Basic interaction with browser

    def load(self, url):
        """Load a web page and return status (a boolean)."""
        self.webframe.load(QUrl(url))
        return self._wait_load()

    def load_request(self, req):
        """Load a network request and return status (a boolean)."""
        self.webframe.load(req)
        return self._wait_load()

    def wait_requests(self, wait_requests = None, url = None, url_regex = None):
        """
        Wait for network replies.

        @param wait_requests: Number of replies to wait for.
        @param url: Wait until a new reply arrives from exactly this URL.
        @param url_regex: Wait until a new reply URL matches this regexp.

        @note: There is no timeout; the loops run until satisfied.
        """
        if wait_requests:
            while self._replies < wait_requests:
                self._events_loop()
            self._events_loop(0.0)
        if url_regex or url:
            last_replies = self._replies
            while True:
                if last_replies != self._replies:
                    if url_regex:
                        if re.search(url_regex, self._reply_url):
                            break
                    elif url:
                        if url == self._reply_url:
                            break
                self._events_loop()
            self._events_loop(0.0)

    def click(self, selector, wait_load=False, wait_requests=None, timeout=None):
        """
        Click any clickable element in page.

        @param selector: jQuery selector.
        @param wait_load: If True, it will wait until a new page is loaded.
        @param timeout: Seconds to wait for the page to load before
                        raising an exception.
        @param wait_requests: How many requests to wait before returning.
                              Useful for AJAX requests.

        By default this method will not wait for a page to load.
        If you are clicking a link or submit button, you must call this
        method with C{wait_load=True} or, alternatively, call
        L{wait_load} afterwards. However, the recommended way it to use
        L{click_link}.

        When a non-HTML file is clicked this method will download it. The
        file is automatically saved keeping the original structure (as
        wget --recursive does). For example, a file with URL
        I{http://server.org/dir1/dir2/file.ext} will be saved to
        L{download_directory}/I{server.org/dir1/dir2/file.ext}.
        """
        jscode = "%s('%s').simulate('click')" % (self.jslib, selector)
        self._replies = 0
        self._runjs_on_jquery("click", jscode)
        self.wait_requests(wait_requests)
        if wait_load:
            return self._wait_load(timeout)

    def click_link(self, selector, timeout=None):
        """Click a link and wait for the page to load."""
        return self.click(selector, wait_load=True, timeout=timeout)

    def click_ajax(self, selector, wait_requests=1, timeout=None):
        """Click a AJAX link and wait for the request to finish."""
        return self.click(selector, wait_requests=wait_requests, timeout=timeout)

    def wait_load(self, timeout=None):
        """
        Wait until the page is loaded.

        @param timeout: Time to wait (seconds) for the page load to complete.
        @return: Boolean state
        @raise SpynnerTimeout: If timeout is reached.
        """
        return self._wait_load(timeout)

    def wait(self, waittime):
        """
        Wait some time.

        @param waittime: Time to wait (seconds).

        This is an active wait, the events loop will be run, so it
        may be useful to wait for synchronous Javascript events that
        change the DOM.
        """
        itime = time.time()
        while time.time() - itime < waittime:
            self._events_loop()

    def close(self):
        """Close Browser instance and release resources."""
        if self.webview:
            self.destroy_webview()
        if self.webpage:
            # Drop the reference but keep the attribute, so that calling
            # close() a second time does not raise AttributeError.
            self.webpage = None

    #}

    #{ Webview

    def create_webview(self, show=False):
        """Create a QWebView object and insert current QWebPage."""
        if self.webview:
            raise SpynnerError("Cannot create webview (already initialized)")
        self.webview = QWebView()
        self.webview.setPage(self.webpage)
        window = self.webview.window()
        window.setAttribute(Qt.WA_DeleteOnClose)
        window.connect(window, SIGNAL('destroyed(QObject *)'),
                       self._on_webview_destroyed)
        if show:
            self.show()

    def destroy_webview(self):
        """Destroy current QWebView."""
        if not self.webview:
            raise SpynnerError("Cannot destroy webview (not initialized)")
        # Release the reference but keep the attribute: the other methods
        # test `self.webview` and would raise after a plain `del`.
        self.webview = None

    def show(self):
        """Show webview browser."""
        if not self.webview:
            raise SpynnerError("Webview is not initialized")
        self.webview.show()

    def hide(self):
        """Hide webview browser."""
        if not self.webview:
            raise SpynnerError("Webview is not initialized")
        self.webview.hide()

    def browse(self):
        """Let the user browse the current page (infinite loop)."""
        if not self.webview:
            raise SpynnerError("Webview is not initialized")
        self.show()
        while self.webview:
            self._events_loop()

    #}

    #{ Webframe

    def set_webframe_to_default(self):
        """Make the page main frame the current frame again."""
        self.webframe = self.webpage.mainFrame()

    def set_webframe(self, framenumber):
        """
        Make a child frame of the current frame the current frame.

        @param framenumber: Index into the child frames list.
        @raise SpynnerError: If the index is invalid.
        """
        cf = self.webframe.childFrames()
        try:
            self.webframe = cf[int(framenumber)]
        except (IndexError, ValueError, TypeError):
            raise SpynnerError("childframe does not exist")
        # Inject jquery into frame
        jscode = "var %s = jQuery.noConflict();" % self.jslib
        self.runjs(self.javascript + jscode, debug=False)

    #}

    #{ Form manipulation

    def fill(self, selector, value):
        """Fill an input text with a string value using a jQuery selector."""
        escaped_value = value.replace("'", "\\'")
        jscode = "%s('%s').val('%s')" % (self.jslib, selector, escaped_value)
        self._runjs_on_jquery("fill", jscode)

    def check(self, selector):
        """Check an input checkbox using a jQuery selector."""
        jscode = "%s('%s').attr('checked', true)" % (self.jslib, selector)
        self._runjs_on_jquery("check", jscode)

    def uncheck(self, selector):
        """Uncheck input checkbox using a jQuery selector"""
        jscode = "%s('%s').attr('checked', false)" % (self.jslib, selector)
        self._runjs_on_jquery("uncheck", jscode)

    def choose(self, selector, value):
        """Choose a radio input using a jQuery selector."""
        escaped_value = value.replace("'", "\\'")
        jscode = "%s('%s').filter('[value=%s]').simulate('click')" % (
            self.jslib, selector, escaped_value)
        self._runjs_on_jquery("choose", jscode)

    def select(self, selector):
        """Choose a option in a select using a jQuery selector."""
        jscode = "%s('%s').attr('selected', 'selected')" % (self.jslib, selector)
        self._runjs_on_jquery("select", jscode)

    submit = click_link

    #}

    #{ Javascript

    def runjs(self, jscode, debug=True):
        """
        Inject Javascript code into the current context of page.

        @param jscode: Javascript code to injected.
        @param debug: Set to False to disable debug output for this injection.

        You can call Jquery even if the original page does not include it
        as Spynner injects the library for every loaded page. You must
        use C{jq(...)} instead of of C{jQuery} or the common {$(...)}
        shortcut.

        @note: You can change the jq alias (see L{jslib}).
        """
        if debug:
            self._debug(DEBUG, "Run Javascript code: %s" % jscode)

        #XXX evaluating JS twice must be wrong but finding the bug is proving tricky...
        #JavaScriptCore/interpreter/Interpreter.cpp and JavaScriptCore/runtime/Completion.cpp
        #JavaScriptCore/runtime/Completion.cpp is catching an exception (sometimes) and
        #returning "TypeError: Type error" - BUT it looks like the JS does complete after
        #the function has already returned
        r = self.webframe.evaluateJavaScript(jscode)
        if r.isValid() == False:
            r = self.webframe.evaluateJavaScript(jscode)
        return r

    def set_javascript_confirm_callback(self, callback):
        """
        Set function callback for Javascript confirm pop-ups.

        By default Javascript confirmations are not answered. If the webpage
        you are working pops Javascript confirmations, be sure to set a
        callback for them.

        Calback signature: C{javascript_confirm_callback(url, message)}

            - url: Url where the popup was launched.
            - param message: String message.

        The callback should return a boolean (True meaning 'yes', False
        meaning 'no')
        """
        self._javascript_confirm_callback = callback

    def set_javascript_prompt_callback(self, callback):
        """
        Set function callback for Javascript prompt.

        By default Javascript prompts are not answered. If the webpage
        you are working pops Javascript prompts, be sure to set a callback
        for them.

        Callback signature: C{javascript_prompt_callback(url, message, defaultvalue)}

            - url: Url where the popup prompt was launched.
            - message: String message.
            - defaultvalue: Default value for prompt answer

        The callback should return a string with the answer or None
        to cancel the prompt.
        """
        self._javascript_prompt_callback = callback

    #}

    #{ Cookies

    def get_cookies(self):
        """Return string containing the current cookies in Mozilla format."""
        return self.cookiesjar.mozillaCookies()

    def set_cookies(self, string_cookies):
        """Set cookies from a string with Mozilla-format cookies."""
        return self.cookiesjar.setMozillaCookies(string_cookies)

    #}

    #{ Proxies

    def get_proxy(self):
        """Return string containing the current proxy."""
        return self.manager.proxy()

    def set_proxy(self, string_proxy):
        """
        Set proxy [http|socks5]://username:password@hostname:port

        Any other scheme disables the proxy.

        @return: The proxy now installed on the network manager.
        """
        urlinfo = urlparse.urlparse(string_proxy)
        proxy = QNetworkProxy()
        if urlinfo.scheme == 'socks5':
            proxy.setType(QNetworkProxy.Socks5Proxy)
        elif urlinfo.scheme == 'http':
            proxy.setType(QNetworkProxy.HttpProxy)
        else:
            proxy.setType(QNetworkProxy.NoProxy)
            self.manager.setProxy(proxy)
            return self.manager.proxy()
        proxy.setHostName(urlinfo.hostname)
        # urlparse reports a missing port as None, which setPort() rejects.
        if urlinfo.port is not None:
            proxy.setPort(urlinfo.port)
        if urlinfo.username is not None:
            proxy.setUser(urlinfo.username)
        else:
            proxy.setUser('')
        if urlinfo.password is not None:
            proxy.setPassword(urlinfo.password)
        else:
            proxy.setPassword('')
        self.manager.setProxy(proxy)
        return self.manager.proxy()

    #}

    #{ Download files

    def download(self, url, outfd=None):
        """
        Download a given URL using current cookies.

        @param url: URL or path to download
        @param outfd: Output file-like stream. If None, return data string.
        @return: Bytes downloaded (None if something went wrong)
        @raise SpynnerError: If the request fails immediately.
        @note: If url is a path, the current base URL will be pre-appended.
        """
        def _on_reply(reply):
            self._download_reply_status = not bool(reply.error())

        self._download_reply_status = None
        if not urlparse.urlsplit(url).scheme:
            url = urlparse.urljoin(self.url, url)
        request = QNetworkRequest(QUrl(url))
        # Create a new manager to process this download
        manager = QNetworkAccessManager()
        reply = manager.get(request)
        if reply.error():
            raise SpynnerError("Download error: %s" % reply.errorString())
        reply.downloaded_nbytes = 0
        manager.setCookieJar(self.manager.cookieJar())
        manager.connect(manager, SIGNAL('finished(QNetworkReply *)'), _on_reply)
        # Test for None explicitly: a stream object may evaluate as false.
        outfd_set = outfd is not None
        if not outfd_set:
            outfd = StringIO()
        self._start_download(reply, outfd)
        while self._download_reply_status is None:
            self._events_loop()
        if outfd_set:
            return (reply.downloaded_nbytes if not reply.error() else None)
        else:
            return outfd.getvalue()

    #}

    #{ HTML and tag soup parsing

    def set_html_parser(self, parser):
        """
        Set HTML parser used to generate the HTML L{soup}.

        @param parser: Callback called to generate the soup.

        When a HTML parser is set for a Browser, the property L{soup}
        returns the parsed HTML.
        """
        self._html_parser = parser

    def html_contains(self, regexp):
        """Return True if current HTML contains a given regular expression."""
        return bool(re.search(regexp, self.html))

    #}

    #{ HTTP Authentication

    def set_http_authentication_callback(self, callback):
        """
        Set HTTP authentication request callback.

        The callback must have this signature:

        C{http_authentication_callback(url, realm)}:
            - C{url}: URL where the requested was made.
            - C{realm}: Realm requiring authentication.

        The callback should return a pair of string containing (user, password)
        or None if you don't want to answer.
        """
        self._http_authentication_callback = callback

    #}

    #{ Miscellaneous

    def snapshot(self, box=None, format=QImage.Format_ARGB32):
        """
        Take an image snapshot of the current frame.

        @param box: 4-element tuple containing box to capture (x1, y1, x2, y2).
                    If None, capture the whole page.
        @param format: QImage format (see QImage::Format_*).
        @return: A QImage image.

        Typical usage:

        >>> browser.load(url)
        >>> browser.snapshot().save("webpage.png")
        """
        if box:
            x1, y1, x2, y2 = box
            w, h = (x2 - x1), (y2 - y1)
            image0 = QImage(QSize(x2, y2), format)
            painter = QPainter(image0)
            self.webpage.mainFrame().render(painter)
            painter.end()
            image = image0.copy(x1, y1, w, h)
        else:
            image = QImage(self.webpage.viewportSize(), format)
            painter = QPainter(image)
            self.webpage.mainFrame().render(painter)
            painter.end()
        return image

    def get_url_from_path(self, path):
        """Return the URL for a given path using the current URL as base."""
        return urlparse.urljoin(self.url, path)

    def set_url_filter(self, url_filter):
        """
        Set function callback to filter URL.

        By default all requested elements of a page are loaded. That includes
        stylesheets, images and many other elements that you may not need at
        all. Use this method to define the callback that will be called every
        time a new request is made. The callback must have this signature:

        C{my_url_filter(operation, url)}:
            - C{operation}: string with HTTP operation: C{get}, C{head},
                            C{post} or C{put}.
            - C{url}: requested item URL.

        It should return C{True} (proceed) or C{False} (reject).
        """
        self._url_filter = url_filter
class _WebkitRendererHelper(QObject):
    """This helper class is doing the real work. It is required to
    allow WebkitRenderer.render() to be called "asynchronously"
    (but always from Qt's GUI thread).
    """

    def __init__(self, parent):
        """Copies the properties from the parent (WebkitRenderer) object,
        creates the required instances of QWebPage, QWebView and QMainWindow
        and registers some Slots.
        """
        QObject.__init__(self)

        # Copy properties from parent
        for key, value in parent.__dict__.items():
            setattr(self, key, value)

        # Create and connect required PyQt4 objects
        self._page = QWebPage()
        self._view = QWebView()
        self._view.setPage(self._page)
        self._window = QMainWindow()
        self._window.setCentralWidget(self._view)

        # Import QWebSettings
        for key, value in self.qWebSettings.items():
            self._page.settings().setAttribute(key, value)

        # Connect required event listeners
        self.connect(
            self._page,
            SIGNAL("loadFinished(bool)"),
            self._on_load_finished
        )
        self.connect(
            self._page,
            SIGNAL("loadStarted()"),
            self._on_load_started
        )
        self.connect(
            self._page.networkAccessManager(),
            SIGNAL("sslErrors(QNetworkReply *,const QList<QSslError>&)"),
            self._on_ssl_errors
        )
        self.connect(
            self._page.networkAccessManager(),
            SIGNAL("finished(QNetworkReply *)"),
            self._on_each_reply
        )

        # The way we will use this, it seems to be unnecessary to have
        # Scrollbars enabled.
        self._page.mainFrame().setScrollBarPolicy(
            Qt.Horizontal, Qt.ScrollBarAlwaysOff
        )
        self._page.mainFrame().setScrollBarPolicy(
            Qt.Vertical, Qt.ScrollBarAlwaysOff
        )
        self._page.settings().setUserStyleSheetUrl(
            QUrl("data:text/css,html,body{overflow-y:hidden !important;}")
        )

        # Show this widget
        self._window.show()

    def __del__(self):
        """Clean up Qt4 objects. """
        self._window.close()
        del self._window
        del self._view
        del self._page

    def render(self, url):
        """The real worker. Loads the page (_load_page) and awaits
        the end of the given 'delay'. While it is waiting outstanding
        QApplication events are processed.
        After the given delay, the Window or Widget (depends
        on the value of 'grabWholeWindow' is drawn into a QPixmap
        and postprocessed (_post_process_image).
        """
        self._load_page(url, self.width, self.height, self.timeout)

        # Wait for end of timer. In this time, process
        # other outstanding Qt events.
        if self.wait > 0:
            if self.logger:
                self.logger.debug("Waiting %d seconds " % self.wait)
            waitToTime = time.time() + self.wait
            # Keep looping until the deadline. The original also stopped
            # as soon as the event queue was empty, which cut the wait short.
            while time.time() < waitToTime:
                if QApplication.hasPendingEvents():
                    QApplication.processEvents()

        if self.renderTransparentBackground:
            # Another possible drawing solution
            image = QImage(self._page.viewportSize(), QImage.Format_ARGB32)
            image.fill(QColor(255, 0, 0, 0).rgba())

            # http://ariya.blogspot.com/2009/04/transparent-qwebview-and-qwebpage.html
            palette = self._view.palette()
            palette.setBrush(QPalette.Base, Qt.transparent)
            self._page.setPalette(palette)
            self._view.setAttribute(Qt.WA_OpaquePaintEvent, False)

            painter = QPainter(image)
            painter.setBackgroundMode(Qt.TransparentMode)
            self._page.mainFrame().render(painter)
            painter.end()
        else:
            if self.grabWholeWindow:
                # Note that this does not fully ensure that the
                # window still has the focus when the screen is
                # grabbed. This might result in a race condition.
                self._view.activateWindow()
                image = QPixmap.grabWindow(self._window.winId())
            else:
                image = QPixmap.grabWidget(self._window)

        return self._post_process_image(image)

    def _load_page(self, url, width, height, timeout):
        """
        This method implements the logic for retrieving and displaying
        the requested page.

        Raises RuntimeError when the timeout expires and BadURLException
        when the page reports a failed load.
        """
        # This is an event-based application. So we have to wait until
        # "loadFinished(bool)" raised.
        cancelAt = time.time() + timeout
        self.__loading = True
        # Default. Uses the same attribute name as _on_load_finished; the
        # original wrote a differently spelled attribute that nothing read.
        self.__loading_result = False

        # TODO: fromEncoded() needs to be used in some situations.  Some
        # sort of flag should be passed in to WebkitRenderer maybe?
        #self._page.mainFrame().load(QUrl.fromEncoded(url))
        self._page.mainFrame().load(QUrl(url))
        while self.__loading:
            if timeout > 0 and time.time() >= cancelAt:
                raise RuntimeError("Request timed out on %s" % url)
            while QApplication.hasPendingEvents() and self.__loading:
                QCoreApplication.processEvents()

        if self.logger:
            self.logger.debug("Processing result")

        if not self.__loading_result:
            if self.logger:
                self.logger.warning("Failed to load %s" % url)
            raise BadURLException("Failed to load %s" % url)

        # Set initial viewport (the size of the "window")
        size = self._page.mainFrame().contentsSize()
        if self.logger:
            self.logger.debug("contentsSize: %s", size)
        if width > 0:
            size.setWidth(width)
        if height > 0:
            size.setHeight(height)

        self._window.resize(size)

    def _post_process_image(self, qImage):
        """If 'scaleToWidth' or 'scaleToHeight' are set to a value
        greater than zero this method will scale the image
        using the method defined in 'scaleRatio'.
        """
        if self.scaleToWidth > 0 or self.scaleToHeight > 0:
            # Scale this image
            if self.scaleRatio == 'keep':
                ratio = Qt.KeepAspectRatio
            elif self.scaleRatio in ['expand', 'crop']:
                ratio = Qt.KeepAspectRatioByExpanding
            else:  # 'ignore'
                ratio = Qt.IgnoreAspectRatio
            qImage = qImage.scaled(
                self.scaleToWidth, self.scaleToHeight, ratio
            )
            if self.scaleRatio == 'crop':
                qImage = qImage.copy(
                    0, 0, self.scaleToWidth, self.scaleToHeight
                )
        return qImage

    def _on_each_reply(self, reply):
        """Logs each requested uri"""
        # Same guard as every other logging call in this class.
        if self.logger:
            self.logger.debug("Received %s" % (reply.url().toString()))

    # Eventhandler for "loadStarted()" signal
    def _on_load_started(self):
        """Slot that sets the '__loading' property to true."""
        if self.logger:
            self.logger.debug("loading started")
        self.__loading = True

    # Eventhandler for "loadFinished(bool)" signal
    def _on_load_finished(self, result):
        """Slot that sets the '__loading' property to false and stores
        the result code in '__loading_result'.
        """
        if self.logger:
            self.logger.debug("loading finished with result %s", result)
        self.__loading = False
        self.__loading_result = result

    # Eventhandler for "sslErrors(QNetworkReply *,const QList<QSslError>&)"
    # signal.
    def _on_ssl_errors(self, reply, errors):
        """Slot that writes SSL warnings into the log but ignores them."""
        for e in errors:
            if self.logger:
                self.logger.warning("SSL: " + e.errorString())
        reply.ignoreSslErrors()
class WebkitRenderer(QObject):
    """Load a URL in an off-screen QWebPage and paint it into a QImage."""

    # Initializes the QWebPage object and registers some slots
    def __init__(self):
        """Create the page, connect the load signals and reset the state."""
        # The QObject part must be initialised before self.connect() is used.
        QObject.__init__(self)
        logging.debug("Initializing class %s", self.__class__.__name__)
        self._page = QWebPage()
        self.connect(self._page, SIGNAL("loadFinished(bool)"),
                     self.__on_load_finished)
        self.connect(self._page, SIGNAL("loadStarted()"),
                     self.__on_load_started)

        # The way we will use this, it seems to be unnecessary to have
        # Scrollbars enabled
        self._page.mainFrame().setScrollBarPolicy(Qt.Horizontal,
                                                  Qt.ScrollBarAlwaysOff)
        self._page.mainFrame().setScrollBarPolicy(Qt.Vertical,
                                                  Qt.ScrollBarAlwaysOff)

        # Helper for multithreaded communication through signals
        self.__loading = False
        self.__loading_result = False

    # Loads "url" and renders it.
    # Returns QImage-object on success.
    def render(self, url, width=0, height=0, timeout=10):
        """Load ``url`` and return the rendered page as a QImage.

        ``width`` / ``height`` override the corresponding content dimension
        when greater than zero.  A ``timeout`` of zero or less disables the
        timeout check.

        Raises RuntimeError when the timeout expires or the load fails.
        """
        logging.debug("render(%s, timeout=%d)", url, timeout)

        # This is an event-based application. So we have to wait until
        # "loadFinished(bool)" raised.
        cancelAt = time.time() + timeout
        self._page.mainFrame().load(QUrl(url))
        while self.__loading:
            if timeout > 0 and time.time() >= cancelAt:
                raise RuntimeError("Request timed out")
            QCoreApplication.processEvents()

        logging.debug("Processing result")

        if not self.__loading_result:
            raise RuntimeError("Failed to load %s" % url)

        # Set initial viewport (the size of the "window")
        size = self._page.mainFrame().contentsSize()
        if width > 0:
            size.setWidth(width)
        if height > 0:
            size.setHeight(height)
        self._page.setViewportSize(size)

        # Paint this frame into an image
        image = QImage(self._page.viewportSize(), QImage.Format_ARGB32)
        painter = QPainter(image)
        self._page.mainFrame().render(painter)
        painter.end()

        return image

    # Eventhandler for "loadStarted()" signal
    def __on_load_started(self):
        """Slot: mark the page as loading."""
        logging.debug("loading started")
        self.__loading = True

    # Eventhandler for "loadFinished(bool)" signal
    def __on_load_finished(self, result):
        """Slot: clear the loading flag and store the load result."""
        logging.debug("loading finished with result %s", result)
        self.__loading = False
        self.__loading_result = result
class Crawler:
    """Download and cache ortholog data for pairs of species.

    The module-level ``run_name`` is used as the working directory for
    every cache file.

    NOTE(review): the container attributes below live on the class and are
    therefore shared by all instances -- confirm only one Crawler is used.
    """

    geneToOrthologs = {}
    geneToSpecies = {}
    geneSequences = {}
    geneFamilies = None  # A list of sets containing the proteins in that family
    allSpecies = None
    species1Names = None
    species2Names = None
    speciesPairs = []
    malformedXMLFiles = []

    def main(self):
        """Create the working directories, run every stage, then exit."""
        # The run directory comes first so that its children can be created.
        for subdir in ('', '/clustalin', '/clustalout', '/roundup',
                       '/mktest_out'):
            if not os.path.isdir(run_name + subdir):
                os.mkdir(run_name + subdir)
        self.load_species_names_list()
        self.fetch_uncached_orthologs()
        self.load_gene_list()
        self.find_gene_families()
        # self.output_gene_families()
        self.fetch_gene_sequences()
        self.align_families()
        self.mktest_families()
        exit(0)

    ############################################# load_species_name_list #############################################

    def load_species_names_list(self):
        """Load the species lists from the JSON cache, else from the web.

        Without a cache file the organism page is loaded in a QWebPage and
        the event loop is pumped until ``process_organism_list`` has filled
        ``allSpecies``.
        """
        cache_path = '%s/species_names.json' % run_name
        if os.path.isfile(cache_path):
            print("Loading cached species names...")
            with open(cache_path) as cache_file:
                sn = cjson.decode(cache_file.read())
            self.allSpecies = sn['allSpecies']
            self.species1Names = sn['species1Names']
            self.species2Names = sn['species2Names']
        else:
            print("Fetching species names...")
            self.webpage = QWebPage()
            self.webpage.loadFinished.connect(self.process_organism_list)
            self.webpage.mainFrame().load(QUrl('http://roundup.hms.harvard.edu/retrieve/'))
            while self.allSpecies is None:
                time.sleep(.05)
                appInstance.processEvents()

    def process_organism_list(self, bool):
        """Slot for loadFinished: read the species names and cache them.

        The ``bool`` argument (the load result) is not inspected.
        """
        organisms_query = 'select#id_genome_choices'
        organisms_element = self.webpage.mainFrame().findAllElements(organisms_query).at(0)
        elmt = organisms_element.firstChild()
        species = []
        # NOTE(review): the loop stops when it reaches the last child, so
        # the last child itself is never appended -- confirm this is intended.
        while not (elmt == organisms_element.lastChild()):
            species.append(str(elmt.attribute('value')))
            elmt = elmt.nextSibling()
        self.allSpecies = species
        self.species1Names = [s for s in species if is_species_1(s)]
        self.species2Names = [s for s in species if is_species_2(s)]
        s_cnt, s1_cnt, s2_cnt = len(self.allSpecies), len(self.species1Names), len(self.species2Names)
        print("Found %i species, %i of type 1 and %i of type 2." % (s_cnt, s1_cnt, s2_cnt))
        savedict = {'allSpecies': self.allSpecies,
                    'species1Names': self.species1Names,
                    'species2Names': self.species2Names}
        with open('%s/species_names.json' % run_name, 'w') as cache_file:
            cache_file.write(cjson.encode(savedict))

    ############################################# fetch_uncached_orthologs #############################################

    def fetch_uncached_orthologs(self):
        """Download every species pair that has no cached XML file yet."""
        self.downloader_pool = eventlet.greenpool.GreenPool(size=5)
        self.pairs_to_download = []
        bridge_pairs = bridges(self.species1Names, self.species2Names)
        print("Bridges:\n\t%s" % ('\n\t'.join(itertools.starmap(self.cache_name, bridge_pairs))))
        # n*(n-1) is always even, so floor division gives the exact count.
        combs1 = len(self.species1Names) * (len(self.species1Names) - 1) // 2
        combs2 = len(self.species2Names) * (len(self.species2Names) - 1) // 2
        self.speciesPairs.extend(bridge_pairs)
        self.speciesPairs.extend(itertools.combinations(self.species1Names, 2))
        self.speciesPairs.extend(itertools.combinations(self.species2Names, 2))
        print("That's %i combinations of species1, %i of species2, %i bridges." % (combs1, combs2, len(bridge_pairs)))
        numPairs = len(self.speciesPairs)
        for i, (l, r) in enumerate(self.speciesPairs):
            if i % 20 == 0:
                print("%i%% (%i/%i)\x1B[1F" % (int(i * 100.0 / numPairs), i, numPairs))
            if not os.path.isfile('%s/roundup/%s.xml' % (run_name, self.cache_name(l, r))):
                self.pairs_to_download.append((l, r))
        num_to_dl = len(self.pairs_to_download)
        print("Fetching %i uncached combinations of species..." % num_to_dl)
        pdp = self.downloader_pool.imap(self.fetch_pair, self.pairs_to_download)
        for done, response in enumerate(pdp, 1):
            cachename = self.cache_name(*response)
            print("%i%% (%i/%i): %s\x1B[1F" % (int(done * 100.0 / num_to_dl), done, num_to_dl, cachename))

    def cache_name(self, lSpecies, rSpecies):
        """Return a file-system safe cache name for a species pair.

        The two names are joined with ``---``; every character that is not
        an ASCII letter, a digit or one of ``-_.() `` is dropped.
        """
        name = lSpecies + '---' + rSpecies
        valid_chrs = '-_.() %s%s' % (string.ascii_letters, string.digits)
        return ''.join(c for c in name if c in valid_chrs)

    def fetch_pair(self, pair):
        """Fetch one ``(lSpecies, rSpecies)`` pair and return it as a tuple.

        The fetch is retried for as long as it raises ``urllib2.URLError``;
        there is no retry limit and no delay between attempts.
        """
        lSpecies, rSpecies = pair
        while True:
            try:
                self.attempt_fetch_pair((lSpecies, rSpecies))
                break
            except urllib2.URLError as e:
                print("Error fetching (%s,%s): %s" % (lSpecies, rSpecies, e))
        return (lSpecies, rSpecies)
#screen = QtGui.QDesktopWidget().screenGeometry() size = webpage.mainFrame().contentsSize() # Set the size of the (virtual) browser window webpage.setViewportSize(webpage.mainFrame().contentsSize()) # Paint this frame into an image image = QImage(webpage.viewportSize(), QImage.Format_ARGB32) painter = QPainter(image) webpage.mainFrame().render(painter) painter.end() image.save("/tmp/output.png") sys.exit(0) qtargs = [sys.argv[0]] qtargs.append("-display") qtargs.append(":0") app = QApplication(qtargs,True) #app = QApplication(sys.argv) signal.signal(signal.SIGINT, signal.SIG_DFL) webpage = QWebPage() webpage.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff) webpage.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff) webpage.connect(webpage, SIGNAL("loadFinished(bool)"), onLoadFinished) webpage.mainFrame().load(QUrl(sys.argv[1])) sys.exit(app.exec_())
import sys
import signal
import os

from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import QWebPage

app = QApplication(sys.argv)
# Restore the default SIGINT handler so that Ctrl+C ends the Qt event loop.
signal.signal(signal.SIGINT, signal.SIG_DFL)

webpage = QWebPage()


def onLoadFinished(result):
    """Slot for loadFinished(bool): paint the page to output.png and exit.

    Exits with status 1 when the load failed or the image could not be
    written, and with status 0 otherwise.
    """
    if not result:
        print("Request failed")
        sys.exit(1)

    # Make the viewport as large as the page so the whole page is painted.
    webpage.setViewportSize(webpage.mainFrame().contentsSize())
    image = QImage(webpage.viewportSize(), QImage.Format_ARGB32)
    painter = QPainter(image)
    webpage.mainFrame().render(painter)
    painter.end()

    if os.path.exists("output.png"):
        os.remove("output.png")
    if not image.save("output.png"):
        print("Could not write output.png")
        sys.exit(1)
    sys.exit(0)  # quit this application


# Connect the slot before the load is started so that the loadFinished
# signal cannot be missed.
webpage.connect(webpage, SIGNAL("loadFinished(bool)"), onLoadFinished)
webpage.mainFrame().load(QUrl("http://google.pl"))

sys.exit(app.exec_())