def __init__(self, storeUri, phantomjsPath):
    """Set up the file store and the WebdriverPool used by this pipeline.

    @type storeUri: string
    @param storeUri: the URI handed to FSFilesStore
    @type phantomjsPath: string
    @param phantomjsPath: the path handed to WebdriverPool
    """
    # Build both collaborators first (same order as before), then bind them.
    files_store = FSFilesStore(storeUri)
    driver_pool = WebdriverPool(phantomjsPath)
    self.store, self.webdrivers = files_store, driver_pool
class RenderJavascript(object):
    """Renders the page with JavaScript, takes a screenshot and extracts
    Disqus comments if present (Livefyre extraction is currently disabled)."""

    def __init__(self, storeUri, phantomjsPath):
        """Instantiate for a given storeUri. Creates the WebdriverPool.

        @type storeUri: string
        @param storeUri: the storeUri
        @type phantomjsPath: string
        @param phantomjsPath: the phantomjsPath
        """
        self.store = FSFilesStore(storeUri)
        self.webdrivers = WebdriverPool(phantomjsPath)

    @classmethod
    def from_settings(cls, settings):
        """Instantiate with storeUri and phantomjsPath from settings.

        @type settings: scrapy.settings.Settings
        @param settings: the settings
        @rtype: RenderJavascript
        @return: the instantiated class
        @raise CloseSpider: if FILES_STORE or PHANTOMJS_PATH is missing or
            empty in the settings
        """
        if not settings['FILES_STORE']:
            raise CloseSpider("FILES_STORE setting needed to save screenshots.")
        if not settings['PHANTOMJS_PATH']:
            raise CloseSpider(
                "PHANTOMJS_PATH setting needed to save screenshots.")
        return cls(settings['FILES_STORE'], settings['PHANTOMJS_PATH'])

    def close_spider(self, _):
        """Closes the WebdriverPool.

        @type _: scrapy.spider.BaseSpider
        @param _: the spider being closed (unused)
        """
        self.webdrivers.stop()

    def process_item(self, item, _):
        """JavaScript render item's page in a new thread. Populates
        item.screenshot and item.comments if appropriate.

        Rendering is best effort: when it fails the failure is logged and the
        item is still passed on, possibly only partially populated.

        @type item: bibcrawl.model.postitem.PostItem
        @param item: the item to process
        @type _: scrapy.spider.BaseSpider
        @param _: the spider that emitted this item
        @rtype: twisted.internet.defer.Deferred
        @return: a deferred firing with the processed item
        """
        deferred = deferToThread(self.phantomJSProcess, item)
        # Extra positional arguments are forwarded to the errback after the
        # failure, so the item can be handed back to the pipeline.
        deferred.addErrback(self._renderFailed, item)
        return deferred

    def _renderFailed(self, failure, item):
        """Logs a rendering failure and hands the item back to the pipeline.

        @type failure: twisted.python.failure.Failure
        @param failure: the failure raised while rendering
        @type item: bibcrawl.model.postitem.PostItem
        @param item: the item that was being processed
        @rtype: bibcrawl.model.postitem.PostItem
        @return: the item, unchanged by this method
        """
        import logging
        logging.getLogger(__name__).warning(
            "JavaScript rendering failed for %s: %s",
            getattr(item, 'url', None), failure.getErrorMessage())
        return item

    def phantomJSProcess(self, item):
        """Acquires a PhantomJS driver from the pool, loads the page,
        downloads the Disqus comments present, saves a screenshot and
        releases the driver.

        The driver is released even when one of the steps raises, so a failing
        page cannot drain the pool.

        @type item: bibcrawl.model.postitem.PostItem
        @param item: the item to process
        @rtype: bibcrawl.model.postitem.PostItem
        @return: the processed item
        """
        driver = self.webdrivers.acquire()
        try:
            driver.get(item.url)
            # Livefyre extraction is disabled: + livefyreComments(driver)
            item.comments = disqusComments(driver)
            self.saveScreenshot(item, driver)
        finally:
            self.webdrivers.release(driver)
        return item

    def saveScreenshot(self, item, driver):
        """Save a screenshot of the current page in
        storeUri://screen/<HASH>.png, where <HASH> is the SHA-1 hex digest of
        the item's URL, and record that key in item.screenshot.

        @type item: bibcrawl.model.postitem.PostItem
        @param item: the item to process
        @type driver: selenium.webdriver.phantomjs.webdriver.WebDriver
        @param driver: the driver
        """
        url = item.url
        # sha1 only accepts bytes; text URLs (possibly non-ASCII) are encoded
        # first. Byte-string URLs hash exactly as before.
        if not isinstance(url, bytes):
            url = url.encode('utf-8')
        uid = sha1(url).hexdigest()
        png = StringIO(driver.get_screenshot_as_png())
        key = 'screen/{0}.png'.format(uid)
        self.store.persist_file(key, png, None)
        item.screenshot = key