Ejemplo n.º 1
0
  def __init__(self, storeUri, phantomjsPath):
    """Build the files store for storeUri and the WebdriverPool for
    phantomjsPath.

    @type  storeUri: string
    @param storeUri: the storeUri, handed to FSFilesStore
    @type  phantomjsPath: string
    @param phantomjsPath: the phantomjsPath, handed to WebdriverPool
    """
    filesStore = FSFilesStore(storeUri)
    self.store = filesStore
    driverPool = WebdriverPool(phantomjsPath)
    self.webdrivers = driverPool
Ejemplo n.º 2
0
class RenderJavascript(object):
  """Renders the page with JavaScript, takes a screenshot and extracts Disqus
  comments if present.

  NOTE(review): from_settings / process_item / close_spider match the hooks
  of a Scrapy item pipeline -- confirm this class is registered as one.
  """

  def __init__(self, storeUri, phantomjsPath):
    """Instantiate for a given storeUri. Creates the WebdriverPool.

    @type  storeUri: string
    @param storeUri: the storeUri, handed to FSFilesStore
    @type  phantomjsPath: string
    @param phantomjsPath: the phantomjsPath, handed to WebdriverPool
    """
    self.store = FSFilesStore(storeUri)
    self.webdrivers = WebdriverPool(phantomjsPath)

  @classmethod
  def from_settings(cls, settings):
    """Instantiate with storeUri and phantomjsPath from settings.

    @type  settings: scrapy.settings.Settings
    @param settings: the settings; FILES_STORE and PHANTOMJS_PATH are read
    @rtype: RenderJavascript
    @return: the instantiated class
    @raise CloseSpider: if FILES_STORE or PHANTOMJS_PATH is missing or empty
    """
    if not settings['FILES_STORE']:
      raise CloseSpider("FILES_STORE setting needed to save screenshots.")
    if not settings['PHANTOMJS_PATH']:
      raise CloseSpider("PHANTOMJS_PATH setting needed to save screenshots.")
    return cls(settings['FILES_STORE'], settings['PHANTOMJS_PATH'])

  def close_spider(self, _):
    """Stops the WebdriverPool.

    @type  _: scrapy.spider.BaseSpider
    @param _: the spider being closed (unused)
    """
    self.webdrivers.stop()

  def process_item(self, item, _):
    """JavaScript render item's page in a worker thread (deferToThread).
    Populates item.screenshot and item.comments if appropriate.

    @type  item: bibcrawl.model.postitem.PostItem
    @param item: the item to process
    @type  _: scrapy.spider.BaseSpider
    @param _: the spider that emitted this item (unused)
    @rtype: twisted.internet.defer.Deferred
    @return: a deferred firing with the processed item
    """
    deferred = deferToThread(self.phantomJSProcess, item)
    # Best effort: a rendering failure must not drop the item, so any error is
    # swallowed and the item is passed on with whatever fields were populated
    # before the failure.
    deferred.addErrback(lambda _: item)
    return deferred

  def phantomJSProcess(self, item):
    """Acquires a driver from the pool, loads the page, downloads the Disqus
    comments, saves a screenshot and releases the driver.

    The driver is released even when one of the steps raises; otherwise each
    failing page would permanently take a driver out of the pool.

    @type  item: bibcrawl.model.postitem.PostItem
    @param item: the item to process
    @rtype: bibcrawl.model.postitem.PostItem
    @return: the processed item
    """
    driver = self.webdrivers.acquire()
    try:
      driver.get(item.url)
      # NOTE: Livefyre extraction (livefyreComments) is currently disabled.
      item.comments = disqusComments(driver)
      self.saveScreenshot(item, driver)
    finally:
      self.webdrivers.release(driver)
    return item

  def saveScreenshot(self, item, driver):
    """Save a screenshot of the current page under the store key
    screen/<HASH>.png, where <HASH> is the SHA-1 hex digest of item.url, and
    record that key in item.screenshot.

    @type  item: bibcrawl.model.postitem.PostItem
    @param item: the item to process
    @type  driver: selenium.webdriver.phantomjs.webdriver.WebDriver
    @param driver: the driver
    """
    url = item.url
    # sha1 needs bytes: encode text URLs explicitly so that non-ASCII URLs do
    # not fail. Byte strings and ASCII URLs hash exactly as before.
    if not isinstance(url, bytes):
      url = url.encode('utf-8')
    uid = sha1(url).hexdigest()
    png = StringIO(driver.get_screenshot_as_png())
    key = 'screen/{0}.png'.format(uid)
    self.store.persist_file(key, png, None)
    item.screenshot = key