Ejemplo n.º 1
0
def extractComments(driver, commentXP, contentXP, authorXP, publishedXP):
  """Generic procedure to extract comments from precomputed xPaths.

  The inner HTML of the page body is read through the driver, parsed with
  parseHTML, and one CommentItem is built per node matched by commentXP.
  Each comment is linked to the comment of its nearest enclosing comment
  node (None for a comment that is not nested in another one). Comments
  whose content is empty are left out of the result, but they can still be
  referenced as the parent of a returned comment.

  @type  driver: selenium.webdriver.phantomjs.webdriver.WebDriver
  @param driver: the driver
  @type  commentXP: string
  @param commentXP: the xPath to the comment nodes
  @type  contentXP: string
  @param contentXP: the xPath to comment contents
  @type  authorXP: string
  @param authorXP: the xPath to comment authors
  @type  publishedXP: string
  @param publishedXP: the xPath to comment publication dates
  @rtype: tuple of CommentItem
  @return: the extracted comments, in the order commentXP matched them; an
    empty tuple when the page body cannot be looked up
  """
  try:
    page = driver.find_element_by_xpath(".//body").get_attribute("innerHTML")
  except (ElementNotVisibleException, NoSuchElementException):
    return tuple()
  # First pass: one comment per matched node, keyed by the node itself so
  # that the second pass can translate parent nodes into parent comments.
  nodesMapComments = OrderedDict(
    (node, CommentItem(
      content=extractFirst(node, contentXP),
      author=extractFirst(node, authorXP),
      published=extractFirst(node, publishedXP),
      parent=None))
    for node in parseHTML(page).xpath(commentXP))
  # Second pass: link every comment to its parent comment.
  # The previous implementation queried "./ancestor::" + commentXP[2:] and
  # took the first hit. That only builds a valid query when commentXP is a
  # single step prefixed with "//", and since xpath() results come back in
  # document order the first hit was the outermost ancestor, not the parent.
  # Walking the ancestors nearest-first avoids both problems.
  # NOTE(review): relies on the nodes being lxml elements (the code already
  # called node.xpath and used the nodes as dictionary keys) -- confirm.
  for node, comment in nodesMapComments.items():
    for ancestor in node.iterancestors():
      if ancestor in nodesMapComments:
        comment.parent = nodesMapComments[ancestor]
        break
  return tuple(
    comment for comment in nodesMapComments.values() if comment.content)
Ejemplo n.º 2
0
  def __call__(self, parsedPage):
    """Extracts content from a page.

    When self.needsRefresh is set, self._refresh() is called first. Then
    extractFirst is applied to the page once per entry of self.xPaths.

    @type  parsedPage: lxml.etree._Element
    @param parsedPage: the web page where content is extracted
    @rtype: tuple of strings
    @return: the extracted content, one item per xPath in self.xPaths and in
      the same order
    """
    if self.needsRefresh:
      self._refresh()
    extracted = [extractFirst(parsedPage, xPath) for xPath in self.xPaths]
    return tuple(extracted)