def extractComments(driver, commentXP, contentXP, authorXP, publishedXP):
    """Builds comment items from the page currently held by the driver.

    The inner HTML of the page body is read from the driver and parsed, and
    one CommentItem is created for every node matched by C{commentXP}.
    Parent links are first stored as nodes and then replaced by the
    CommentItem created for that node. Comments whose content is falsy are
    left out of the result.

    @type driver: selenium.webdriver.phantomjs.webdriver.WebDriver
    @param driver: the driver
    @type commentXP: string
    @param commentXP: the xPath to the comment nodes
    @type contentXP: string
    @param contentXP: the xPath to comment contents
    @type authorXP: string
    @param authorXP: the xPath to comment authors
    @type publishedXP: string
    @param publishedXP: the xPath to comment publication dates
    @rtype: tuple of CommentItem
    @return: the extracted comments; an empty tuple when reading the body
        raises ElementNotVisibleException or NoSuchElementException
    """
    try:
        body = driver.find_element_by_xpath(".//body")
        bodyHTML = body.get_attribute("innerHTML")
    except (ElementNotVisibleException, NoSuchElementException):
        return ()

    # The first two characters of commentXP are dropped (presumably a leading
    # "//" -- confirm with callers) so the rest can follow the ancestor axis.
    ancestorXP = "./ancestor::" + commentXP[2:]

    def firstAncestor(node):
        # The trailing None makes index 0 safe when nothing matches.
        # NOTE(review): index 0 is the first match as returned by xpath(); if
        # that list is in document order this is the outermost matching
        # ancestor rather than the nearest one -- confirm that is intended.
        candidates = node.xpath(ancestorXP) + [None]
        return candidates[0]

    # Insertion order follows the order of the nodes returned by xpath().
    commentsByNode = OrderedDict()
    for commentNode in parseHTML(bodyHTML).xpath(commentXP):
        commentsByNode[commentNode] = CommentItem(
            content=extractFirst(commentNode, contentXP),
            author=extractFirst(commentNode, authorXP),
            published=extractFirst(commentNode, publishedXP),
            parent=firstAncestor(commentNode))

    def linkParent(comment):
        # Swap the parent node for the CommentItem built from that node.
        comment.parent = commentsByNode[comment.parent]

    # foreach is defined elsewhere; presumably it applies linkParent to every
    # element of the iterable -- verify against its definition.
    foreach(
        linkParent,
        (comment for comment in commentsByNode.values()
         if comment.parent is not None))

    return tuple(
        comment for comment in commentsByNode.values() if comment.content)
def __call__(self, parsedPage):
    """Evaluates every stored xPath against a parsed page.

    When C{self.needsRefresh} is true, C{self._refresh()} is called before
    anything is extracted.

    @type parsedPage: lxml.etree._Element
    @param parsedPage: the web page where content is extracted
    @rtype: tuple of strings
    @return: one extractFirst result per entry of C{self.xPaths}, in the
        same order
    """
    if self.needsRefresh:
        self._refresh()
    return tuple(extractFirst(parsedPage, xPath) for xPath in self.xPaths)