def extractComments(driver, commentXP, contentXP, authorXP, publishedXP):
    """Generic procedure to extract comments from precomputed XPaths.

    @type  driver: selenium.webdriver.phantomjs.webdriver.WebDriver
    @param driver: the driver
    @type  commentXP: string
    @param commentXP: the XPath to the comment nodes
    @type  contentXP: string
    @param contentXP: the XPath to the comment contents
    @type  authorXP: string
    @param authorXP: the XPath to the comment authors
    @type  publishedXP: string
    @param publishedXP: the XPath to the comment publication dates
    @rtype: tuple of CommentItem
    @return: the extracted comments
    """
    try:
        page = driver.find_element_by_xpath(".//body").get_attribute("innerHTML")
    except (ElementNotVisibleException, NoSuchElementException):
        return tuple()
    # A comment's parent is its closest ancestor that is itself a comment node
    # (commentXP is assumed to start with "//", hence the [2:] slice).
    parentNodeXP = "./ancestor::" + commentXP[2:]
    getParentNode = lambda node: (node.xpath(parentNodeXP) + [None])[0]
    # Map each comment node to its CommentItem, preserving document order.
    nodesMapComments = OrderedDict(imap(
        lambda node: (node, CommentItem(
            content=extractFirst(node, contentXP),
            author=extractFirst(node, authorXP),
            published=extractFirst(node, publishedXP),
            parent=getParentNode(node))),
        parseHTML(page).xpath(commentXP)))
    # Replace each parent node reference by the corresponding CommentItem.
    foreach(
        lambda cmmnt: cmmnt.__setattr__("parent", nodesMapComments[cmmnt.parent]),
        ifilter(lambda _: _.parent is not None, nodesMapComments.values()))
    return tuple(ifilter(lambda _: _.content, nodesMapComments.values()))
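# Hedged usage sketch (not part of the original module): the URL and the
# XPaths below are illustrative assumptions, and PhantomJS must be available
# on the PATH. It only shows how the signature above could be exercised.
def _extractCommentsExample():
    from selenium import webdriver

    driver = webdriver.PhantomJS()
    try:
        driver.get("http://example.com/some-post")
        comments = extractComments(
            driver,
            commentXP="//div[@class='comment']",
            contentXP=".//div[@class='comment-body']",
            authorXP=".//span[@class='comment-author']",
            publishedXP=".//time/@datetime")
        for comment in comments:
            print("%s (%s): %s" % (comment.author, comment.published, comment.content))
    finally:
        driver.quit()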
def process_item(self, item, spider):
    gcenabled = gc.isenabled()
    gc.disable()
    try:
        # The four extractors under comparison: the spider's own content
        # extractor (CE), Boilerpipe (BP), Goose (GO) and Readability (RE).
        contentExtractor = lambda _: spider.contentExtractor(parseHTML(_))
        boilerpipeExtractor = lambda _: Extractor(html=_).getText()
        gooseExtractor = lambda _: Goose().extract(raw_html=_).cleaned_text
        readabilityExtractor = lambda _: cleanTags(Document(_).summary())
        # Time each extractor over the same raw HTML, eleven runs apiece.
        ntimes = range(11)
        contents = map(
            lambda _: timeMeThis(partial(contentExtractor, item.rawHtml)),
            ntimes)
        boilerpipes = map(
            lambda _: timeMeThis(partial(boilerpipeExtractor, item.rawHtml)),
            ntimes)
        gooses = map(
            lambda _: timeMeThis(partial(gooseExtractor, item.rawHtml)),
            ntimes)
        readabilitys = map(
            lambda _: timeMeThis(partial(readabilityExtractor, item.rawHtml)),
            ntimes)
        log.msg("{} {} {} {} {} {} {} {}".format(
            mean(contents), std(contents),
            mean(boilerpipes), std(boilerpipes),
            mean(gooses), std(gooses),
            mean(readabilitys), std(readabilitys)))
    finally:
        if gcenabled:
            gc.enable()
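# Hedged configuration sketch (assumed names): for this benchmark to run, the
# pipeline class must be enabled in the Scrapy project settings. The module
# path and class name below are illustrative, not taken from the project, and
# depending on the Scrapy version ITEM_PIPELINES is either a list of dotted
# paths or a dict mapping paths to priorities, e.g. in settings.py:
#
#     ITEM_PIPELINES = {
#         "myproject.pipelines.BenchmarkPipeline": 300,
#     }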
def handleRssEntries(self, posts):
    """Process the new RSS entry Responses.

    @type  posts: collection of scrapy.http.response.html.HtmlResponse
    @param posts: the RSS entries Responses
    @rtype: generator of scrapy.item.Item
    @return: the next items to process
    """
    return (
        PostItem(url=_.url, parsedBodies=(parseHTML(_.body),))
        for _ in posts if _.meta["u"] in self.newRssLinks)
def _refresh(self):
    """Refreshes the XPaths with the current pages.

    Called internally once per feed + __call__ sequence.
    """
    self.needsRefresh = False
    pageUrls = tuple(imap(lambda (url, _): url, self.urlZipPages))
    # Sort both the RSS entries and the parsed pages by URL so that they
    # line up index by index.
    entries = sorted(
        ifilter(lambda _: _.link in pageUrls, self.rssEntries),
        key=lambda _: _.link)
    parsedPages = tuple(imap(
        lambda (_, page): parseHTML(page),
        sorted(
            ifilter(lambda (url, _): url in self.rssLinks, self.urlZipPages),
            key=lambda (url, _): url)))
def parse(self, response):
    """Extract the RSS feed Requests from the starting page Response.

    @type  response: scrapy.http.response.html.HtmlResponse
    @param response: the starting page
    @rtype: scrapy.http.request.Request
    @return: the RSS feed Request
    """
    rssLinks = extractRssLinks(parseHTML(response.body), response.url)
    # Each candidate feed is tried in turn: on failure, the errback simply
    # requests the next candidate until the iterator is exhausted.
    nextRequest = lambda _: Request(
        url=rssLinks.next(),
        callback=self.parseRss,
        errback=nextRequest,
        dont_filter=True)
    try:
        return nextRequest(None)
    except StopIteration:
        self.logError("No usable RSS feed.")
def crawl(self, response):
    """Recursive crawling function emitting both PostItems in the item
    pipeline and further requests to be crawled.
    """
    parsedBody = parseHTML(response.body)
    if self.maxDownloads and self.downloadsSoFar > self.maxDownloads:
        reactor.stop()
    elif self.isBlogPost(response.url):
        # self.logInfo("> " + response.url)
        self.downloadsSoFar += 1
        yield PostItem(url=response.url, parsedBodies=(parsedBody,))
    newUrls = set(ifilter(
        lambda _: _ not in self.seen, extractLinks(parsedBody)))
    self.seen.update(newUrls)
    self.priorityHeuristic.feed(response.url, newUrls)
    for newUrl in newUrls:
        yield Request(
            url=newUrl,
            callback=self.crawl,
            priority=self.priorityHeuristic(newUrl))
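# Hedged sketch (not the project's heuristic): crawl() only assumes that the
# priority heuristic exposes feed(responseUrl, newUrls) and is callable on a
# URL, returning an integer Scrapy priority. A trivial stand-in satisfying
# that interface could simply boost URLs that already look like blog posts:
class NaivePriorityHeuristic(object):
    def __init__(self, isBlogPost):
        self.isBlogPost = isBlogPost

    def feed(self, responseUrl, newUrls):
        # The real heuristic presumably learns from the pages crawled so far;
        # this stand-in is stateless and ignores the feedback.
        pass

    def __call__(self, url):
        return 1 if self.isBlogPost(url) else 0

# e.g. self.priorityHeuristic = NaivePriorityHeuristic(self.isBlogPost)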
def crawl(self, response):
    """Parse a downloaded page and emit the corresponding PostItem."""
    self.logInfo("START:" + response.meta["u"])
    parsedBody = parseHTML(response.body)
    return PostItem(url=response.meta["u"], parsedBodies=(parsedBody,))