Code example #1
def extractComments(driver, commentXP, contentXP, authorXP, publishedXP):
  """Generic procedure to extract comments from precomputed xPaths.

  @type  driver: selenium.webdriver.phantomjs.webdriver.WebDriver
  @param driver: the driver
  @type  commentXP: string
  @param commentXP: the xPath to comment nodes
  @type  contentXP: string
  @param contentXP: the xPath to comment contents
  @type  authorXP: string
  @param authorXP: the xPath to comment authors
  @type  publishedXP: string
  @param publishedXP: the xPath to comment publication dates
  @rtype: tuple of CommentItem
  @return: the extracted comments
  """
  try:
    page = driver.find_element_by_xpath(".//body").get_attribute("innerHTML")
  except (ElementNotVisibleException, NoSuchElementException):
    return tuple()
  parentNodeXP = "./ancestor::" + commentXP[2:]
  getParentNode = lambda node: (node.xpath(parentNodeXP) + [None])[0]
  nodesMapComments = OrderedDict(imap(
    lambda node: (node, CommentItem(
      content=extractFirst(node, contentXP),
      author=extractFirst(node, authorXP),
      published=extractFirst(node, publishedXP),
      parent=getParentNode(node))),
    parseHTML(page).xpath(commentXP)))
  foreach(
    lambda cmmnt: cmmnt.__setattr__("parent", nodesMapComments[cmmnt.parent]),
    ifilter(lambda _: _.parent is not None, nodesMapComments.values()))
  return tuple(ifilter(lambda _: _.content, nodesMapComments.values()))
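Below is a minimal usage sketch for extractComments, assuming a PhantomJS driver as named in the docstring. The URL and all four XPath expressions are hypothetical placeholders, not values taken from the project.

from selenium import webdriver

driver = webdriver.PhantomJS()
driver.get("http://example.com/some-post")      # hypothetical blog post URL
comments = extractComments(
  driver,
  commentXP="//div[@class='comment']",          # hypothetical comment node XPath
  contentXP=".//div[@class='comment-body']",    # hypothetical content XPath
  authorXP=".//span[@class='comment-author']",  # hypothetical author XPath
  publishedXP=".//time[@class='published']")    # hypothetical date XPath
driver.quit()
for comment in comments:
  print comment.author, comment.content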
Code example #2
  def process_item(self, item, spider):
    gcenabled = gc.isenabled()
    gc.disable()
    try:
      contentExtractor = lambda _: spider.contentExtractor(parseHTML(_))
      boilerpipeExtractor = lambda _: Extractor(html=_).getText()
      gooseExtractor = lambda _: Goose().extract(raw_html=_).cleaned_text
      readabilityExtractor = lambda _: cleanTags(Document(_).summary())

      # CE, BP, GO, RE
      ntimes = range(11)
      contents = map(
        lambda _: timeMeThis(partial(contentExtractor, item.rawHtml)),
        ntimes)
      boilerpipes = map(
        lambda _: timeMeThis(partial(boilerpipeExtractor, item.rawHtml)),
        ntimes)
      gooses = map(
        lambda _: timeMeThis(partial(gooseExtractor, item.rawHtml)),
        ntimes)
      readabilitys = map(
        lambda _: timeMeThis(partial(readabilityExtractor, item.rawHtml)),
        ntimes)

      log.msg("{} {} {} {} {} {} {} {}".format(
        mean(contents), std(contents),
        mean(boilerpipes), std(boilerpipes),
        mean(gooses), std(gooses),
        mean(readabilitys), std(readabilitys)
      ))
    finally:
      if gcenabled:
        gc.enable()
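The snippet above benchmarks four extractors (the project's contentExtractor plus boilerpipe, goose and readability) by timing each one eleven times and logging the means and standard deviations. A sketch of the imports it presumably relies on follows; the module paths are assumptions based on the usual Python packages for these libraries, and parseHTML, cleanTags and timeMeThis look like project-internal helpers that are not shown here.

import gc
from functools import partial

from numpy import mean, std                # assumed source of mean/std
from boilerpipe.extract import Extractor   # python-boilerpipe
from goose import Goose                     # python-goose
from readability import Document            # readability-lxml
from scrapy import log                      # older Scrapy versions expose log.msg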
Code example #3
File: updatecrawl.py Project: BlogForever/crawler
  def handleRssEntries(self, posts):
    """Process the new RSS entry Responses.

    @type posts: collection of scrapy.http.response.html.HtmlResponse
    @param posts: the RSS entries Responses
    @rtype: generator of scrapy.item.Item
    @return: the next items to process
    """
    return (
      PostItem(url=_.url, parsedBodies=(parseHTML(_.body),)) for _ in posts
      if _.meta["u"] in self.newRssLinks)
Code example #4
  def _refresh(self):
    """Refreshes the XPaths with the current pages. Called internally once per
    feed+ __call__ sequence."""
    self.needsRefresh = False

    pageUrls = tuple(imap(lambda (url, _): url, self.urlZipPages))
    entries = sorted(
      ifilter(lambda _: _.link in pageUrls, self.rssEntries),
      key=lambda _: _.link)
    parsedPages = tuple(imap(
      lambda (_, page): parseHTML(page),
      sorted(
        ifilter(lambda (url, _): url in self.rssLinks, self.urlZipPages),
        key=lambda (url, _): url)))
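The lambda (url, _): url style above uses tuple parameter unpacking, a Python 2 only feature removed by PEP 3113. For comparison only, a roughly equivalent Python 3 formulation of the pageUrls and parsedPages assignments could look like this:

from operator import itemgetter

pageUrls = tuple(url for url, _ in self.urlZipPages)
parsedPages = tuple(
  parseHTML(page)
  for url, page in sorted(
    (pair for pair in self.urlZipPages if pair[0] in self.rssLinks),
    key=itemgetter(0)))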
Code example #5
File: rsscrawl.py Project: BlogForever/crawler
  def parse(self, response):
    """Extract the RSS feed Requests from the starting page Response.

    @type response: scrapy.http.response.html.HtmlResponse
    @param response: the starting page
    @rtype: scrapy.http.request.Request
    @return: the RSS feed Request
    """
    rssLinks = extractRssLinks(parseHTML(response.body), response.url)
    nextRequest = lambda _: Request(
      url=rssLinks.next(),
      callback=self.parseRss,
      errback=nextRequest,
      dont_filter=True)
    try:
      return nextRequest(None)
    except StopIteration:
      self.logError("No usable RSS feed.")
Code example #6
File: newcrawl.py Project: BlogForever/crawler
  def crawl(self, response):
    """Recursive crawling function emitting both PostItems in the item
    pipeline and further requests to be crawled.
    """
    parsedBody = parseHTML(response.body)
    if self.maxDownloads and self.downloadsSoFar > self.maxDownloads:
      reactor.stop()
    elif self.isBlogPost(response.url):
      # self.logInfo("> " + response.url)
      self.downloadsSoFar += 1
      yield PostItem(url=response.url, parsedBodies=(parsedBody,))

    newUrls = set(ifilter(
      lambda _: _ not in self.seen,
      extractLinks(parsedBody)))
    self.seen.update(newUrls)
    self.priorityHeuristic.feed(response.url, newUrls)
    for newUrl in newUrls:
      yield Request(
        url=newUrl,
        callback=self.crawl,
        priority=self.priorityHeuristic(newUrl))
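The snippet only shows the interface of self.priorityHeuristic: it is fed each crawled URL together with the newly discovered links, and is then called on a single URL to produce a Scrapy request priority. The object below is a purely hypothetical stand-in with that interface, shown to clarify the calling convention rather than the project's actual heuristic.

class ConstantPriorityHeuristic(object):
  """Hypothetical stand-in: accepts feedback but ranks every URL equally."""

  def feed(self, url, newUrls):
    # The real heuristic presumably updates per-site statistics here.
    pass

  def __call__(self, url):
    # Scrapy schedules higher-priority requests first; 0 is the default.
    return 0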
Code example #7
  def crawl(self, response):
    """Parse a single page Response into a PostItem for the URL in its meta."""
    self.logInfo("START:" + response.meta["u"])
    parsedBody = parseHTML(response.body)
    return PostItem(url=response.meta["u"], parsedBodies=(parsedBody,))