Ejemplo n.º 1
0
class NewCrawl(RssCrawl):
  """newcrawl"""
  name = "newcrawl"

  def __init__(self, startat, maxdownloads=None):
    """Instantiate a newcrawl spider for a given start url maxdownloads.

    @type  startat: string
    @param startat: the starting point of the crawl
    @type  maxdownloads: integer
    @param maxdownloads: the maximum number of pages to download
    """
    super(self.__class__, self).__init__(startat)
    self.maxDownloads = maxdownloads
    self.downloadsSoFar = 0
    self.seen = set()
    self.isBlogPost = None
    self.priorityHeuristic = None

  def handleRssEntries(self, posts):
    """Handles all web-feed entries."""
    self.isBlogPost = buildUrlFilter(
      imap(lambda _: _.url, posts),
      self.logDebug)
    self.priorityHeuristic = PriorityHeuristic(self.isBlogPost)
    return iflatmap(lambda _: self.crawl(_), posts)

  def crawl(self, response):
    """Recursive crawling function emitting both PostItems in the item
    pipeline and further requests to be crawled.
    """
    parsedBody = parseHTML(response.body)
    if self.maxDownloads and self.downloadsSoFar > self.maxDownloads:
      reactor.stop()
    elif self.isBlogPost(response.url):
      # self.logInfo("> " + response.url)
      self.downloadsSoFar += 1
      yield PostItem(url=response.url, parsedBodies=(parsedBody,))

    newUrls = set(ifilter(
      lambda _: _ not in self.seen,
      extractLinks(parsedBody)))
    self.seen.update(newUrls)
    self.priorityHeuristic.feed(response.url, newUrls)
    for newUrl in newUrls:
      yield Request(
        url=newUrl,
        callback=self.crawl,
        priority=self.priorityHeuristic(newUrl))
Ejemplo n.º 2
0
 def handleRssEntries(self, posts):
   """Handles all web-feed entries."""
   self.isBlogPost = buildUrlFilter(
     imap(lambda _: _.url, posts),
     self.logDebug)
   self.priorityHeuristic = PriorityHeuristic(self.isBlogPost)
   return iflatmap(lambda _: self.crawl(_), posts)