# -*- coding: utf-8 -*-
# Python 2 / pre-1.0 Scrapy. ContentExtractor, parseHTML, extractRssLinks
# and foreach are project-local helpers, not part of Scrapy; their imports
# depend on the surrounding project and are not shown here.
from itertools import ifilter, imap
from urlparse import urlsplit

from scrapy import log
from scrapy.http import Request, Response
from scrapy.spider import BaseSpider

class RssCrawl(BaseSpider):
  """Initialize a crawl with a starting page by dowloading a RSS feed and all
  its entries."""

  name = "dummy"

  def __init__(self, startAt, domain=None):
    """Instantiate for a given start url and domaine.

    @type  startAt: string
    @param startAt: the starting point of the crawl
    @type  domain: string
    @param domain: the domaine of the crawl
    """
    super(RssCrawl, self).__init__(None)
    if domain is None:
      (_, domain, _, _, _) = urlsplit(startAt)
    self.allowed_domains = (domain,)
    self.start_urls = (startAt,)
    self.contentExtractor = None
    self.bufferedPosts = list()
    self.name = "{0}@{1}".format(self.__class__.__name__, domain)

  def parse(self, response):
    """Extract the RSS feed Requests from the starting page Response.

    @type response: scrapy.http.response.html.HtmlResponse
    @param response: the starting page
    @rtype: scrapy.http.request.Request
    @return: the RSS feed Request
    """
    rssLinks = extractRssLinks(parseHTML(response.body), response.url)
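    # nextRequest consumes the iterator one feed at a time: if a feed
    # Request fails, its errback calls nextRequest again to try the next
    # candidate link, until the iterator is exhausted.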
    nextRequest = lambda _: Request(
      url=rssLinks.next(),
      callback=self.parseRss,
      errback=nextRequest,
      dont_filter=True)
    try:
      return nextRequest(None)
    except StopIteration:
      self.logError("No usable RSS feed.")

  def parseRss(self, response):
    """Extract entry Requests from the RSS feed.

    @type response: scrapy.http.response.html.HtmlResponse
    @param response: the RSS feed
    @rtype: generator of scrapy.http.request.Request
    @return: the entry Requests
    """
    self.logInfo("Feed: {0}".format(response.url))
    self.contentExtractor = ContentExtractor(response.body, self.logInfo)
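    # One Request per entry link found in the feed; bufferEntries collects
    # both successes (callback) and failures (errback).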
    return imap(
      lambda url: Request(
        url=url,
        callback=self.bufferEntries,
        errback=self.bufferEntries,
        dont_filter=True,
        # meta={ "u": _ } is here to keep a "safe" copy of the source url.
        # I don't trust response.url == (what was passed as Request url).
        meta={ "u": url }),
      self.contentExtractor.getRssLinks())

  def bufferEntries(self, response):
    """Buffer the RSS entry Responses; once they have all arrived, pass them
    to the overridable handleRssEntries method.

    @type response: scrapy.http.response.html.HtmlResponse or
      twisted.python.failure.Failure
    @param response: an RSS entry Response (or a Failure from the errback)
    @rtype: generator of scrapy.http.request.Request and scrapy.item.Item
    @return: the result of handleRssEntries, once every entry has arrived
    """
    self.bufferedPosts.append(response)
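    # Wait until one Response (or Failure) per entry link has arrived,
    # then keep only the real Responses before feeding the extractor.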
    if len(self.bufferedPosts) == len(self.contentExtractor.getRssLinks()):
      posts = tuple(ifilter(
        lambda _: isinstance(_, Response),
        self.bufferedPosts))
      foreach(lambda _: self.contentExtractor.feed(_.body, _.meta["u"]), posts)
      return self.handleRssEntries(posts)

  def logDebug(self, string):
    """Log with spider name at debug level.

    @type  string: string
    @param string: the string to log
    """
    self.log(string, log.DEBUG)

  def logInfo(self, string):
    """Log with spider name at info level.

    @type  string: string
    @param string: the string to log
    """
    self.log(string, log.INFO)

  def logWarning(self, string):
    """Log with spider name at warning level.

    @type  string: string
    @param string: the string to log
    """
    self.log(string, log.WARNING)

  def logError(self, string):
    """Log with spider name at error level.

    @type  string: string
    @param string: the string to log
    """
    self.log(string, log.ERROR)

  def logCritical(self, string):
    """Log with spider name at critical level.

    @type  string: string
    @param string: the string to log
    """
    self.log(string, log.CRITICAL)

  def handleRssEntries(self, posts):
    """Overridable method to process all RSS entry Responses.

    @type posts: tuple of scrapy.http.response.html.HtmlResponse
    @param posts: the RSS entry Responses
    @rtype: generator of scrapy.http.request.Request and scrapy.item.Item
    @return: the next requests and items to process
    """
    pass
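
# A minimal usage sketch, not part of the original source: subclassing
# RssCrawl and overriding handleRssEntries is how a concrete spider turns
# the buffered entry Responses into items. PostItem and its fields are
# hypothetical names chosen for illustration.
from scrapy.item import Item, Field

class PostItem(Item):
  url = Field()
  body = Field()

class BlogRssCrawl(RssCrawl):
  """Emit one PostItem per successfully fetched RSS entry."""

  def handleRssEntries(self, posts):
    for post in posts:
      # meta["u"] holds the "safe" copy of the entry URL saved in parseRss.
      yield PostItem(url=post.meta["u"], body=post.body)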