Python ContentExtractor Examples

Programming Language: Python

Namespace/Package Name: bibcrawl.utils.contentextractor

Class/Type: ContentExtractor

Examples at hotexamples.com: 2

Python ContentExtractor - 2 examples found. These are the top-rated real-world Python examples of bibcrawl.utils.contentextractor.ContentExtractor, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

feed(1)

getRssLinks(1)

Example #1

Show file

File: rsscrawl.py Project: BlogForever/crawler

  def parseRss(self, response):
    """Turn a downloaded RSS feed into one Request per feed entry.

    A ContentExtractor is built from the feed body and stored on the spider
    as self.contentExtractor. Every link it reports through getRssLinks() is
    lazily mapped to a Request whose callback and errback are both
    self.bufferEntries.

    @type response: scrapy.http.response.html.HtmlResponse
    @param response: the RSS feed
    @rtype: iterator of scrapy.http.request.Request
    @return: the entry Requests
    """
    self.logInfo("Feed: {0}".format(response.url))
    extractor = ContentExtractor(response.body, self.logInfo)
    self.contentExtractor = extractor

    def entryRequest(link):
      # The requested url is stashed in meta["u"] as a "safe" copy: the
      # response.url seen later is not trusted to equal what was requested.
      return Request(
        url=link,
        callback=self.bufferEntries,
        errback=self.bufferEntries,
        dont_filter=True,
        meta={"u": link})

    return imap(entryRequest, extractor.getRssLinks())

Example #2

Show file

File: rsscrawl.py Project: BlogForever/crawler

class RssCrawl(BaseSpider):
  """Initialize a crawl with a starting page by downloading an RSS feed and
  all its entries.

  The flow is: parse (starting page) -> parseRss (feed) -> bufferEntries
  (once per entry) -> handleRssEntries (once, with all buffered entries).
  """

  # Placeholder; replaced per instance in __init__ with "<ClassName>@<domain>".
  name = "dummy"

  def __init__(self, startAt, domain=None):
    """Instantiate for a given start url and domain.

    @type  startAt: string
    @param startAt: the starting point of the crawl
    @type  domain: string
    @param domain: the domain of the crawl; when None it is taken from the
    network location part of startAt
    """
    super(RssCrawl, self).__init__(None)
    if domain is None:
      (_, domain, _, _, _) = urlsplit(startAt)
    self.allowed_domains = (domain ,)
    self.start_urls = (startAt, )
    # Set by parseRss once the feed has been downloaded.
    self.contentExtractor = None
    # Everything received by bufferEntries (responses and download errors).
    self.bufferedPosts = list()
    self.name = "{0}@{1}".format(self.__class__.__name__, domain)

  def parse(self, response):
    """Extract the RSS feed Request from the starting page Response.

    Candidate feed urls are tried one at a time: the returned Request uses
    the same helper as its errback, so a failed feed download moves on to
    the next candidate. When no candidate is left, an error is logged and
    None is returned, both on the first attempt and from the errback.

    @type response: scrapy.http.response.html.HtmlResponse
    @param response: the starting page
    @rtype: scrapy.http.request.Request or None
    @return: the RSS feed Request, None if there is no usable RSS feed
    """
    rssLinks = extractRssLinks(parseHTML(response.body), response.url)

    def nextRequest(_):
      # The argument is ignored: None on the first call, whatever the
      # framework hands to an errback on later calls.
      try:
        url = next(rssLinks)
      except StopIteration:
        # Handled here rather than around the first call only, so that an
        # exhausted iterator reached through the errback path is reported
        # the same way instead of leaking StopIteration to the framework.
        self.logError("No usable RSS feed.")
        return None
      return Request(
        url=url,
        callback=self.parseRss,
        errback=nextRequest,
        dont_filter=True)

    return nextRequest(None)

  def parseRss(self, response):
    """Extract entry Requests from the RSS feed.

    Also (re)creates self.contentExtractor from the feed body.

    @type response: scrapy.http.response.html.HtmlResponse
    @param response: the RSS feed
    @rtype: iterator of scrapy.http.request.Request
    @return: the entry Requests
    """
    self.logInfo("Feed: {0}".format(response.url))
    self.contentExtractor = ContentExtractor(response.body, self.logInfo)
    return imap(
      lambda url: Request(
        url=url,
        callback=self.bufferEntries,
        errback=self.bufferEntries,
        dont_filter=True,
        # meta={ "u": _ } is here to keep a "safe" copy of the source url.
        # I don't trust response.url == (what was passed as Request url).
        meta={ "u": url }),
      self.contentExtractor.getRssLinks())

  def bufferEntries(self, response):
    """Buffer the RSS entry Responses; once they are all here, pass them to
    the overridable handleRssEntries method.

    This method is registered both as callback and as errback of the entry
    Requests, so it may receive objects that are not Responses. Those are
    counted towards completion but filtered out before processing.

    @type response: scrapy.http.response.html.HtmlResponse
    @param response: one RSS entry page (or a download error)
    @rtype: same as handleRssEntries, or None
    @return: the result of handleRssEntries once every entry has been
    received, None while entries are still pending
    """
    self.bufferedPosts.append(response)
    # NOTE(review): assumes getRssLinks() returns a sized collection
    # (it is passed to len()) -- confirm against ContentExtractor.
    if len(self.bufferedPosts) == len(self.contentExtractor.getRssLinks()):
      posts = tuple(ifilter(
        lambda entry: isinstance(entry, Response),
        self.bufferedPosts))
      # Feed each entry page together with the url it was requested with.
      foreach(
        lambda post: self.contentExtractor.feed(post.body, post.meta["u"]),
        posts)
      return self.handleRssEntries(posts)

  def logDebug(self, string):
    """Log with spider name at debug level.

    @type  string: string
    @param string: the string to log
    """
    self.log(string, log.DEBUG)

  def logInfo(self, string):
    """Log with spider name at info level.

    @type  string: string
    @param string: the string to log
    """
    self.log(string, log.INFO)

  def logWarning(self, string):
    """Log with spider name at warning level.

    @type  string: string
    @param string: the string to log
    """
    self.log(string, log.WARNING)

  def logError(self, string):
    """Log with spider name at error level.

    @type  string: string
    @param string: the string to log
    """
    self.log(string, log.ERROR)

  def logCritical(self, string):
    """Log with spider name at critical level.

    @type  string: string
    @param string: the string to log
    """
    self.log(string, log.CRITICAL)

  def handleRssEntries(self, posts):
    """Overridable method to process all RSS entry Responses.

    The default implementation does nothing and returns None.

    @type posts: tuple of scrapy.http.response.Response
    @param posts: the RSS entries Responses
    @rtype: generator of scrapy.http.request.Request and scrapy.item.Item
    @return: the next requests and items to process
    """
    pass