Example #1
    def __init__(self, sessions=None, settings=None):
        """
            Initializes the crawler

            params:

                sessions: Database or Documents persistent sessions

                settings: the settings object; its SHOW_DEBUG_INFO attribute indicates if the crawler logs debug info to stdout
        """

        if sessions is None:
            sessions = []

        self.sessions = sessions
        self.debug = getattr(settings, 'SHOW_DEBUG_INFO', True)
        self.settings = settings

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()

        pool_type = getattr(settings, 'POOL', 'greenlets')
        pool = Pools[pool_type]

        if self.max_concurrency_level is None:
            self.max_concurrency_level = pool['max_concurrency']

        self.pool = pool['pool'](self.max_concurrency_level)
        self.request_manager = RequestManager(settings=settings, headers=self.headers, delay=self.requests_delay, deviation=self.requests_deviation)

        self._initialize_scrapers()
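
The constructor above only reads its configuration through getattr, so any object or module exposing the expected attributes can serve as settings. A minimal sketch of such a settings module, limited to the two attribute names this snippet actually reads (SHOW_DEBUG_INFO and POOL); anything else is project-specific:

# settings.py -- illustrative only; defines just the attributes __init__
# reads via getattr(). A real project would add its own values here.

SHOW_DEBUG_INFO = True   # log debug info to stdout
POOL = 'greenlets'       # key looked up in the Pools mapping (the default shown above)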
Example #2
    def __init__(self, sessions=None, debug=False):
        """
            Initializes the crawler

            params:

                sessions: Database or Documents persistent sessions

                debug: indicates if the crawler logs debug info to stdout
        """        

        if sessions is None:
            sessions = []

        self.sessions = sessions
        self.debug = debug

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()        

        self.pool = GreenPool()
        self.request_manager = RequestManager()
        
        self._initialize_scrapers()
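
This variant takes an explicit debug flag instead of a settings object. The sessions list accepts any object with a commit() method, since the later examples show _commit() simply calling commit() on each session. A minimal usage sketch, where MyCrawler stands in for a hypothetical subclass of this crawler class:

class DummySession(object):
    # Stand-in for a database or document session; only commit() is required.
    def commit(self):
        print "committing scraped data"

crawler = MyCrawler(sessions=[DummySession()], debug=True)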
Example #3
    def __init__(self, sessions=None, settings=None):
        """
            Initializes the crawler

            params:

                sessions: Database or Documents persistent sessions

                settings: the settings object; its SHOW_DEBUG_INFO attribute indicates if the crawler logs debug info to stdout
        """

        if sessions is None:
            sessions = []

        self.sessions = sessions
        self.debug = getattr(settings, 'SHOW_DEBUG_INFO', True)
        self.settings = settings

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()

        pool_type = getattr(settings, 'POOL', 'greenlets')
        pool = Pools[pool_type]

        if self.max_concurrency_level is None:
            self.max_concurrency_level = pool['max_concurrency']

        self.pool = pool['pool'](self.max_concurrency_level)
        self.request_manager = RequestManager(
            settings=settings,
            headers=self.headers,
            delay=self.requests_delay,
            deviation=self.requests_deviation)

        self._initialize_scrapers()
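
requests_delay and requests_deviation are simply forwarded to RequestManager, and their exact use is not shown in these examples. A common interpretation, offered here only as an assumption rather than RequestManager's documented behaviour, is that each request waits roughly the average delay, jittered by plus or minus the deviation:

import random
import time

def polite_sleep(delay, deviation):
    # Assumed behaviour, not RequestManager's actual implementation:
    # sleep around `delay` seconds, randomly shifted by up to `deviation`.
    time.sleep(max(0, random.uniform(delay - deviation, delay + deviation)))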
Example #4
class BaseCrawler(object):
    """
        User's crawlers must inherit from this class; they may
        override some methods and define the start_urls list,
        the scrapers and the max crawling depth.
    """

    __metaclass__ = CrawlerMeta

    start_urls = []
    """ A list containing the start urls for the crawler """

    allowed_urls = []
    """ A list of urls allowed for crawling """

    black_list = []
    """ A list of blocked urls which will never be crawled """

    scrapers = []
    """ A list of scraper classes """

    max_depth = -1
    """ The maximum crawling recursion level """

    max_concurrency_level = None
    """ The maximum coroutine concurrency level """

    headers = {}
    """ The default request headers """

    requests_delay = config.REQUEST_DELAY
    """ The average delay time between requests """

    requests_deviation = config.REQUEST_DEVIATION
    """ The requests deviation time """

    extractor = None
    """ The extractor class. Default is XPathExtractor """

    post_urls = []
    """
        The POST data for the urls. A list of tuples containing (url, data_dict)
        Example: ("http://www.mypage.com/post_url", {'page' : '1', 'color' : 'blue'})
    """

    login = None
    """
        The login data. A tuple of (url, login_dict).
        Example: ("http://www.mypage.com/login", {'user' : 'myuser', 'pass' : 'mypassword'})
    """

    search_all_urls = True
    """
        If the user doesn't define the get_urls method in the scrapers, the crawler will search for urls
        in the current page itself, depending on the [search_all_urls] attribute.
    """

    search_hidden_urls = False
    """
        Search for hidden urls in the whole html
    """

    def __init__(self, sessions=None, settings=None):
        """
            Initializes the crawler

            params:

                sessions: Database or Documents persistent sessions

                settings: the settings object; its SHOW_DEBUG_INFO attribute indicates if the crawler logs debug info to stdout
        """

        if sessions is None:
            sessions = []

        self.sessions = sessions
        self.debug = getattr(settings, 'SHOW_DEBUG_INFO', True)
        self.settings = settings

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()

        pool_type = getattr(settings, 'POOL', 'greenlets')
        pool = Pools[pool_type]

        if self.max_concurrency_level is None:
            self.max_concurrency_level = pool['max_concurrency']

        self.pool = pool['pool'](self.max_concurrency_level)
        self.request_manager = RequestManager(settings=settings, headers=self.headers, delay=self.requests_delay, deviation=self.requests_deviation)

        self._initialize_scrapers()

    def _initialize_scrapers(self):
        """
            Instantiates all the scraper classes
        """

        self.scrapers = [scraper_class(settings=self.settings) for scraper_class in self.scrapers]

    def _make_request(self, url, data=None):
        """
            Returns the response object from a request

            params:
                data: if present, the request is made as a POST.
        """
        return self.request_manager.make_request(url, data, self.extractor)

    def _get_response(self, url, data=None):
        """
            Returns the response data from a request

            params:
                data: if present, the request is made as a POST.
        """

        for pattern, post_data in self.post_urls:
            if url_matcher(url, pattern):
                data = post_data

        return self._make_request(url, data)

    def request(self, url, data=None):

        return self._get_response(url, data=data)

    def _manage_scrapers(self, response):
        """
            Checks if some scraper is suited for data extraction on the current url.
            If so, gets the extractor object and delegates the scraping task
            to the scraper object
        """
        scraped_urls = []

        for scraper in self.scrapers:

            urls = scraper.try_scrape(response)

            if urls is not None:

                self._commit()
                scraped_urls.extend(urls)

        return scraped_urls

    def _commit(self):
        """
            Commits all the sessions
        """

        for session in self.sessions:
            session.commit()

    def _search_in_urls_list(self, urls_list, url, default=True):
        """
            Searches for the url in a list of url patterns
        """

        if not urls_list:
            return default

        for pattern in urls_list:
            if url_matcher(url, pattern):
                return True

        return False

    def _validate_url(self, url):
        """
            Validates if the url is in the crawler's [allowed_urls] list and not in [black_list].
        """

        return self._search_in_urls_list(self.allowed_urls, url) and not self._search_in_urls_list(self.black_list, url, default=False)

    def _fetch(self, url, depth_level=0):
        """
            Recursive url fetching.

            Params:
                depth_level: The current recursion depth
                url: The url to crawl
        """

        if not self._validate_url(url):
            return

        if self.debug:
            print "-" * 80
            print "crawling -> %s" % url

        try:
            response = self._get_response(url)
        except Exception, ex:
            self.on_request_error(url, ex)
            return

        if self.debug:
            print "-" * 80

        urls = self._manage_scrapers(response)

        if not urls:

            if self.search_all_urls:
                urls = self.get_urls(response)
            else:
                return

        for new_url in urls:

            if depth_level >= self.max_depth and self.max_depth != -1:
                return

            self.pool.spawn_n(self._fetch, new_url, depth_level + 1)
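
Since BaseCrawler is designed to be subclassed, a user crawler typically just fills in the class attributes documented above. A minimal sketch; MyScraper, the urls and the wildcard pattern syntax are assumptions for illustration (the actual matching rules depend on url_matcher, which is not shown here):

class MyCrawler(BaseCrawler):

    start_urls = ["http://www.mypage.com/"]
    allowed_urls = ["http://www.mypage.com/*"]      # pattern syntax depends on url_matcher
    black_list = ["http://www.mypage.com/logout*"]

    scrapers = [MyScraper]       # a scraper class defined elsewhere
    max_depth = 2
    max_concurrency_level = 10

    headers = {'User-Agent': 'my-crawler/0.1'}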
Example #5
class BaseCrawler(object):
    """
        User's crawlers must inherit from this class; they may
        override some methods and define the start_urls list,
        the scrapers and the max crawling depth.
    """

    __metaclass__ = CrawlerMeta

    start_urls = []
    """ A list containing the start urls for the crawler """

    allowed_urls = []
    """ A list of urls allowed for crawling """

    black_list = []
    """ A list of blocked urls which will never be crawled """

    scrapers = []
    """ A list of scraper classes """

    max_depth = -1
    """ The maximum crawling recursion level """

    max_concurrency_level = None
    """ The maximum coroutine concurrency level """

    headers = {}
    """ The default request headers """

    requests_delay = config.REQUEST_DELAY
    """ The average delay time between requests """

    requests_deviation = config.REQUEST_DEVIATION
    """ The requests deviation time """

    extractor = None
    """ The extractor class. Default is XPathExtractor """

    post_urls = []
    """
        The POST data for the urls. A list of tuples containing (url, data_dict)
        Example: ("http://www.mypage.com/post_url", {'page' : '1', 'color' : 'blue'})
    """

    login = None
    """
        The login data. A tuple of (url, login_dict).
        Example: ("http://www.mypage.com/login", {'user' : 'myuser', 'pass' : 'mypassword'})
    """

    search_all_urls = True
    """
        If the user doesn't define the get_urls method in the scrapers, the crawler will search for urls
        in the current page itself, depending on the [search_all_urls] attribute.
    """

    search_hidden_urls = False
    """
        Search for hidden urls in the whole html
    """
    def __init__(self, sessions=None, settings=None):
        """
            Initializes the crawler

            params:

                sessions: Database or Documents persistent sessions

                settings: the settings object; its SHOW_DEBUG_INFO attribute indicates if the crawler logs debug info to stdout
        """

        if sessions is None:
            sessions = []

        self.sessions = sessions
        self.debug = getattr(settings, 'SHOW_DEBUG_INFO', True)
        self.settings = settings

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()

        pool_type = getattr(settings, 'POOL', 'greenlets')
        pool = Pools[pool_type]

        if self.max_concurrency_level is None:
            self.max_concurrency_level = pool['max_concurrency']

        self.pool = pool['pool'](self.max_concurrency_level)
        self.request_manager = RequestManager(
            settings=settings,
            headers=self.headers,
            delay=self.requests_delay,
            deviation=self.requests_deviation)

        self._initialize_scrapers()

    def _initialize_scrapers(self):
        """
            Instantiates all the scraper classes
        """

        self.scrapers = [
            scraper_class(settings=self.settings)
            for scraper_class in self.scrapers
        ]

    def _make_request(self, url, data=None):
        """
            Returns the response object from a request

            params:
                data: if present, the request is made as a POST.
        """
        return self.request_manager.make_request(url, data, self.extractor)

    def _get_response(self, url, data=None):
        """
            Returns the response data from a request

            params:
                data: if present, the request is made as a POST.
        """

        for pattern, post_data in self.post_urls:
            if url_matcher(url, pattern):
                data = post_data

        return self._make_request(url, data)

    def request(self, url, data=None):

        return self._get_response(url, data=data)

    def _manage_scrapers(self, response):
        """
            Checks if some scraper is suited for data extraction on the current url.
            If so, gets the extractor object and delegates the scraping task
            to the scraper object
        """
        scraped_urls = []

        for scraper in self.scrapers:

            urls = scraper.try_scrape(response)

            if urls is not None:

                self._commit()
                scraped_urls.extend(urls)

        return scraped_urls

    def _commit(self):
        """
            Commits all the sessions
        """

        for session in self.sessions:
            session.commit()

    def _search_in_urls_list(self, urls_list, url, default=True):
        """
            Searches for the url in a list of url patterns
        """

        if not urls_list:
            return default

        for pattern in urls_list:
            if url_matcher(url, pattern):
                return True

        return False

    def _validate_url(self, url):
        """
            Validates if the url is in the crawler's [allowed_urls] list and not in [black_list].
        """

        return self._search_in_urls_list(
            self.allowed_urls, url) and not self._search_in_urls_list(
                self.black_list, url, default=False)

    def _fetch(self, url, depth_level=0):
        """
            Recursive url fetching.

            Params:
                depth_level: The current recursion depth
                url: The url to crawl
        """

        if not self._validate_url(url):
            return

        if self.debug:
            print "-" * 80
            print "crawling -> %s" % url

        try:
            response = self._get_response(url)
        except Exception, ex:
            self.on_request_error(url, ex)
            return

        if self.debug:
            print "-" * 80

        urls = self._manage_scrapers(response)

        if not urls:

            if self.search_all_urls:
                urls = self.get_urls(response)
            else:
                return

        for new_url in urls:

            if depth_level >= self.max_depth and self.max_depth != -1:
                return

            self.pool.spawn_n(self._fetch, new_url, depth_level + 1)
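
_fetch depends on two hooks that are not defined in the code above: on_request_error(url, ex), called when a request raises, and get_urls(response), used for link discovery when no scraper returned urls and search_all_urls is True. A hedged sketch of overriding them in a subclass; the response object's interface is not shown in these examples, so this sketch deliberately avoids assuming it:

class LoggingCrawler(BaseCrawler):

    def on_request_error(self, url, ex):
        # Called by _fetch when _get_response raises; ex is the exception instance.
        print "request to %s failed: %s" % (url, ex)

    def get_urls(self, response):
        # Fallback link discovery. Returning an empty list stops the recursion
        # for this branch; a real implementation would extract links from `response`.
        return []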
Example #6
class BaseCrawler(object):
    """
        User's crawlers must inherit from this class; they may
        override some methods and define the start_urls list,
        the scrapers and the max crawling depth.
    """

    __metaclass__ = CrawlerMeta

    start_urls = []
    """ A list containing the start urls for the crawler """

    allowed_urls = []
    """ A list of urls allowed for crawling """

    black_list = []
    """ A list of blocked urls which will never be crawled """

    scrapers = []
    """ A list of scraper classes """

    max_depth = -1
    """ The maximum crawling recursion level """

    extractor = None
    """ The extractor class. Default is XPathExtractor """

    post_urls = []
    """ 
        The POST data for the urls. A list of tuples containing (url, data_dict)
        Example: ("http://www.mypage.com/post_url", {'page' : '1', 'color' : 'blue'})
    """

    login = None
    """
        The login data. A tuple of (url, login_dict).
        Example: ("http://www.mypage.com/login", {'user' : 'myuser', 'pass' : 'mypassword'})
    """
    
    search_all_urls = True
    """
        If the user doesn't define the get_urls method in the scrapers, the crawler will search for urls
        in the current page itself, depending on the [search_all_urls] attribute.
    """

    _url_regex = re_compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')

    def __init__(self, sessions=None, debug=False):
        """
            Initializes the crawler

            params:

                sessions: Database or Documents persistent sessions

                debug: indicates if the crawler logs debug info to stdout
        """        

        if sessions is None:
            sessions = []

        self.sessions = sessions
        self.debug = debug

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()        

        self.pool = GreenPool()
        self.request_manager = RequestManager()
        
        self._initialize_scrapers()
        
    def _initialize_scrapers(self):
        """
            Instantiates all the scraper classes
        """
        
        self.scrapers = [scraper_class(debug=self.debug) for scraper_class in self.scrapers]            

    def _make_request(self, url, data=None):
        """
            Returns the response object from a request

            params:
                data: if present, the request is made as a POST.
        """
        return self.request_manager.make_request(url, data, self.extractor)

    def _get_response(self, url, data=None):
        """
            Returns the response data from a request

            params:
                data: if present, the request is made as a POST.
        """

        for pattern, post_data in self.post_urls:
            if url_matcher(url, pattern):
                data = post_data

        return self._make_request(url, data)

    def _manage_scrapers(self, response):
        """
            Checks if some scraper is suited for data extraction on the current url.
            If so, gets the extractor object and delegates the scraping task
            to the scraper object
        """
        scraped_urls = []

        for scraper in self.scrapers:

            urls = scraper.try_scrape(response)
                
            if urls is not None:
                
                self._commit()
                scraped_urls.extend(urls)

        return scraped_urls

    def _commit(self):
        """
            Commits all the sessions
        """

        for session in self.sessions:
            session.commit()
    
    def _search_in_urls_list(self, urls_list, url, default=True):
        """
            Searches for the url in a list of url patterns
        """
        
        if not urls_list:
            return default
        
        for pattern in urls_list:
            if url_matcher(url, pattern):
                return True
                
        return False
    
    def _validate_url(self, url):
        """
            Validates if the url is in the crawler's [allowed_urls] list and not in [black_list].
        """

        return self._search_in_urls_list(self.allowed_urls, url) and not self._search_in_urls_list(self.black_list, url, default=False)

    def _fetch(self, url, depth_level=0):
        """
            Recursive url fetching.

            Params:
                depth_level: The current recursion depth
                url: The url to crawl
        """

        if not self._validate_url(url):
            return

        if self.debug:
            print "-" * 80
            print "crawling -> %s" % url        
        
        try:
            response = self._get_response(url)
        except Exception, e:
            if self.debug:
                print "Request to %s returned error: %s" % (url, e)
            return
        
        if self.debug:
            print "-" * 80
                
        urls = self._manage_scrapers(response)
        
        if not urls:                        
            
            if self.search_all_urls:
                urls = self.get_urls(response)
            else:
                return 
        
        for new_url in urls:            

            if depth_level >= self.max_depth and self.max_depth != -1:
                return

            self.pool.spawn_n(self._fetch, new_url, depth_level + 1)
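
The _url_regex defined in this variant can be exercised on its own; assuming re_compile is just an alias for re.compile, a quick check looks like this. Note that Python's re module does not support POSIX classes, so the [:punct:] fragment is treated as a literal set of characters rather than as all punctuation:

import re

url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')

html = '<p>visit http://www.mypage.com/items?page=1 or www.example.org today</p>'
# findall returns one tuple per match; the first group is the full url
print [groups[0] for groups in url_regex.findall(html)]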