def __init__(self, start_link):
     self.start_link = start_link
     self.page_getter = PageGetter()
     self.links = []
     self.product_tabs = [
         'all', 'characteristics', 'review', 'photo', 'comments'
     ]
     self.curr_path_manager = None
Beispiel #2
0
class AWSpider:

    """
    Amazon S3, Amazon EC2 based web spider.
    """
    
    start_deferred = None
    
    paused = False
    plugins = []
    start_time = time.time()
    
    public_ip = None
    local_ip = None
    network_information = {}
    
    uuid = uuid4().hex
    reserved_arguments = ["reservation_function_name", "reservation_created", "reservation_next_request", "reservation_error"]
    exposed_function_resources = {}
    functions = {}
    exposed_functions = []
    peers = {}
    peer_uuids = []
    
    queryloop = None
    coordinateloop = None
    
    site = None
    web_admin_site = None
    site_port = None
    web_admin_site_port = None
    
    shutdown_trigger_id = None
    
    logging_handler = None
    job_limit = 2500
    
    uuid_limits = {'start':None, 'end':None}
    active_jobs = {}
    
    def __init__( self, 
                aws_access_key_id, 
                aws_secret_access_key, 
                aws_s3_cache_bucket, 
                aws_sdb_reservation_domain, 
                aws_s3_storage_bucket=None,
                aws_sdb_coordination_domain=None, 
                port=8080, 
                max_simultaneous_requests=50,
                max_requests_per_host_per_second=1,
                max_simultaneous_requests_per_host=5,
                log_directory=None, 
                log_level="debug",
                web_admin=True,
                web_admin_port=None,
                time_offset=None,
                peer_check_interval=30,
                reservation_check_interval=30,
                hammer_prevention=False):
        
        self.start_deferred = Deferred()
        
        self.hammer_prevention = hammer_prevention
        
        self.time_offset = time_offset
        self.peer_check_interval = int(peer_check_interval)
        self.reservation_check_interval = int(reservation_check_interval)
        
        self.port = int(port)
        self.network_information["port"] = self.port
        
        self.rq = RequestQueuer( 
            max_simultaneous_requests=int(max_simultaneous_requests), 
            max_requests_per_host_per_second=int(max_requests_per_host_per_second), 
            max_simultaneous_requests_per_host=int(max_simultaneous_requests_per_host)
        )
        
        self.rq.setHostMaxRequestsPerSecond("127.0.0.1", 0)
        self.rq.setHostMaxSimultaneousRequests("127.0.0.1", 0)
        
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.aws_s3_cache_bucket = aws_s3_cache_bucket
        self.aws_s3_storage_bucket = aws_s3_storage_bucket
        self.aws_sdb_reservation_domain = aws_sdb_reservation_domain
        self.aws_sdb_coordination_domain = aws_sdb_coordination_domain
        
        self.resource = Resource()
        
        if web_admin_port is None:
            web_admin_port = self.port
        else:
            web_admin_port = int(web_admin_port)

        if web_admin:
            self.resource.putChild('', TemplateResource("index" ) )
            self.resource.putChild('static', static.File( os.path.join(os.path.dirname(__file__), 'web_static') ) )
            self.resource.putChild('control', ControlResource( self ) )
          
        self.resource.putChild('data', DataResource( self ) )
        self.resource.putChild('peer', PeerResource( self ) )
        self.function_resource = Resource()
        self.resource.putChild( "function", self.function_resource )
        self.site = server.Site( self.resource )
        self.site_port = reactor.listenTCP(self.port, self.site)

        if web_admin and web_admin_port != self.port:
            self.web_admin_site = server.Site( self.resource )
            self.web_admin_site_port = reactor.listenTCP( web_admin_port, self.web_admin_site )

        self.s3 = AmazonS3( self.aws_access_key_id, self.aws_secret_access_key, self.rq )
        self.sdb = AmazonSDB( self.aws_access_key_id, self.aws_secret_access_key, self.rq )
        
        self.pg = PageGetter( self.s3, self.aws_s3_cache_bucket, rq=self.rq )

        if log_directory is None:
            self.logging_handler = logging.StreamHandler()
        else:
            self.logging_handler = logging.handlers.TimedRotatingFileHandler(os.path.join(log_directory, 'spider.log'), when='D', interval=1)
            
        formatter = logging.Formatter("%(levelname)s: %(message)s %(pathname)s:%(lineno)d")
        self.logging_handler.setFormatter(formatter)
        logger.addHandler(self.logging_handler)
        
        log_level = log_level.lower()
        log_levels = {
            "debug":logging.DEBUG, 
            "info":logging.INFO, 
            "warning":logging.WARNING, 
            "error":logging.ERROR,
            "critical":logging.CRITICAL
        }
        if log_level in log_levels:
            logger.setLevel(log_levels[log_level])
        else:
            logger.setLevel(logging.DEBUG)
    
        logger.info("Successfully loaded the Spider's configuration.")
    
    def setHostMaxRequestsPerSecond(self, host, requests_per_second):
        return self.rq.setHostMaxRequestsPerSecond(host, requests_per_second)
    
    def setHostMaxSimultaneousRequests(self, host, simultaneous_requests):
        return self.rq.setHostMaxSimultaneousRequests(host, simultaneous_requests)

    def clearStorage( self ):
        if self.aws_s3_storage_bucket is not None:
            return self.s3.emptyBucket( self.aws_s3_storage_bucket )
        raise Exception("aws_s3_storage_bucket is not defined.")
        
    def showReservation( self, uuid ):
        d = self.sdb.getAttributes( self.aws_sdb_reservation_domain, uuid )
        return d

    def getServerData( self ):    
        
        running_time = time.time() - self.start_time
        cost = (self.sdb.box_usage * .14) * (60*60*24*30.4) / (running_time)
        active_requests_by_host = self.rq.getActiveRequestsByHost()
        pending_requests_by_host = self.rq.getPendingRequestsByHost()
        
        data = {
            "paused":self.paused,
            "running_time":running_time,
            "cost":cost,
            "active_requests_by_host":active_requests_by_host,
            "pending_requests_by_host":pending_requests_by_host,
            "active_requests":self.rq.getActive(),
            "pending_requests":self.rq.getPending(),
            "current_timestamp":sdb_now(offset=self.time_offset)
        }
        
        logger.debug("Got server data:\n%s" % PrettyPrinter.pformat(data) )

        return data

    def getPage( self, *args, **kwargs ):
        if not self.hammer_prevention or len( self.peer_uuids ) == 0:
            return self.pg.getPage( *args, **kwargs )
        else:
            scheme, host, port, path = _parse( args[0] )
            peer_uuid = self.peer_uuids[ int( uuid5(NAMESPACE_DNS, host).int % len( self.peer_uuids ) ) ]
            if peer_uuid == self.uuid or self.peers[ peer_uuid ]["active"] == False:
                return self.pg.getPage( *args, **kwargs )
            else:
                
                parameters = {}
                parameters["url"] = args[0]
                if "method" in kwargs:
                    parameters["method"] = kwargs["method"]   
                if "postdata" in kwargs: 
                    parameters["postdata"] = urllib.urlencode( kwargs["postdata"] )
                if "headers" in kwargs: 
                    parameters["headers"] = urllib.urlencode( kwargs["headers"] )
                if "cookies" in kwargs: 
                    parameters["cookies"] = urllib.urlencode( kwargs["cookies"] )         
                if "agent" in kwargs:
                    parameters["agent"] = kwargs["agent"]
                if "timeout" in kwargs:
                    parameters["timeout"] = kwargs["timeout"]
                if "followRedirect" in kwargs:
                    parameters["followRedirect"] = kwargs["followRedirect"]
                if "url_hash" in kwargs: 
                    parameters["url_hash"] = kwargs["url_hash"]
                if "cache" in kwargs: 
                    parameters["cache"] = kwargs["cache"]
                if "prioritize" in kwargs: 
                    parameters["prioritize"] = kwargs["prioritize"]
                
                url = "%s/peer/getpage?%s" % ( self.peers[ peer_uuid ]["uri"], urllib.urlencode(parameters) )
                logger.debug( "Re-routing request for %s to %s" % (args[0], url) )
                d = self.rq.getPage(url)
                d.addErrback( self._getPageErrback, args, kwargs ) 
                return d
                
    def _getPageErrback( self, error, args, kwargs ):
        logger.error( args[0] + ":" + str(error) )
        return self.pg.getPage( *args, **kwargs )

    def getExposedFunctionDetails( self ):
        functions = []
        for key in self.exposed_functions:
            function = [key, {}]
            function[1]["interval"] = self.functions[key]["interval"]
            function[1]["required_arguments"] = self.functions[key]["required_arguments"]
            function[1]["optional_arguments"] = self.functions[key]["optional_arguments"]
            functions.append( function )
        
        functions.sort(lambda x,y:cmp(x[0], y[0]))
        
        logger.info("Got exposed function details:\n%s" % PrettyPrinter.pformat(functions) )
        
        return functions

    
    def createReservation( self, function_name, **kwargs ):
        
        if not isinstance( function_name, str ):
            for key in self.functions:
                if self.functions[key]["function"] == function_name:
                    function_name = key
                    break            
        
        if function_name not in self.functions:
            raise Exception("Function %s does not exist." % function_name )
    
        function = self.functions[ function_name ]
    
        filtered_kwargs = {}
        for key in function["required_arguments"]:
            if key in kwargs:
                filtered_kwargs[key] = convertToUTF8( kwargs[key] )
            else:
                raise Exception("Required parameter '%s' not found. Required parameters are %s. Optional parameters are %s." % (key, function["required_arguments"], function["optional_arguments"] ))

        for key in function["optional_arguments"]:
            if key in kwargs:
                filtered_kwargs[key] = convertToUTF8( kwargs[key] )
        
        if function["interval"] > 0:
            reserved_arguments = {}
            reserved_arguments["reservation_function_name"] = function_name
            reserved_arguments["reservation_created"] = sdb_now(offset=self.time_offset)
            reserved_arguments["reservation_next_request"] = reserved_arguments["reservation_created"]
            reserved_arguments["reservation_error"] = "0"
            
            all_arguments = {}
            all_arguments.update( reserved_arguments )
            all_arguments.update( filtered_kwargs )
            
            uuid = uuid4().hex
            logger.debug( "Creating reservation on SimpleDB for %s, %s." % (function_name, uuid))

            a = self.sdb.putAttributes( self.aws_sdb_reservation_domain, uuid, all_arguments )
            a.addCallback( self._createReservationCallback, function_name, uuid )
            a.addErrback( self._createReservationErrback, function_name, uuid )
                        
            if "call_immediately" in kwargs and not evaluateBoolean( kwargs["call_immediately"] ):    
                d = DeferredList([a], consumeErrors=True)
            else:
                logger.debug( "Calling %s immediately with arguments:\n%s" % (function_name, PrettyPrinter.pformat(filtered_kwargs) ) )
                b = self.callExposedFunctionImmediately( function["function"], filtered_kwargs, function_name )
                d = DeferredList([a,b], consumeErrors=True)
            
            d.addCallback( self._createReservationCallback2, function_name, uuid )
            d.addErrback( self._createReservationErrback2, function_name, uuid )
            return d
            
        else:
            logger.debug( "Calling %s immediately with arguments:\n%s" % (function_name, PrettyPrinter.pformat(filtered_kwargs) ) )
            d = self.callExposedFunctionImmediately( function["function"], filtered_kwargs, function_name )
            return d


    def _callExposedFunctionWrapper( self, data, func, kwargs, function_name, uuid ):
        return self.callExposedFunction( func, kwargs, function_name, uuid )

    def _createReservationCallback( self, data, function_name, uuid ):
        logger.debug( "Created reservation on SimpleDB for %s, %s." % (function_name, uuid))
        return uuid
    
    def _createReservationErrback( self, error, function_name, uuid ):
        logger.error( "Unable to create reservation on SimpleDB for %s:%s, %s.\n" % (function_name, uuid, error) )
        return error
    
    def _createReservationCallback2( self, data, function_name, uuid ):
        
        for row in data:
            if row[0] == False:
                raise row[1]
                
        if len(data) == 1:
            return {data[0][1]:{}}
        else:
            return {data[0][1]:data[1][1]}
            
    def _createReservationErrback2( self, error, function_name, uuid ):
        logger.error( "Unable to create reservation for %s:%s, %s.\n" % (function_name, uuid, error) )
        return error
    
    def callExposedFunctionImmediately( self, func, kwargs, function_name ):
        d = maybeDeferred( func, **kwargs )
        d.addCallback( self._callExposedFunctionImmediatelyCallback, function_name )
        d.addErrback( self._callExposedFunctionImmediatelyErrback, function_name )
        return d
        
    def _callExposedFunctionImmediatelyCallback( self, data, function_name ):
        logger.debug("Function %s returned successfully." % (function_name) )
        return data
        
    def _callExposedFunctionImmediatelyErrback( self, error, function_name ):
        logger.error( "Error with immediate callback of %s.\n%s\n%s" % (function_name, error, error.value.__dict__) )

        return error
            
    def expose( self, *args, **kwargs ):
        return self.makeCallable( expose=True, *args, **kwargs )
        
    def makeCallable( self, func, interval=0, name=None, expose=False ):
        
        argspec = inspect.getargspec( func )
        arguments = argspec[0]
        if len(arguments) > 0 and arguments[0:1][0] == 'self':
            arguments.pop(0)
        
        kwarg_defaults = argspec[3]
        if kwarg_defaults is None:
            kwarg_defaults = []
        
        required_arguments = arguments[ 0:len(arguments) - len(kwarg_defaults) ]
        optional_arguments = arguments[ len(arguments) - len(kwarg_defaults):]

        if hasattr(func, "im_class") and name is None:
            function_name = "%s/%s" % ( func.im_class.__name__, func.__name__)
        elif name is not None:
            function_name = name
        else:
            function_name = func.__name__
        
        function_name = function_name.lower()
        
        for key in required_arguments:
            if key in self.reserved_arguments:
                message = "Required argument name '%s' used in function %s is reserved." % (key, function_name)
                logger.error( message )
                raise Exception( message )
                
        for key in optional_arguments:
            if key in self.reserved_arguments:
                message = "Optional argument name '%s' used in function %s is reserved." % (key, function_name)
                logger.error( message )
                raise Exception( message )
        
        if function_name in self.functions:
            raise Exception("A method or function with the name %s is already callable." % function_name)
        
        self.functions[ function_name ] = {
            "function":func,
            "interval":interval,
            "required_arguments":required_arguments,
            "optional_arguments":optional_arguments
        }
        
        logger.info( "Function %s is now callable by the Spider." % function_name )
        
        if expose:
            
            self.exposed_functions.append( function_name )
            
            er = ExposedResource( self, function_name )
        
            function_name_parts = function_name.split("/")
            if len( function_name_parts ) > 1:
                if function_name_parts[0] in self.exposed_function_resources:
                    r = self.exposed_function_resources[ function_name_parts[0] ]
                else:
                    r = Resource()
                    self.exposed_function_resources[ function_name_parts[0] ] = r
                self.function_resource.putChild(function_name_parts[0], r )
                r.putChild( function_name_parts[1], er)
            else:
                self.function_resource.putChild(function_name_parts[0], er )
            
            logger.info( "Function %s is now available via the Spider's web interface." % function_name )
            
        return function_name
        
    def start( self ):
        reactor.callWhenRunning( self._start )
        return self.start_deferred
        
    def _start( self ):
        logger.critical( "Checking S3 and SimpleDB setup, getting time offset for sync." )
        
        deferreds = []
        
        # Make sure the specified S3 cache bucket works.
        
        deferreds.append( self.s3.checkAndCreateBucket(self.aws_s3_cache_bucket))

        if self.aws_s3_storage_bucket is not None:
            deferreds.append( self.s3.checkAndCreateBucket(self.aws_s3_storage_bucket))

        deferreds.append( self.sdb.checkAndCreateDomain(self.aws_sdb_reservation_domain))    

        if self.aws_sdb_coordination_domain is not None:
            deferreds.append( self.sdb.checkAndCreateDomain(self.aws_sdb_coordination_domain))
            deferreds.append( self.getNetworkAddress() )
        
        if self.time_offset is None:
            deferreds.append( self.getTimeOffset() )
        
        d = DeferredList(deferreds, consumeErrors=True )
        d.addCallback( self._startCallback )

    def _startCallback( self, data ):   
        
        for row in data:
            if row[0] == False:
                d = self.shutdown()
                d.addCallback( self._startHandleError, row[1] )
                return d
        
        self.shutdown_trigger_id = reactor.addSystemEventTrigger('before', 'shutdown', self.shutdown )  
        
        logger.critical( "Starting Spider UUID: %s" % self.uuid)
        self.queryloop = task.LoopingCall( self.query )
        self.queryloop.start( self.reservation_check_interval )
        if self.aws_sdb_coordination_domain is not None:
            self.coordinateloop = task.LoopingCall( self.coordinate )
            self.coordinateloop.start( self.peer_check_interval )  
            d = self.peerCheckRequest()
            if isinstance( d, Deferred ):
                d.addCallback( self._startCallback2 )
                return d
               
        self._startCallback2( None )
        
    def _startCallback2( self, data ):
        self.start_deferred.callback( True )
        
    def _startErrback( self, error ):
        return self._startHandleError( None, error )
    
    def _startHandleError( self, data, error ):
        self.start_deferred.errback( error )
   
    def getNetworkAddress( self ):
        d = getNetworkAddress()
        d.addCallback( self._getNetworkAddressCallback )
        d.addErrback( self._getNetworkAddressErrback )
        return d
                
    def _getNetworkAddressCallback( self, data  ):   
        if "public_ip" in data:
            self.public_ip = data["public_ip"]
            self.network_information["public_ip"] = self.public_ip
            
        if "local_ip" in data:
            self.local_ip = data["local_ip"]  
            self.network_information["local_ip"] = self.local_ip
         
    def _getNetworkAddressErrback( self, error ):
        message = "Could not get network address." 
        logger.critical( message )
        raise Exception( message )
            
    def getTimeOffset( self ):
        d = getTimeOffset()
        d.addCallback( self._getTimeOffsetCallback )
        d.addErrback( self._getTimeOffsetErrback )
        return d
        
    def _getTimeOffsetCallback( self, time_offset ):
        self.time_offset = time_offset
        logger.info( "Got time offset for sync: %s" % self.time_offset )
    
    def _getTimeOffsetErrback( self, error ):
        if self.time_offset is None:
            message = "Could not get time offset for sync."
            logger.critical( message )
            raise Exception( message )
    
    def pause(self):
        logger.critical( "Pausing Spider." )
        self.paused = True
        self.queryloop.stop() 
        self.coordinateloop.stop()
        
    def resume(self):
        logger.critical( "Resuming Spider." )
        self.paused = False
        self.queryloop.start( self.reservation_check_interval ) 
        if self.aws_sdb_coordination_domain is not None:
            self.coordinateloop.start( self.peer_check_interval)
    
    def setReservationError( self, uuid, function_name="Unknown" ):
        logger.error( "Setting error on reservation %s, %s." % (function_name, uuid) )
        d = self.sdb.putAttributes( self.aws_sdb_reservation_domain, uuid, {"reservation_error":"1"}, replace=["reservation_error"] )
        d.addCallback( self._setReservationErrorCallback, uuid )
        d.addErrback( self._setReservationErrorErrback, uuid )
        return d
        
    def _setReservationErrorCallback( self, data, function_name, uuid ):
        logger.error( "Error set on reservation %s, %s." % (function_name, uuid ) )
        
    def _setReservationErrorErrback( self, error, function_name, uuid ):
        logger.error( "Unable to set error on reservation %s, %s.\n%s" % (function_name, uuid, error) )
    
    def deleteFunctionReservations( self, function_name ):
        logger.info( "Deleting reservations for function %s." % (function_name) )
        d = self.sdb.select( "SELECT itemName() FROM `%s` WHERE reservation_function_name='%s'" % ( self.aws_sdb_reservation_domain, function_name) )
        d.addCallback( self._deleteFunctionReservationsCallback, function_name )
        d.addErrback( self._deleteFunctionReservationsErrback, function_name )
        return d
    
    def _deleteFunctionReservationsCallback( self, data, function_name ):
        logger.debug( "Found reservations for function %s: %s" % (function_name, data) )
        deferreds = []
        for uuid in data:
            deferreds.append( self.sdb.delete(self.aws_sdb_reservation_domain, uuid) )
        d = DeferredList(deferreds, consumeErrors = True )
        d.addCallback( self._deleteFunctionReservationsCallback2, function_name )
        return d
    
    def _deleteFunctionReservationsCallback2( self, data, function_name ):
        try:
            for row in data:
                if row[0] == False:
                    raise row[1] 
        except:
            return self._deleteFunctionReservationsErrback( Failure(exc_value=sys.exc_value, exc_type=sys.exc_type, exc_tb=sys.exc_traceback), function_name )
        
        logger.debug( "Deleted reservations for function %s." % (function_name) )
        
    def _deleteFunctionReservationsErrback( self, error, function_name ):
        logger.error( "Error deleting reservations for %s.\n%s" ) % (function_name, error)
        
    def deleteReservation( self, uuid, function_name="Unknown" ):
        logger.info("Deleting reservation %s, %s." % (function_name, uuid))
        deferreds = []
        deferreds.append(self.sdb.delete(self.aws_sdb_reservation_domain, uuid))
        deferreds.append(self.s3.deleteObject(self.aws_s3_storage_bucket, uuid))
        d = DeferredList(deferreds)
        d.addCallback(self._deleteReservationCallback, function_name, uuid)
        d.addErrback(self._deleteReservationErrback, function_name, uuid)
        return d
        
    def _deleteReservationCallback( self, data, function_name, uuid ):
        logger.info( "Reservation %s, %s successfully deleted." % (function_name, uuid) )
        return True
        
    def _deleteReservationErrback( self, error, function_name, uuid ):
        logger.error( "Error deleting reservation %s, %s.\n%s" ) % (function_name, uuid, error)
        return False
    
    def queryByUUID(self, uuid):
        sql = "SELECT * FROM `%s` WHERE itemName() = '%s'" % (self.aws_sdb_reservation_domain, uuid)
        #logger.debug( "Querying SimpleDB, \"%s\"" % sql )
        d = self.sdb.select( sql )
        d.addCallback( self._queryCallback2 )
        d.addErrback( self._queryErrback )        
        return d
        
    def query( self, data=None ):
        if self.uuid_limits["start"] is None and self.uuid_limits["end"] is not None:
            uuid_limit_clause = "AND itemName() < '%s'" % self.uuid_limits["end"]
        elif self.uuid_limits["start"] is not None and self.uuid_limits["end"] is None:
            uuid_limit_clause = "AND itemName() > '%s'" % self.uuid_limits["start"]
        elif self.uuid_limits["start"] is None and self.uuid_limits["end"] is None:
            uuid_limit_clause = ""
        else:
            uuid_limit_clause = "AND itemName() BETWEEN '%s' AND '%s'" % (self.uuid_limits["start"], self.uuid_limits["end"])
        sql = """SELECT itemName() 
                FROM `%s` 
                WHERE
                reservation_next_request < '%s'
                %s 
                LIMIT %s
                """ % (self.aws_sdb_reservation_domain, sdb_now(offset=self.time_offset), uuid_limit_clause, self.job_limit)
        #INTERSECTION reservation_error != '1'
        logger.debug( "Querying SimpleDB, \"%s\"" % sql )
        
        d = self.sdb.select( sql )
        d.addCallback( self._queryCallback )
        d.addErrback( self._queryErrback )
    
    def _queryErrback(self, error):
        logger.error( "Unable to query SimpleDB.\n%s" % error )
        
    def _queryCallback(self, data):
        deferreds = []
        keys = data.keys()
        if len(keys) > 0:
            i = 0
            while i * 20 < len(keys):
                key_subset = keys[i*20:(i+1)*20]
                i += 1
                sql = """SELECT *
                        FROM `%s` 
                        WHERE
                        itemName() in('%s')
                        """ % (self.aws_sdb_reservation_domain, "','".join(key_subset))
                d = self.sdb.select(sql)
                d.addCallback(self._queryCallback2)
                d.addErrback(self._queryErrback)
            
    def _queryCallback2(self, data):
        for uuid in data:
            
            if uuid in self.active_jobs:
                logger.error("Skipping %s" % uuid)
                continue
            
            kwargs_raw = {}
            reserved_arguments = {}
            
            for key in data[ uuid ]:
                if key in self.reserved_arguments:
                    reserved_arguments[ key ] = data[ uuid ][key][0]
                else:
                    kwargs_raw[key] = data[ uuid ][key][0]
            
            if "reservation_function_name" not in reserved_arguments:
                logger.error( "Reservation %s does not have a function name." % uuid )
                self.deleteReservation( uuid )
                continue

            function_name = reserved_arguments["reservation_function_name"]

            if function_name not in self.functions:
                logger.error( "Unable to process function %s for UUID: %s" % (function_name, uuid) )
                # self.deleteReservation( uuid, function_name=function_name )
                continue

            if "reservation_created" not in reserved_arguments:
                logger.error( "Reservation %s, %s does not have a created time." % (function_name, uuid) )
                self.deleteReservation( uuid, function_name=function_name )
                continue            
            
            if "reservation_next_request" not in reserved_arguments:
                logger.error( "Reservation %s, %s does not have a next request time." % (function_name, uuid) )
                self.deleteReservation( uuid, function_name=function_name )
                continue                

            if "reservation_error" not in reserved_arguments:
                logger.error( "Reservation %s, %s does not have an error flag." % (function_name, uuid) )
                self.deleteReservation( uuid, function_name=function_name )
                continue
            
            if reserved_arguments["reservation_function_name"] in self.functions:
                exposed_function = self.functions[ function_name  ]
            else:
                logger.error( "Could not find resource %s." % function_name )
                continue
            
            d = self.sdb.putAttributes( self.aws_sdb_reservation_domain, uuid, {"reservation_next_request":sdb_now_add(exposed_function["interval"], offset=self.time_offset) }, replace=["reservation_next_request"] )
            d.addCallback( self._setNextRequestCallback, function_name, uuid)
            d.addErrback( self._setNextRequestErrback, function_name, uuid )
            
            kwargs = {}
            for key in kwargs_raw:
                if key in exposed_function["required_arguments"]:
                    kwargs[ key ] = kwargs_raw[ key ]
                if key in exposed_function["optional_arguments"]:
                    kwargs[ key ] = kwargs_raw[ key ]
            
            has_reqiured_arguments = True
            for key in exposed_function["required_arguments"]:
                if key not in kwargs:
                    has_reqiured_arguments = False
                    logger.error( "%s, %s does not have required argument %s." % (function_name, uuid, key) ) 
                    
            if not has_reqiured_arguments:
                self.setReservationError( uuid )
                continue
            
            self.active_jobs[uuid] = True
            reactor.callInThread(self.callExposedFunction, exposed_function["function"], kwargs, function_name, uuid)
            #self.callExposedFunction(exposed_function["function"], kwargs, function_name, uuid)
            
        #self.callExposedFunction(exposed_function["function"], kwargs, function_name, uuid)
            
    def callExposedFunction( self, func, kwargs, function_name, uuid ):
        
        logger.debug( "Calling %s, %s." % (function_name, uuid) )
        
        try:
            d = func( **kwargs )
        except:         
            return self._exposedFunctionErrback( Failure(exc_value=sys.exc_value, exc_type=sys.exc_type, exc_tb=sys.exc_traceback), function_name, uuid )
        
        if isinstance(d, Deferred ):
            d.addCallback( self._exposedFunctionCallback, function_name, uuid )
            d.addErrback( self._exposedFunctionErrback, function_name, uuid )
            return d
        else:
            return self._exposedFunctionCallback( d, function_name, uuid )
            
    def _setNextRequestCallback(self, data, function_name, uuid ):
        logger.debug( "Set next request for %s, %s on on SimpleDB." % (function_name, uuid) )
        
    def _setNextRequestErrback(self, error, function_name, uuid ):
        logger.error( "Unable to set next request for %s, %s on SimpleDB.\n%s" % (function_name, uuid, error.value) )
    
    def get( self, uuid ):
        logger.debug( "Getting %s from S3." % uuid )
        d = self.s3.getObject( self.aws_s3_storage_bucket, uuid )
        d.addCallback( self._getCallback, uuid )
        d.addErrback( self._getErrback, uuid )
        return d
        
    def _getCallback( self, data, uuid ):
        logger.debug( "Got %s from S3." % (uuid) ) 
        return cPickle.loads( data["response"] )
    
    def _getErrback( self, error, uuid ):
        logger.error( "Could not get %s from S3.\n%s" % (uuid, error) ) 
        return error
    
    def _exposedFunctionCallback( self, data, function_name, uuid ):
        
        if self.aws_s3_storage_bucket is not None:
            if data is None:
                logger.debug( "Recieved None for %s, %s." % (function_name, uuid) )
                del self.active_jobs[uuid]
                return None
            else:    
                logger.debug( "Putting result for %s, %s on S3." % (function_name, uuid) )
                pickled_data = cPickle.dumps( data )
                d = self.s3.putObject( self.aws_s3_storage_bucket, uuid, pickled_data, content_type="text/plain", gzip=True )
                d.addErrback( self._exposedFunctionErrback2, function_name, uuid )
                d.addCallback( self._exposedFunctionCallback2, data, uuid )
                return d
    
    def _exposedFunctionCallback2( self, s3_callback_data, data, uuid ):
        del self.active_jobs[uuid]
        return data
    
    def _exposedFunctionErrback( self, error, function_name, uuid ):
        del self.active_jobs[uuid]
        try:
            error.raiseException()
        except DeleteReservationException, e:
            self.deleteReservation(uuid)
            message = """Error with %s, %s.\n%s            
            Reservation deleted at request of the function.""" % (
                function_name,
                uuid,
                error)
            logger.error(message)
            return
        except:
Beispiel #3
0
    def __init__( self, 
                aws_access_key_id, 
                aws_secret_access_key, 
                aws_s3_cache_bucket, 
                aws_sdb_reservation_domain, 
                aws_s3_storage_bucket=None,
                aws_sdb_coordination_domain=None, 
                port=8080, 
                max_simultaneous_requests=50,
                max_requests_per_host_per_second=1,
                max_simultaneous_requests_per_host=5,
                log_directory=None, 
                log_level="debug",
                web_admin=True,
                web_admin_port=None,
                time_offset=None,
                peer_check_interval=30,
                reservation_check_interval=30,
                hammer_prevention=False):
        
        self.start_deferred = Deferred()
        
        self.hammer_prevention = hammer_prevention
        
        self.time_offset = time_offset
        self.peer_check_interval = int(peer_check_interval)
        self.reservation_check_interval = int(reservation_check_interval)
        
        self.port = int(port)
        self.network_information["port"] = self.port
        
        self.rq = RequestQueuer( 
            max_simultaneous_requests=int(max_simultaneous_requests), 
            max_requests_per_host_per_second=int(max_requests_per_host_per_second), 
            max_simultaneous_requests_per_host=int(max_simultaneous_requests_per_host)
        )
        
        self.rq.setHostMaxRequestsPerSecond("127.0.0.1", 0)
        self.rq.setHostMaxSimultaneousRequests("127.0.0.1", 0)
        
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.aws_s3_cache_bucket = aws_s3_cache_bucket
        self.aws_s3_storage_bucket = aws_s3_storage_bucket
        self.aws_sdb_reservation_domain = aws_sdb_reservation_domain
        self.aws_sdb_coordination_domain = aws_sdb_coordination_domain
        
        self.resource = Resource()
        
        if web_admin_port is None:
            web_admin_port = self.port
        else:
            web_admin_port = int(web_admin_port)

        if web_admin:
            self.resource.putChild('', TemplateResource("index" ) )
            self.resource.putChild('static', static.File( os.path.join(os.path.dirname(__file__), 'web_static') ) )
            self.resource.putChild('control', ControlResource( self ) )
          
        self.resource.putChild('data', DataResource( self ) )
        self.resource.putChild('peer', PeerResource( self ) )
        self.function_resource = Resource()
        self.resource.putChild( "function", self.function_resource )
        self.site = server.Site( self.resource )
        self.site_port = reactor.listenTCP(self.port, self.site)

        if web_admin and web_admin_port != self.port:
            self.web_admin_site = server.Site( self.resource )
            self.web_admin_site_port = reactor.listenTCP( web_admin_port, self.web_admin_site )

        self.s3 = AmazonS3( self.aws_access_key_id, self.aws_secret_access_key, self.rq )
        self.sdb = AmazonSDB( self.aws_access_key_id, self.aws_secret_access_key, self.rq )
        
        self.pg = PageGetter( self.s3, self.aws_s3_cache_bucket, rq=self.rq )

        if log_directory is None:
            self.logging_handler = logging.StreamHandler()
        else:
            self.logging_handler = logging.handlers.TimedRotatingFileHandler(os.path.join(log_directory, 'spider.log'), when='D', interval=1)
            
        formatter = logging.Formatter("%(levelname)s: %(message)s %(pathname)s:%(lineno)d")
        self.logging_handler.setFormatter(formatter)
        logger.addHandler(self.logging_handler)
        
        log_level = log_level.lower()
        log_levels = {
            "debug":logging.DEBUG, 
            "info":logging.INFO, 
            "warning":logging.WARNING, 
            "error":logging.ERROR,
            "critical":logging.CRITICAL
        }
        if log_level in log_levels:
            logger.setLevel(log_levels[log_level])
        else:
            logger.setLevel(logging.DEBUG)
    
        logger.info("Successfully loaded the Spider's configuration.")
class Scraper:
    def __init__(self, start_link):
        self.start_link = start_link
        self.page_getter = PageGetter()
        self.links = []
        self.product_tabs = [
            'all', 'characteristics', 'review', 'photo', 'comments'
        ]
        self.curr_path_manager = None

    def crawl(self):
        self._crawl_by_categories([self.start_link])
        # self.curr_path_manager = PathManager(self.start_link, category=True)
        # self._caching_page()
        # links = Parser(self._read_cached_page()).scrap_top_category_links()
        # for link in links:
        #     self.scrap_product(link)

    def _crawl_by_categories(self, links, parent_dir=''):
        for link in links:
            self.curr_path_manager = PathManager(link, parent_dir=parent_dir)
            self._caching_page()
            new_links = Parser(self._read_cached_page()).parse_page()
            self._crawl_by_categories(new_links['categories'],
                                      self.curr_path_manager.category_dir)
            for product in new_links['products']:
                self.scrap_product(product,
                                   self.curr_path_manager.category_dir)

    def scrap_product(self, link, category_dir):
        for tab in self.product_tabs:
            tab_link = '{link}{tab}/'.format(link=link, tab=tab)
            self.curr_path_manager = PathManager(tab_link,
                                                 parent_dir=category_dir,
                                                 product_tab=tab)
            self._caching_page()
            result = Parser(self._read_cached_page()).scrap_product()
            self._save_res_file(result['all'])
            self._save_csv_file(result['characteristics'])

    def _caching_page(self):
        if not self._is_page_cached():
            page_source = self.page_getter.get_remote_page(
                self.curr_path_manager.get_link())
            self._save_page(page_source)

    def _is_page_cached(self):
        cached_page_path = self.curr_path_manager.get_cached_path()
        return os.path.isfile(cached_page_path) and os.stat(
            cached_page_path).st_size

    def _read_cached_page(self):
        return open(self.curr_path_manager.get_cached_path(), 'r').read()

    def _save_page(self, page_source):
        with open(self.curr_path_manager.get_cached_path(), 'w') as page:
            page.write(page_source)

    def _save_csv_file(self, result):
        self._save_res_file('\n'.join([','.join(row) for row in result]),
                            'csv')

    def _save_res_file(self, res, type_='html'):
        if res:
            with open(self.curr_path_manager.get_result_path(type_),
                      'w') as result_file:
                result_file.write(res)

    def scrap_characteristics_tab(self):
        pass