# Code example #1
0
class Monitor(object):
    """
    Monitor is the main class representing the web change monitor. It serves
    as a factory for creating MonitoredResource objects.

    Usage:
        >>> from rrslib.web.changemonitor import Monitor
        >>> monitor = Monitor(user_id="rrs_university")
        >>> resource = monitor.get("http://www.google.com")
        >>> # if the page changed
        >>> if resource.check():
        ...     print(resource.get_diff(start='last', end='now'))
    """
    def __init__(self, user_id, db_host="localhost", db_port=27017, db_name="webarchive", http_proxy=None):
        """
        Create a new monitor connected to MongoDB at *db_host:db_port* using
        database db_name.

        @param user_id: identification string of user/module who uses monitor.
                        If user_id is given None, the monitor switches to
                        `global-view` mode and all requests to storage don't
                        care about >>who checked this resource<<. On the other
                        hand, if user_id is given a string, the monitor switches
                        to `user-view` mode and all operations are oriented
                        to the user. Most of the reasonable use cases are
                        using user_id, because a user/module almost every time
                        asks about >>what changed since I have been here for the
                        last time<<, not >>what changed since somebody has been
                        here for the last time<<...
        @type user_id: str or None
        @param db_host: (optional) hostname or IP address of the instance
                        to connect to, or a mongodb URI, or a list of
                        hostnames / mongodb URIs. If `db_host` is an IPv6 literal
                        it must be enclosed in '[' and ']' characters following
                        the RFC2732 URL syntax (e.g. '[::1]' for localhost)
        @param db_port: (optional) port number on which to connect
        @type db_port: int
        @param db_name: name of database which is used to store information about
                        monitored documents and their versions.
        @type db_name: str
        @param http_proxy: (FUTURE USE) proxy server where to send requests
        @type http_proxy: unknown
        @raises TypeError: if user_id is neither a string nor None.
        @raises NotImplementedError: if http_proxy is given (not supported yet).
        """
        # `basestring` exists only on Python 2; fall back to `str` on Python 3
        # so that the validation below does not die with a NameError.
        try:
            string_types = basestring
        except NameError:
            string_types = str
        if user_id is not None and not isinstance(user_id, string_types):
            raise TypeError("User ID has to be type str or None.")
        # save user id
        self._user_id = user_id
        # for future use
        if http_proxy is not None:
            raise NotImplementedError("HTTP proxy not supported yet.")
        # initialize models
        self._init_models(db_host, db_port, db_name, user_id)


    def _init_models(self, host, port, db, uid):
        """
        Open the database connection and create the storage model.

        @param host: database host passed to Connection
        @param port: database port passed to Connection
        @param db: database name passed to Storage
        @param uid: user id passed to Storage (may be None)
        """
        self._conn = Connection(host, port)
        self._storage = Storage(self._conn, uid, db)
        # remember the connection parameters (the db name is used by __repr__)
        self._dbname = db
        self._dbport = port
        self._dbhost = host


    def get(self, url):
        """
        Creates new MonitoredResource instance which represents document on
        *url*.

        @param url: URL of monitored resource
        @type url: str
        @returns: monitored resource object bound to URL *url*.
        @rtype: MonitoredResource
        @raises ValueError: if *url* has no network location or no scheme.

        Design pattern: factory method.
        """
        # test the url validity
        parse_result = urlparse(url)
        if parse_result.netloc == '':
            raise ValueError("URL '%s' is not properly formatted: missing netloc." % url)
        if parse_result.scheme == '':
            raise ValueError("URL '%s' is not properly formatted: missing scheme." % url)
        # return monitored resource object
        return MonitoredResource(parse_result.geturl(), self._user_id, self._storage)


    def allow_large_documents(self):
        """
        Allow large objects to be stored in the storage. Large document is
        defined as file larger than 4096KB. This constant is defined in this
        module named as LARGE_DOCUMENT_SIZE representing size of the file
        in kilobytes.

        @raises RuntimeError: if the storage model has not been initialized.
        """
        # Only the attribute lookup is guarded: an AttributeError raised
        # *inside* the storage model is a different problem and must not be
        # reported as "models not initialized".
        try:
            storage = self._storage
        except AttributeError:
            raise RuntimeError("Models arent initialized. Something went to hell...")
        # just delegate to storage model
        storage.allow_large_documents()


    def check_uid(self):
        """
        Check if user id given in constructor is a valid user id within
        the Monitor storage system. If the UID is occupied, returns False,
        True otherwise.

        If user_id is None, an exception UidError is raised.

        @returns: True if the UID is free
        @rtype: bool
        @raises UidError: if the monitor works in global-view mode (uid=None)
        """
        if self._user_id is None:
            raise UidError("Cannot check uid=None. Monitor is switched to global-view mode.")
        # the actual lookup is delegated to the storage model
        return self._storage.check_uid()


    def check_multi(self, urls=None):
        """
        Check list of urls, start new thread for each one.

        NOTE: not implemented yet; every call currently raises NotSupportedYet.

        @param urls: URLs to check (None means no URLs)
        @type urls: list or None
        @returns: list of MonitoredResource objects, each with actual data
        @rtype: list<MonitoredResource>
        @raises NotSupportedYet: always
        """
        # TODO: check that all items in urls are valid URLs
        raise NotSupportedYet()


    def __repr__(self):
        """Return a short description: connection, database name and user id."""
        return "Monitor(conn=%s, dbname='%s', uid='%s')" % \
            (self._conn.connection, self._dbname, self._user_id)


    __str__ = __repr__