class Monitor(object):
    """
    Monitor is the main class representing the web change monitor.

    It serves as a factory for creating MonitoredResource objects.

    Usage:

    >>> from rrslib.web.changemonitor import Monitor
    >>> monitor = Monitor(user_id="rrs_university")
    >>> resource = monitor.get("http://www.google.com")
    >>> # if the page changed
    >>> if resource.check():
    ...     print resource.get_diff(start='last', end='now')
    """

    def __init__(self, user_id, db_host="localhost", db_port=27017,
                 db_name="webarchive", http_proxy=None):
        """
        Create a new monitor connected to MongoDB at *db_host:db_port* using
        database *db_name*.

        @param user_id: identification string of user/module who uses monitor.
                        If user_id is given None, the monitor switches to
                        `global-view` mode and all requests to storage don't
                        care about >>who checked this resource<<. On the other
                        hand, if user_id is given a string, the monitor
                        switches to `user-view` mode and all operations are
                        oriented to the user. Most of the reasonable use cases
                        are using user_id, because a user/module almost every
                        time asks about >>what changed since I have been here
                        for the last time<<, not >>what changed since somebody
                        has been here for the last time<<...
        @type user_id: str or None
        @param db_host: (optional) hostname or IP address of the instance to
                        connect to, or a mongodb URI, or a list of hostnames /
                        mongodb URIs. If db_host is an IPv6 literal it must be
                        enclosed in '[' and ']' characters following the
                        RFC2732 URL syntax (e.g. '[::1]' for localhost)
        @param db_port: (optional) port number on which to connect
        @type db_port: int
        @param db_name: name of database which is used to store information
                        about monitored documents and their versions.
        @type db_name: str
        @param http_proxy: (FUTURE USE) proxy server where to send requests
        @type http_proxy: unknown
        @raises TypeError: if user_id is neither a string nor None
        @raises NotImplementedError: if http_proxy is anything but None
        """
        if user_id is not None and not isinstance(user_id, basestring):
            raise TypeError("User ID has to be type str or None.")
        # save user id
        self._user_id = user_id
        # for future use: proxies are rejected explicitly rather than ignored
        if http_proxy is not None:
            raise NotImplementedError("HTTP proxy not supported yet.")
        # initialize models
        self._init_models(db_host, db_port, db_name, user_id)

    def _init_models(self, host, port, db, uid):
        """
        Open the database connection and build the storage model on top of it.

        The connection settings are remembered on the instance as well
        (they are used e.g. by __repr__).

        @param host: database host passed to Connection
        @param port: database port passed to Connection
        @param db: name of the database passed to Storage
        @param uid: user id passed to Storage (may be None)
        """
        self._conn = Connection(host, port)
        self._storage = Storage(self._conn, uid, db)
        self._dbname = db
        self._dbport = port
        self._dbhost = host

    def get(self, url):
        """
        Creates new MonitoredResource instance which represents document on
        *url*.

        @param url: URL of monitored resource
        @type url: str
        @returns: monitored resource object bound to URL *url*.
        @rtype: MonitoredResource
        @raises ValueError: if *url* has no netloc or no scheme

        Design pattern: factory method.
        """
        # test the url validity; netloc is checked first, so an URL missing
        # both parts is reported as "missing netloc"
        parse_result = urlparse(url)
        if parse_result.netloc == '':
            raise ValueError("URL '%s' is not properly formatted: missing netloc." % url)
        if parse_result.scheme == '':
            raise ValueError("URL '%s' is not properly formatted: missing scheme." % url)
        # return monitored resource object
        return MonitoredResource(parse_result.geturl(), self._user_id, self._storage)

    def allow_large_documents(self):
        """
        Allow large objects to be stored in the storage. Large document is
        defined as file larger than 4096KB. This constant is defined in this
        module named as LARGE_DOCUMENT_SIZE representing size of the file
        in kilobytes.

        @raises RuntimeError: if the storage model is not available on this
                              instance
        """
        # Only the attribute lookup is guarded: an AttributeError raised
        # *inside* the storage implementation is a different problem and must
        # not be misreported as "models not initialized".
        try:
            allow = self._storage.allow_large_documents
        except AttributeError:
            raise RuntimeError("Models arent initialized. Something went to hell...")
        # just delegate to storage model
        allow()

    def check_uid(self):
        """
        Check if user id given in constructor is a valid user id within
        the Monitor storage system. If the UID is occupied, returns False,
        True otherwise.

        If user_id is None, an exception UidError is raised.

        @returns: True if the UID is free
        @rtype: bool
        @raises UidError: if the monitor runs in global-view mode (uid=None)
        """
        if self._user_id is None:
            raise UidError("Cannot check uid=None. Monitor is switched to global-view mode.")
        return self._storage.check_uid()

    def check_multi(self, urls=None):
        """
        Check list of urls, start new thread for each one.

        Not implemented yet: the method currently always raises
        NotSupportedYet.

        @param urls: URLs to be checked (None means no URLs)
        @type urls: list or None
        @returns: list of MonitoredResource objects, each with actual data
        @rtype: list<MonitoredResource>
        @raises NotSupportedYet: always
        """
        # TODO: check whether all items in urls are valid URLs
        raise NotSupportedYet()

    def __repr__(self):
        """Return a short description: connection, database name and uid."""
        return "Monitor(conn=%s, dbname='%s', uid='%s')" % \
            (self._conn.connection, self._dbname, self._user_id)

    __str__ = __repr__