def __init__(self, repository, name=None, initial_values=()):
    """Initialize an instance.

    :param repository: the repository we're running in
    :type repository: ``repository.Repository``
    :param name: unique ID to use as the collection name.  If not
        specified, one is generated.
    :type name: string
    :param initial_values: a sequence of values to start off the
        collection with.  Defaults to empty list.
    :type initial_values: sequence of either Document or Collection instances
    """
    # maps id or name to element; elements may be documents or other collections
    self.__xitems = {}
    self.scantime = repository.mod_time()       # last time looked at
    self.repository = repository
    self.id = name or create_new_id()
    self.storage_path = os.path.join(repository.collections_folder(), self.id)
    if os.path.exists(self.storage_path):
        # A previously saved copy exists; prefer it over initial_values.
        try:
            # 'with' guarantees the file is closed even if load() raises
            # (the original leaked the descriptor on a load failure)
            with open(self.storage_path, 'rb') as fp:
                self.load(fp)
        except Exception:
            # Best-effort: a corrupt saved collection is logged, not fatal.
            # 'except Exception' (not bare 'except') so KeyboardInterrupt /
            # SystemExit still propagate; 'etype' avoids shadowing builtin 'type'.
            etype, value, tb = sys.exc_info()
            note(2, "Couldn't load collection %s:\n%s", self.id,
                 ''.join(traceback.format_exception(etype, value, tb)))
    elif initial_values:
        # Fresh collection: seed it from the supplied documents/collections.
        for item in initial_values:
            if isinstance(item, Document):
                self.__xitems[item.id] = DocumentPointer(item)
            elif isinstance(item, Collection):
                self.__xitems[item.name()] = CollectionPointer(item.id, item)
def _scan_rss_sites(repo):
    """Daemon loop: periodically scan configured RSS sites and file new
    entries into the repository as documents, expiring old unread ones.

    :param repo: the repository to add documents to (may be false/None,
        in which case entries are only logged, not uploaded)

    Never returns normally; loops forever with a configurable sleep
    between scans.  Intended to run on a background thread.
    """
    global _ADDED_SITES, _REMOVED_SITES
    try:
        # One-time startup: load config and the UploadDocument extension.
        from uplib.plibUtil import configurator, note, write_metadata, id_to_time, create_new_id
        from uplib.extensions import find_and_load_extension
        conf = configurator.default_configurator()
        if repo:
            # search repo-local extensions first, then site-wide ones
            sys_inits_path = os.path.join(conf.get('uplib-lib'), 'site-extensions')
            repo_inits_path = os.path.join(repo.root(), "overhead", "extensions", "active")
            upload_m = find_and_load_extension("UploadDocument", "%s|%s" % (repo_inits_path, sys_inits_path), None, True)
            if not upload_m:
                note(0, "Can't load UploadDocument extension!")
                sys.exit(1)
            else:
                note("UploadDocument extension is %s", upload_m)
        scan_period = conf.get_int("rss-scan-period", 60 * 2)
        startup_delay = conf.get_int("rss-startup-delay", 0)
        del conf
        import feedparser
        if startup_delay > 0:
            note(3, "startup delay is %d", startup_delay)
            time.sleep(startup_delay)
    except:
        # Startup failed (missing module, bad config, ...): log and give up.
        note(0, "RSSReader: exception starting RSS scan thread:\n%s",
             ''.join(traceback.format_exception(*sys.exc_info())))
        return
    # -1 is a sentinel meaning "no previous value", so the first pass
    # always logs the site list even if it is empty/None.
    rss_sites = -1
    while True:
        try:
            conf = configurator()       # re-read uplibrc file
            old_rss_sites = rss_sites
            rss_sites = conf.get("rss-sites")
            if old_rss_sites == -1 or (old_rss_sites != rss_sites):
                # log the site list on first pass and whenever it changes
                note(2, "rss_sites are %s", rss_sites)
            # periods are re-read each pass so config edits take effect live
            scan_period = conf.get_int("rss-scan-period", scan_period)
            expiration_period = conf.get_int("rss-expiration-period", 30 * 24 * 60 * 60)  # 30 days
            # merge configured sites with runtime additions/removals
            if rss_sites:
                rss_sites = rss_sites.split() + _ADDED_SITES
            else:
                rss_sites = _ADDED_SITES[:]
            if rss_sites:
                for site in _REMOVED_SITES:
                    if site in rss_sites:
                        rss_sites.remove(site)
            if rss_sites:
                feeds = []
                for site in rss_sites:
                    if site.startswith("feed:"):
                        # explicit feed URL: parse it directly
                        feeds.append(feedparser.parse(site))
                    elif site.startswith("http:") or site.startswith("https:"):
                        # plain web page: discover its feeds first
                        feeds += find_feeds(site)
                note("feeds are:\n%s", [(x.feed.title, x.href, len(x.entries)) for x in feeds])
                for feed in feeds:
                    note("RSSReader: %s: %s entries in feed %s", time.ctime(), len(feed.entries), feed.feed.title)
                    for entry in feed.entries:
                        d = process_entry(entry)
                        if not d:
                            continue
                        # skip entries we've already captured (dedup by rss-id)
                        id = d.get("rss-id")
                        hits = repo.do_query('+rss-id:"%s"' % id)
                        if hits:
                            # already in repo
                            continue
                        if repo:
                            # upload the entry's URL as a new document,
                            # carrying the entry metadata along
                            response = FakeResponse(repo)
                            mdoutput = StringIO.StringIO()
                            write_metadata(mdoutput, d)
                            md = mdoutput.getvalue()
                            mdoutput.close()
                            upload_m.add(repo, response, {
                                'URL': d.get("original-url"),
                                'wait': "true",
                                'no-redirect': "true",
                                'metadata': md,
                                'md-categories': "RSSReader/%s" % feed.feed.title,
                                })
                            # block (politely, in 1s joins) until the
                            # upload worker thread finishes
                            if response.thread:
                                while response.thread.isAlive():
                                    response.thread.join(1.0)
                            note("RSSReader: %s: %s (%s: %s)", time.ctime(), repr(d.get("title")), response.code, response.message)
                        else:
                            # no repository: just log what we would have added
                            note("RSSReader: %s: %s (%s)\n %s", time.ctime(), repr(d.get("title")), d.get("date"), d.get("summary"))
                # now do expiries
                # IDs encode creation time, so an ID built from
                # (now - expiration_period) upper-bounds a range query
                # for all documents older than the expiration period
                old_id = create_new_id(time.time() - expiration_period)[:-5]
                hits = repo.do_query("categories:RSSReader AND id:[00000-00-0000-000 TO %s] AND NOT categories:RSSReader/_noexpire_" % old_id)
                for score, doc in hits:
                    # check to see if the user has looked at it
                    if os.path.exists(os.path.join(doc.folder(), "activity")):
                        doc.add_category("RSSReader/_noexpire_", True)
                    # and if not, remove it
                    else:
                        repo.delete_document(doc.id)
            time.sleep(scan_period)
        except KeyboardInterrupt:
            if _IGNORE_KEYBOARD_INTERRUPTS:
                note(0, "RSSReader: %s", ''.join(traceback.format_exception(*sys.exc_info())))
            else:
                sys.exit(0)
        except:
            # keep the daemon alive through any per-scan failure; log and retry
            note(0, "RSSReader: %s", ''.join(traceback.format_exception(*sys.exc_info())))