# NOTE: import preamble reconstructed for this excerpt; the module paths below
# are assumed from the surrounding Webrecorder codebase and may need adjusting.
import os
import logging
import traceback

from datetime import datetime, date

import gevent

from pywb.utils.loaders import load
from pywb.warcserver.index.cdxobject import CDXObject

from webrecorder.utils import sanitize_title, get_new_id

from webrecorder.models.base import (RedisUniqueComponent, RedisNamedMap,
                                     RedisOrderedList, RedisUnorderedList)
from webrecorder.models.recording import Recording
from webrecorder.models.list_bookmarks import BookmarkList
from webrecorder.models.pages import PagesMixin
from webrecorder.models.auto import Auto
from webrecorder.models.stats import Stats
from webrecorder.models.datshare import DatShare
from webrecorder.rec.storage import get_global_storage
from webrecorder.rec.storage.storagepaths import strip_prefix

logger = logging.getLogger('wr.io')


class User(RedisUniqueComponent):
    """User Redis building block."""

    TEMP_PREFIX = 'temp-'

    MY_TYPE = 'user'
    INFO_KEY = 'u:{user}:info'
    ALL_KEYS = 'u:{user}:*'

    COLLS_KEY = 'u:{user}:colls'
    COLLS_REDIR_KEY = 'u:{user}:cr'

    MAX_ANON_SIZE = 1000000000
    MAX_USER_SIZE = 5000000000

    RATE_LIMIT_KEY = 'ipr:{ip}:{H}'

    URL_SKIP_KEY = 'us:{user}:s:{url}'
    SKIP_KEY_SECS = 330

    SERIALIZE_PROPS = ['desc', 'display_url', 'full_name']

    SERIALIZE_FULL_PROPS = SERIALIZE_PROPS + ['customer_id',
                                              'customer_max_size',
                                              'email_addr',
                                              'role',
                                              'last_login',
                                              'updated_at',
                                              'created_at',
                                              'timespan',
                                              'size',
                                              'max_size']

    @classmethod
    def init_props(cls, config):
        cls.MAX_USER_SIZE = int(config['default_max_size'])
        cls.MAX_ANON_SIZE = int(config['default_max_anon_size'])

        cls.rate_limit_max = int(os.environ.get('RATE_LIMIT_MAX', 0))
        cls.rate_limit_hours = int(os.environ.get('RATE_LIMIT_HOURS', 0))

        cls.rate_limit_restricted_max = int(os.environ.get('RATE_LIMIT_RESTRICTED_MAX', cls.rate_limit_max))
        cls.rate_limit_restricted_hours = int(os.environ.get('RATE_LIMIT_RESTRICTED_HOURS', cls.rate_limit_hours))

        cls.rate_restricted_ips = os.environ.get('RATE_LIMIT_RESTRICTED_IPS', '').split(',')

        cls.URL_SKIP_KEY = config['skip_key_templ']
        cls.SKIP_KEY_SECS = int(config['skip_key_secs'])

        cls.TEMP_PREFIX = config['temp_prefix']

    def __init__(self, **kwargs):
        super(User, self).__init__(**kwargs)
        self.colls = RedisNamedMap(self.COLLS_KEY, self, self.COLLS_REDIR_KEY)

    @property
    def name(self):
        return self.my_id

    def create_new(self):
        max_size = self.redis.hget('h:defaults', 'max_size')
        if not max_size:
            max_size = self.MAX_USER_SIZE

        self.init_new(max_size)

    def init_new(self, max_size):
        self.data = {'max_size': max_size,
                     'size': 0}

        self._init_new()

    def _create_new_id(self):
        self.info_key = self.INFO_KEY.format_map({self.MY_TYPE: self.my_id})
        return self.my_id

    def create_collection(self, coll_name, allow_dupe=False, **kwargs):
        coll_name = self.colls.reserve_obj_name(coll_name, allow_dupe=allow_dupe)

        collection = Collection(redis=self.redis,
                                access=self.access)

        collection.init_new(coll_name, **kwargs)

        self.colls.add_object(coll_name, collection, owner=True)

        return collection

    def has_collection(self, coll_name):
        return self.colls.name_to_id(coll_name) is not None

    def get_collection_by_name(self, coll_name):
        coll = self.colls.name_to_id(coll_name)
        return self.get_collection_by_id(coll, coll_name)

    def get_collection_by_id(self, coll, coll_name):
        if not coll:
            return None

        collection = Collection(my_id=coll,
                                name=coll_name,
                                redis=self.redis,
                                access=self.access)

        collection.owner = self
        return collection

    def get_collections(self, load=True):
        all_collections = self.colls.get_objects(Collection)
        collections = []
        for collection in all_collections:
            collection.owner = self
            if self.access.can_read_coll(collection, allow_superuser=True):
                if load:
                    collection.load()

                collections.append(collection)

        return collections

    def num_total_collections(self):
        return self.colls.num_objects()

    def move(self, collection, new_name, new_user):
        if self == new_user:
            return False

        new_name = new_user.colls.reserve_obj_name(new_name, allow_dupe=False)

        if not self.colls.remove_object(collection):
            return False

        new_user.colls.add_object(new_name, collection, owner=True)

        self.incr_size(-collection.size)
        new_user.incr_size(collection.size)

        Stats(self.redis).move_temp_to_user_usage(collection)

        for recording in collection.get_recordings():
            # will be marked for commit
            recording.set_closed()

        return True

    def remove_collection(self, collection, delete=False):
        if not collection:
            return {'error': 'no_collection'}

        if not self.colls.remove_object(collection):
            return {'error': 'not_found'}

        self.incr_size(-collection.size)

        if delete:
            return collection.delete_me()

        return {}

    def delete_me(self):
        self.access.assert_is_curr_user(self)

        for collection in self.get_collections(load=False):
            collection.delete_me()

        return self.delete_object()

    def get_size_allotment(self):
        max_size = self.redis.hget(self.info_key, 'max_size')
        if max_size:
            return int(max_size)

        return self.MAX_USER_SIZE

    def get_size_remaining(self):
        size, max_size = self.redis.hmget(self.info_key, ['size', 'max_size'])
        rem = 0

        try:
            if not size:
                size = 0

            if not max_size:
                max_size = self.MAX_USER_SIZE

            max_size = int(max_size)
            size = int(size)
            rem = max_size - size
        except Exception as e:
            print(e)

        return rem

    def is_out_of_space(self):
        self.access.assert_is_curr_user(self)

        return self.get_size_remaining() <= 0

    def mark_skip_url(self, url):
        key = self.URL_SKIP_KEY.format(user=self.my_id, url=url)
        self.redis.setex(key, self.SKIP_KEY_SECS, 1)

    def is_anon(self):
        return self.name.startswith(self.TEMP_PREFIX)

    def get_space_usage(self):
        total = self.get_size_allotment()
        avail = self.get_size_remaining()
        data = {
            'total': total,
            'used': total - avail,
            'available': avail,
        }
        return data

    def serialize(self, include_colls=False):
        full = self.access.is_logged_in_user(self) or self.access.is_superuser()
        all_data = super(User, self).serialize(include_duration=full)
        data = {'username': self.name}

        allowed_props = self.SERIALIZE_PROPS if not full else self.SERIALIZE_FULL_PROPS

        for prop in allowed_props:
            if prop in all_data:
                data[prop] = all_data[prop]

        colls = self.get_collections()

        data['num_collections'] = len(colls)

        if include_colls:
            data['collections'] = [coll.serialize(include_recordings=False,
                                                  include_pages=False,
                                                  include_lists=False)
                                   for coll in colls]

        # if not owner or superuser, return here, otherwise add additional properties
        if not full:
            return data

        data['space_utilization'] = self.get_space_usage()

        if self.is_anon():
            data['anon'] = True
            data['role'] = 'anon'
            data['ttl'] = self.access.get_anon_ttl()

            collection = self.get_collection_by_name('temp')
            if collection:
                data['num_recordings'] = collection.num_recordings()

        else:
            data['anon'] = False
            data['role'] = self['role']

            last_login = self.get_prop('last_login')
            if last_login:
                data['last_login'] = self.to_iso_date(last_login)

        return data

    def update_last_login(self):
        self.set_prop('last_login', int(datetime.utcnow().timestamp()))

    def __eq__(self, obj):
        return isinstance(obj, User) and self.my_id == obj.my_id

    @property
    def curr_role(self):
        return self['role']

    def is_rate_limited(self, ip):
        if not self.rate_limit_hours or not self.rate_limit_max:
            return None

        if self.access.is_superuser():
            return None

        if self.curr_role in ('rate-unlimited-archivist', 'supporter', 'free-supporter'):
            return None

        rate_key = self.RATE_LIMIT_KEY.format(ip=ip, H='')
        h = int(datetime.utcnow().strftime('%H'))

        if ip in self.rate_restricted_ips:
            limit_hours = self.rate_limit_restricted_hours
            limit_max = self.rate_limit_restricted_max
        else:
            limit_hours = self.rate_limit_hours
            limit_max = self.rate_limit_max

        rate_keys = [rate_key + '%02d' % ((h - i) % 24)
                     for i in range(0, limit_hours)]

        values = self.redis.mget(rate_keys)
        total = sum(int(v) for v in values if v)

        return (total >= limit_max)

    def get_user_temp_warc_path(self):
        return os.path.join(os.environ['RECORD_ROOT'], self.name)

    def is_owner(self, owner):
        return self == owner
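
# Illustrative usage sketch (not part of the original module). It shows the
# typical driving sequence for the User building block; `redis_conn` and
# `access` are assumed, pre-initialized dependencies, and the user and
# collection names are hypothetical placeholders.
def _example_user_flow(redis_conn, access):
    user = User(my_id='someuser', redis=redis_conn, access=access)
    user.create_new()                      # initializes size/max_size props

    # reserves the 'my-collection' slug, then creates and links the collection
    collection = user.create_collection('my-collection', title='My Collection')

    assert user.has_collection('my-collection')
    print(user.get_space_usage())          # {'total': ..., 'used': ..., 'available': ...}
    return collection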

class Collection(PagesMixin, RedisUniqueComponent):
    """Collection Redis building block.

    :cvar str RECS_KEY: recordings Redis key
    :cvar str LISTS_KEY: lists Redis key
    :cvar str LIST_NAMES_KEY: list names Redis key
    :cvar str LIST_REDIR_KEY: list redirect Redis key
    :cvar str AUTO_KEY: automations Redis key
    :cvar str COLL_CDXJ_KEY: CDX index file Redis key
    :cvar str CLOSE_WAIT_KEY: n.s.
    :cvar str EXTERNAL_KEY: external collection flag Redis key
    :cvar str COMMIT_WAIT_KEY: n.s.
    :cvar str INDEX_FILE_KEY: CDX index file
    :cvar int COMMIT_WAIT_SECS: wait for the given number of seconds
    :cvar str DEFAULT_COLL_DESC: default description
    :cvar str DEFAULT_STORE_TYPE: default Webrecorder storage
    :cvar int COLL_CDXJ_TTL: TTL of CDX index file
    :ivar RedisUnorderedList recs: recordings
    :ivar RedisOrderedList lists: n.s.
    :ivar RedisNamedMap list_names: n.s.
    """

    MY_TYPE = 'coll'
    INFO_KEY = 'c:{coll}:info'
    ALL_KEYS = 'c:{coll}:*'

    RECS_KEY = 'c:{coll}:recs'

    LISTS_KEY = 'c:{coll}:lists'
    LIST_NAMES_KEY = 'c:{coll}:ln'
    LIST_REDIR_KEY = 'c:{coll}:lr'

    AUTO_KEY = 'c:{coll}:autos'

    COLL_CDXJ_KEY = 'c:{coll}:cdxj'

    CLOSE_WAIT_KEY = 'c:{coll}:wait:{id}'

    EXTERNAL_KEY = 'c:{coll}:ext'

    COMMIT_WAIT_KEY = 'w:{filename}'

    INDEX_FILE_KEY = '@index_file'

    COMMIT_WAIT_SECS = 30

    DEFAULT_COLL_DESC = ''

    DEFAULT_STORE_TYPE = 'local'

    COLL_CDXJ_TTL = 1800

    def __init__(self, **kwargs):
        """Initialize collection Redis building block."""
        super(Collection, self).__init__(**kwargs)
        self.recs = RedisUnorderedList(self.RECS_KEY, self)
        self.lists = RedisOrderedList(self.LISTS_KEY, self)

        self.list_names = RedisNamedMap(self.LIST_NAMES_KEY, self, self.LIST_REDIR_KEY)

    @classmethod
    def init_props(cls, config):
        """Initialize class variables.

        :param dict config: Webrecorder configuration
        """
        cls.COLL_CDXJ_TTL = int(config['coll_cdxj_ttl'])

        cls.DEFAULT_STORE_TYPE = os.environ.get('DEFAULT_STORAGE', 'local')

        cls.DEFAULT_COLL_DESC = config['coll_desc']

        cls.COMMIT_WAIT_SECS = int(config['commit_wait_secs'])

    def create_recording(self, **kwargs):
        """Create recording.

        :returns: recording
        :rtype: Recording
        """
        self.access.assert_can_admin_coll(self)

        recording = Recording(redis=self.redis,
                              access=self.access)

        recording.init_new(**kwargs)

        self.recs.add_object(recording, owner=True)

        return recording

    def move_recording(self, obj, new_collection):
        """Move recording into new collection.

        :param Recording obj: recording
        :param Collection new_collection: new collection
        :returns: name of new recording or None
        :rtype: str or None
        """
        new_recording = new_collection.create_recording()

        if new_recording.copy_data_from_recording(obj, delete_source=True):
            return new_recording.name

        return None

    def create_auto(self, props=None):
        self.access.assert_can_admin_coll(self)

        auto = Auto(redis=self.redis, access=self.access)

        aid = auto.init_new(self, props)

        self.redis.sadd(self.AUTO_KEY.format(coll=self.my_id), aid)

        return aid

    def get_auto(self, aid):
        if not self.access.can_admin_coll(self):
            return None

        auto = Auto(my_id=aid, redis=self.redis, access=self.access)

        if auto['owner'] != self.my_id:
            return None

        auto.owner = self

        return auto

    def get_autos(self):
        return [self.get_auto(aid) for aid in self.redis.smembers(self.AUTO_KEY.format(coll=self.my_id))]

    def remove_auto(self, auto):
        self.access.assert_can_admin_coll(self)

        count = self.redis.srem(self.AUTO_KEY.format(coll=self.my_id), auto.my_id)

        if not count:
            return False

        return auto.delete_me()

    def create_bookmark_list(self, props):
        """Create list of bookmarks.

        :param dict props: properties
        :returns: list of bookmarks
        :rtype: BookmarkList
        """
        self.access.assert_can_write_coll(self)

        bookmark_list = BookmarkList(redis=self.redis,
                                     access=self.access)

        bookmark_list.init_new(self, props)

        before_blist = self.get_list(props.get('before_id'))

        self.lists.insert_ordered_object(bookmark_list, before_blist)

        slug = self.get_list_slug(props.get('title'))
        if slug:
            self.list_names.add_object(slug, bookmark_list)

        return bookmark_list

    def get_lists(self, load=True, public_only=False):
        """Return lists of bookmarks.

        :param bool load: whether to load Redis entries
        :param bool public_only: whether to load only public lists
        :returns: lists of bookmarks
        :rtype: list
        """
        self.access.assert_can_read_coll(self)

        lists = self.lists.get_ordered_objects(BookmarkList, load=load)

        if public_only or not self.access.can_write_coll(self):
            lists = [blist for blist in lists if blist.is_public()]
            #lists = [blist for blist in lists if self.access.can_read_list(blist)]

        return lists

    def get_list_slug(self, title):
        """Return reserved slug for the given title.

        :param str title: title
        :returns: reserved slug or None
        :rtype: str or None
        """
        if not title:
            return

        slug = sanitize_title(title)
        if not slug:
            return

        return self.list_names.reserve_obj_name(slug, allow_dupe=True)

    def update_list_slug(self, new_title, bookmark_list):
        """Update a list's slug from a new title.

        :param str new_title: new title
        :param BookmarkList bookmark_list: list of bookmarks
        :returns: whether successful or not
        :rtype: bool
        """
        old_title = bookmark_list.get_prop('title')
        if old_title == new_title:
            return False

        new_slug = self.list_names.rename(bookmark_list, sanitize_title(new_title))
        return new_slug is not None

    def get_list(self, blist_id):
        """Return list of bookmarks.

        :param str blist_id: list ID
        :returns: list of bookmarks
        :rtype: BookmarkList or None
        """
        if not self.lists.contains_id(blist_id):
            return None

        bookmark_list = BookmarkList(my_id=blist_id,
                                     redis=self.redis,
                                     access=self.access)

        bookmark_list.owner = self

        if not self.access.can_read_list(bookmark_list):
            return None

        return bookmark_list

    def get_list_by_slug_or_id(self, slug_or_id):
        """Return list of bookmarks by slug or ID.

        :param str slug_or_id: either list slug or list ID
        :returns: list of bookmarks
        :rtype: BookmarkList or None
        """
        # see if it's a slug, otherwise treat as an id
        blist_id = self.list_names.name_to_id(slug_or_id) or slug_or_id

        return self.get_list(blist_id)

    def move_list_before(self, blist, before_blist):
        """Move list of bookmarks in ordered list.

        :param BookmarkList blist: list to move
        :param BookmarkList before_blist: list to insert before
        """
        self.access.assert_can_write_coll(self)

        self.lists.insert_ordered_object(blist, before_blist)

    def remove_list(self, blist):
        """Remove list of bookmarks from ordered list.

        :param BookmarkList blist: list of bookmarks
        :returns: whether successful or not
        :rtype: bool
        """
        self.access.assert_can_write_coll(self)

        if not self.lists.remove_ordered_object(blist):
            return False

        self.list_names.remove_object(blist)

        blist.delete_me()

        return True

    def num_lists(self):
        """Return number of lists of bookmarks.

        :returns: number of lists
        :rtype: int
        """
        if self.access.can_write_coll(self):
            return self.lists.num_ordered_objects()
        else:
            return len(self.get_lists())

    def init_new(self, slug, title, desc='', public=False, public_index=False):
        """Initialize new collection.

        :param str slug: slug
        :param str title: title
        :param str desc: description
        :param bool public: whether collection is public
        :param bool public_index: whether CDX index file is public
        :returns: collection ID
        :rtype: str
        """
        coll = self._create_new_id()

        self.data = {'title': title,
                     'size': 0,
                     'desc': desc,
                     'public': self._from_bool(public),
                     'public_index': self._from_bool(public_index),
                    }

        self._init_new()

        return coll

    def get_recording(self, rec):
        """Return recording.

        :param str rec: recording ID
        :returns: recording
        :rtype: Recording or None
        """
        if not self.recs.contains_id(rec):
            return None

        recording = Recording(my_id=rec,
                              name=rec,
                              redis=self.redis,
                              access=self.access)

        recording.owner = self
        return recording

    def num_recordings(self):
        """Return number of recordings.

        :returns: number of recordings
        :rtype: int
        """
        return self.recs.num_objects()

    def get_recordings(self, load=True):
        """Return recordings.

        :param bool load: whether to load Redis entries
        :returns: list of recordings
        :rtype: list
        """
        return self.recs.get_objects(Recording, load=load)

    def _get_rec_keys(self, key_templ):
        """Return recording Redis keys.

        :param str key_templ: Redis key template
        :returns: recording Redis keys
        :rtype: list
        """
        self.access.assert_can_read_coll(self)

        key_pattern = key_templ.format(rec='*')

        #comp_map = self.get_comp_map()
        #recs = self.redis.hvals(comp_map)
        recs = self.recs.get_keys()

        return [key_pattern.replace('*', rec) for rec in recs]

    def get_warc_key(self):
        return Recording.COLL_WARC_KEY.format(coll=self.my_id)

    def commit_all(self, commit_id=None):
        # see if pending commits have been finished
        if commit_id:
            commit_key = self.CLOSE_WAIT_KEY.format(coll=self.my_id, id=commit_id)
            open_rec_ids = self.redis.smembers(commit_key)

            still_waiting = False
            for rec_id in open_rec_ids:
                recording = self.get_recording(rec_id)
                if recording.is_fully_committed():
                    continue

                still_waiting = True

            if not still_waiting:
                self.redis.delete(commit_key)
                return None

            return commit_id

        open_recs = []

        for recording in self.get_recordings():
            if recording.is_open():
                recording.set_closed()
                recording.commit_to_storage()
            elif recording.is_fully_committed():
                continue

            open_recs.append(recording)

        if not open_recs:
            return None

        commit_id = get_new_id(5)
        commit_key = self.CLOSE_WAIT_KEY.format(coll=self.my_id, id=commit_id)

        open_keys = [recording.my_id for recording in open_recs]

        self.redis.sadd(commit_key, *open_keys)
        self.redis.expire(commit_key, 200)

        return commit_id

    def import_serialized(self, data, coll_dir):
        page_id_map = {}

        self.set_external(True)

        for rec_data in data['recordings']:
            # CREATE RECORDING
            recording = self.create_recording(title=data.get('title'),
                                              desc=data.get('desc'),
                                              rec_type=data.get('rec_type'),
                                              ra_list=data.get('ra'))

            # Files
            files = rec_data.get('files')

            # WARCS
            if files:
                for filename in files.get('warcs', []):
                    full_filename = os.path.join(coll_dir, 'warcs', filename)
                    rec_warc_key = recording.REC_WARC_KEY.format(rec=recording.my_id)
                    coll_warc_key = self.get_warc_key()

                    self.redis.hset(coll_warc_key, filename, full_filename)
                    self.redis.sadd(rec_warc_key, filename)

                # CDX
                index_files = files.get('indexes', [])
                if index_files:
                    index_filename = os.path.join(coll_dir, 'indexes', index_files[0])
                    with open(index_filename, 'rb') as fh:
                        self.add_cdxj(fh.read())

                    recording.set_prop(recording.INDEX_FILE_KEY, index_filename)

            # PAGES
            pages = rec_data.get('pages')
            if pages:
                page_id_map.update(self.import_pages(pages, recording))

            self.set_date_prop('created_at', rec_data)
            self.set_date_prop('recorded_at', rec_data, 'updated_at')
            self.set_date_prop('updated_at', rec_data)

        # props
        self.set_date_prop('created_at', data)
        self.set_date_prop('updated_at', data)

        # LISTS
        lists = data.get('lists')
        if not lists:
            return

        for list_data in lists:
            bookmarks = list_data.pop('bookmarks', [])
            list_data['public'] = True
            blist = self.create_bookmark_list(list_data)

            for bookmark_data in bookmarks:
                page_id = bookmark_data.get('page_id')
                if page_id:
                    bookmark_data['page_id'] = page_id_map.get(page_id)

                blist.create_bookmark(bookmark_data, incr_stats=False)

    def serialize(self, include_recordings=True,
                        include_lists=True,
                        include_rec_pages=False,
                        include_pages=True,
                        include_bookmarks='first',
                        convert_date=True,
                        check_slug=False,
                        include_files=False):

        data = super(Collection, self).serialize(convert_date=convert_date)
        data['id'] = self.name

        if check_slug:
            data['slug_matched'] = (check_slug == data.get('slug'))

        is_owner = self.access.is_coll_owner(self)

        if include_recordings:
            recordings = self.get_recordings(load=True)
            rec_serialized = []

            duration = 0
            for recording in recordings:
                rec_data = recording.serialize(include_pages=include_rec_pages,
                                               include_files=include_files)
                rec_serialized.append(rec_data)
                duration += rec_data.get('duration', 0)

            if is_owner:
                data['recordings'] = rec_serialized

            data['duration'] = duration

        if include_lists:
            lists = self.get_lists(load=True, public_only=False)
            data['lists'] = [blist.serialize(include_bookmarks=include_bookmarks,
                                             convert_date=convert_date)
                             for blist in lists]

        if not data.get('desc'):
            data['desc'] = self.DEFAULT_COLL_DESC.format(self.name)

        data['public'] = self.is_public()
        data['public_index'] = self.get_bool_prop('public_index', False)

        if DatShare.DAT_SHARE in data:
            data[DatShare.DAT_SHARE] = self.get_bool_prop(DatShare.DAT_SHARE, False)

        if DatShare.DAT_UPDATED_AT in data:
            data[DatShare.DAT_UPDATED_AT] = self.to_iso_date(data[DatShare.DAT_UPDATED_AT])

        if include_pages:
            if is_owner or data['public_index']:
                data['pages'] = self.list_pages()

        data.pop('num_downloads', '')

        return data

    def remove_recording(self, recording, delete=False):
        self.access.assert_can_admin_coll(self)

        if not recording:
            return {'error': 'no_recording'}

        if not self.recs.remove_object(recording):
            return {'error': 'not_found'}

        self.incr_size(-recording.size)

        user = self.get_owner()
        if user:
            user.incr_size(-recording.size)

        if delete:
            storage = self.get_storage()
            return recording.delete_me(storage)

        self.sync_coll_index(exists=True, do_async=True)
        return {}

    def delete_me(self):
        self.access.assert_can_admin_coll(self)

        storage = self.get_storage()

        errs = {}

        for recording in self.get_recordings(load=False):
            errs.update(recording.delete_me(storage, pages=False))

        for blist in self.get_lists(load=False):
            blist.delete_me()

        for auto in self.get_autos():
            if auto:
                auto.delete_me()

        if storage:
            if not storage.delete_collection(self):
                errs['error_delete_coll'] = 'not_found'

        if not self.delete_object():
            errs['error'] = 'not_found'

        if DatShare.dat_share:
            DatShare.dat_share.unshare(self)

        return errs

    def get_storage(self):
        storage_type = self.get_prop('storage_type')

        if not storage_type:
            storage_type = self.DEFAULT_STORE_TYPE

        return get_global_storage(storage_type, self.redis)

    def get_created_iso_date(self):
        try:
            dt_str = date.fromtimestamp(int(self['created_at'])).isoformat()
        except:
            dt_str = self['created_at'][:10]

        return dt_str

    def get_dir_path(self):
        return self.get_created_iso_date() + '/' + self.my_id

    def add_cdxj(self, cdxj_text):
        if not self.is_external():
            return 0

        coll_cdxj_key = self.COLL_CDXJ_KEY.format(coll=self.my_id)

        count = 0

        for line in cdxj_text.split(b'\n'):
            if not line:
                continue

            try:
                cdx = CDXObject(line)
                self.redis.zadd(coll_cdxj_key, 0, str(cdx))
                count += 1
            except:
                pass

        #self.redis.expire(coll_cdxj_key, self.COLL_CDXJ_TTL)
        return count

    def add_warcs(self, warc_map):
        if not self.is_external():
            return 0

        warc_key = self.get_warc_key()

        if warc_map:
            self.redis.hmset(warc_key, warc_map)

        return len(warc_map)

    def is_external(self):
        return self.get_bool_prop('external')

    def set_external(self, external):
        self.set_bool_prop('external', external)

    def set_external_remove_on_expire(self):
        key = self.EXTERNAL_KEY.format(coll=self.my_id)
        self.redis.set(key, '1')

    def commit_file(self, filename, full_filename, obj_type,
                    update_key=None, update_prop=None, direct_delete=False):

        user = self.get_owner()
        storage = self.get_storage()

        if not storage:
            logger.debug('Skip File Commit: No Storage')
            return True

        orig_full_filename = full_filename
        full_filename = strip_prefix(full_filename)

        # not a local filename
        if '://' in full_filename and not full_filename.startswith('local'):
            logger.debug('Skip File Commit: Not Local Filename: {0}'.format(full_filename))
            return True

        if not os.path.isfile(full_filename):
            logger.debug('Fail File Commit: Not Found: {0}'.format(full_filename))
            return False

        commit_wait = self.COMMIT_WAIT_KEY.format(filename=full_filename)

        if self.redis.set(commit_wait, '1', ex=self.COMMIT_WAIT_SECS, nx=True):
            if not storage.upload_file(user, self, None, filename, full_filename, obj_type):
                self.redis.delete(commit_wait)
                return False

        # already uploaded, see if it is accessible
        # if so, finalize and delete original
        remote_url = storage.get_upload_url(filename)
        if not remote_url:
            logger.debug('File Commit: Not Yet Available: {0}'.format(full_filename))
            return False

        if update_key:
            update_prop = update_prop or filename
            self.redis.hset(update_key, update_prop, remote_url)

        # just in case, if remote_url is actually same as original
        # (local file double-commit?), just return
        if remote_url == orig_full_filename:
            logger.debug('File Already Committed: {0}'.format(remote_url))
            return True

        # if direct delete, call os.remove directly
        # used for CDXJ files which are not owned by a writer
        if direct_delete:
            try:
                os.remove(full_filename)
            except Exception:
                traceback.print_exc()
        else:
            # for WARCs, send handle_delete to ensure writer can close the file
            if self.redis.publish('handle_delete_file', full_filename) < 1:
                logger.debug('No Delete Listener!')

        logger.debug('File Committed {0} -> {1}'.format(full_filename, remote_url))
        return True

    def has_cdxj(self):
        coll_cdxj_key = self.COLL_CDXJ_KEY.format(coll=self.my_id)
        return self.redis.exists(coll_cdxj_key)

    def sync_coll_index(self, exists=False, do_async=False):
        coll_cdxj_key = self.COLL_CDXJ_KEY.format(coll=self.my_id)
        if exists != self.redis.exists(coll_cdxj_key):
            if self.COLL_CDXJ_TTL > 0:
                self.redis.expire(coll_cdxj_key, self.COLL_CDXJ_TTL)

            return

        cdxj_keys = self._get_rec_keys(Recording.CDXJ_KEY)
        if not cdxj_keys:
            return

        self.redis.zunionstore(coll_cdxj_key, cdxj_keys)

        if self.COLL_CDXJ_TTL > 0:
            self.redis.expire(coll_cdxj_key, self.COLL_CDXJ_TTL)

        ges = []
        for cdxj_key in cdxj_keys:
            if self.redis.exists(cdxj_key):
                continue

            ges.append(gevent.spawn(self._do_download_cdxj, cdxj_key, coll_cdxj_key))

        if not do_async:
            gevent.joinall(ges)

    def _do_download_cdxj(self, cdxj_key, output_key):
        lock_key = None
        try:
            rec_info_key = cdxj_key.rsplit(':', 1)[0] + ':info'
            cdxj_filename = self.redis.hget(rec_info_key, self.INDEX_FILE_KEY)
            if not cdxj_filename:
                logger.debug('CDX Sync: No index for ' + rec_info_key)
                return

            lock_key = cdxj_key + ':_'

            logger.debug('CDX Sync: Downloading for {0} file {1}'.format(rec_info_key, cdxj_filename))

            attempts = 0

            if not self.redis.set(lock_key, 1, ex=self.COMMIT_WAIT_SECS, nx=True):
                logger.warning('CDX Sync: Already downloading, skipping: {0}'.format(cdxj_filename))
                lock_key = None
                return

            while attempts < 10:
                fh = None
                try:
                    fh = load(cdxj_filename)
                    buff = fh.read()

                    for cdxj_line in buff.splitlines():
                        self.redis.zadd(output_key, 0, cdxj_line)

                    break
                except Exception:
                    traceback.print_exc()
                    logger.error('CDX Sync: Could not load: ' + cdxj_filename)
                    attempts += 1
                finally:
                    if fh:
                        fh.close()

            if self.COLL_CDXJ_TTL > 0:
                self.redis.expire(output_key, self.COLL_CDXJ_TTL)

        except Exception as e:
            logger.error('CDX Sync: Error downloading cache: ' + str(e))
            traceback.print_exc()

        finally:
            if lock_key:
                self.redis.delete(lock_key)
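
# Illustrative commit-polling sketch (not part of the original module).
# commit_all() returns None once every recording is fully committed, or a
# commit id to pass back in on later calls; the poll interval and retry bound
# below are arbitrary example values.
def _example_commit_poll(collection, poll_secs=2, max_polls=15):
    import time

    commit_id = collection.commit_all()    # close open recordings, start commit
    polls = 0

    while commit_id and polls < max_polls:
        time.sleep(poll_secs)
        commit_id = collection.commit_all(commit_id)  # None when fully committed
        polls += 1

    return commit_id is None               # True if everything is committed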