def __init__(self, leveldb_dir: os.PathLike, leveldb_blob_dir: os.PathLike = None):
    """Open an IndexedDB LevelDB store and eagerly parse its metadata layers.

    leveldb_dir: directory containing the LevelDB files.
    leveldb_blob_dir: optional directory holding externally-stored blob files
        (may be None when no blob directory is available).
    """
    self._db = ccl_leveldb.RawLevelDb(leveldb_dir)
    self._blob_dir = leveldb_blob_dir

    # Parse the three metadata layers up front from the raw records.
    raw_global = self._get_raw_global_metadata()
    self.global_metadata = GlobalMetadata(raw_global)
    raw_database = self._get_raw_database_metadata()
    self.database_metadata = DatabaseMetadata(raw_database)
    raw_object_store = self._get_raw_object_store_metadata()
    self.object_store_meta = ObjectStoreMetadata(raw_object_store)

    # Cache for blob lookups; presumably filled in by other methods — empty at construction.
    self._blob_lookup_cache = {}
def __init__(self, leveldb_dir: os.PathLike, leveldb_blob_dir: os.PathLike = None):
    """Open an IndexedDB LevelDB store; metadata parsing is delegated to _fetch_meta_data().

    leveldb_dir: directory containing the LevelDB files.
    leveldb_blob_dir: optional directory holding externally-stored blob files
        (may be None when no blob directory is available).
    """
    self._db = ccl_leveldb.RawLevelDb(leveldb_dir)
    self._blob_dir = leveldb_blob_dir

    # Placeholders; _fetch_meta_data() below is responsible for populating them.
    self.global_metadata = None
    self.database_metadata = None
    self.object_store_meta = None
    self._fetch_meta_data()

    # Cache for blob lookups; presumably filled in by other methods — empty at construction.
    self._blob_lookup_cache = {}
def main(args):
    """Dump every raw record from a LevelDB directory to a CSV file.

    args[0] is the input LevelDB directory; args[1], if present, is the output
    CSV path (defaults to "leveldb_dump.csv").
    """
    input_path = args[0]
    output_path = args[1] if len(args) > 1 else "leveldb_dump.csv"

    leveldb_records = ccl_leveldb.RawLevelDb(input_path)

    with open(output_path, "w", encoding="utf-8", newline="") as out_file:
        writer = csv.writer(out_file, quoting=csv.QUOTE_ALL)
        writer.writerow([
            "key-hex", "key-text", "value-hex", "value-text", "origin_file",
            "file_type", "offset", "seq", "state", "was_compressed"
        ])

        # One CSV row per raw record: key and value both as space-separated
        # hex and as best-effort decoded text, plus provenance details.
        for record in leveldb_records.iterate_records_raw():
            writer.writerow([
                record.user_key.hex(" ", 1),
                record.user_key.decode(ENCODING, "replace"),
                record.value.hex(" ", 1),
                record.value.decode(ENCODING, "replace"),
                str(record.origin_file),
                record.file_type.name,
                record.offset,
                record.seq,
                record.state.name,
                record.was_compressed
            ])
def __init__(self, in_dir: pathlib.Path):
    """Index a Chromium Session Storage LevelDB directory.

    Two passes over the raw records build:
      * self._map_id_to_host - read-only mapping of map_id -> host
      * self._deleted_keys   - frozenset of (guid, host) pairs seen deleted
      * self._host_lookup    - {host: {ss_key: [SessionStoreValue, ...]}}
      * self._orphans        - [(ss_key, SessionStoreValue)] for records whose
                               map_id could not be resolved to a host

    Raises:
        IOError: if in_dir is not a directory.
        ValueError: if two different hosts claim the same map_id.
    """
    if not in_dir.is_dir():
        raise IOError("Input directory is not a directory")

    self._ldb = ccl_leveldb.RawLevelDb(in_dir)

    # If performance is a concern we should refactor this, but slow and steady for now

    # First collect the namespace (session/tab guid + host) and map-ids together
    self._map_id_to_host = {}  # map_id: host
    self._deleted_keys = set()

    for rec in self._ldb.iterate_records_raw():
        if not rec.user_key.startswith(_NAMESPACE_PREFIX):
            continue
        if rec.user_key == _NAMESPACE_PREFIX:
            continue  # bogus entry near the top usually
        try:
            key = rec.user_key.decode("utf-8")
        except UnicodeDecodeError:
            print(f"Invalid namespace key: {rec.user_key}")
            continue

        # Expected shape: "<prefix>-<guid>-<host>"
        split_key = key.split("-", 2)
        if len(split_key) != 3:
            print(f"Invalid namespace key: {key}")
            continue

        _, guid, host = split_key
        if not host:
            continue  # TODO investigate why this happens
        # normalize host to lower just in case
        host = host.lower()
        guid_host_pair = guid, host

        if rec.state == ccl_leveldb.KeyState.Deleted:
            self._deleted_keys.add(guid_host_pair)
        else:
            try:
                map_id = rec.value.decode("utf-8")
            except UnicodeDecodeError:
                print(f"Invalid namespace value: {key}")
                continue
            if not map_id:
                continue  # TODO: investigate why this happens/do we want to keep the host around somewhere?

            if map_id in self._map_id_to_host and self._map_id_to_host[map_id] != host:
                # Two different hosts claiming one map id would make the second
                # pass ambiguous, so treat this as fatal.
                # FIX: diagnostic previously printed the (guid, host) pair under
                # the "New host" label; print just the host for consistency.
                print("Map ID Collision!")
                print(f"map_id: {map_id}")
                print(f"Old host: {self._map_id_to_host[map_id]}")
                print(f"New host: {host}")
                raise ValueError("map_id collision")
            else:
                self._map_id_to_host[map_id] = host

    # freeze stuff
    self._map_id_to_host = MappingProxyType(self._map_id_to_host)
    self._deleted_keys = frozenset(self._deleted_keys)

    self._host_lookup = {}  # {host: {ss_key: [SessionStoreValue, ...]}}
    self._orphans = []  # list of tuples of key, value where we can't get the host
    for rec in self._ldb.iterate_records_raw():
        if not rec.user_key.startswith(_MAP_ID_PREFIX):
            continue
        try:
            key = rec.user_key.decode("utf-8")
        except UnicodeDecodeError:
            print(f"Invalid map id key: {rec.user_key}")
            continue

        if rec.state == ccl_leveldb.KeyState.Deleted:
            continue  # TODO: do we want to keep the key around because the presence is important?

        # Expected shape: "<prefix>-<map_id>-<ss_key>"
        split_key = key.split("-", 2)
        if len(split_key) != 3:
            print(f"Invalid map id key: {key}")
            continue

        _, map_id, ss_key = split_key
        if not ss_key:
            # FIX: this guard previously tested `split_key`, which is always a
            # 3-element (truthy) list at this point, so empty session-storage
            # keys were never actually skipped as intended.
            # TODO what does it mean when there is no key here?
            # The value will also be a single number (encoded utf-8)
            continue

        try:
            value = rec.value.decode("UTF-16-LE")
        except UnicodeDecodeError:
            print(f"Error decoding value for {key}")
            print(f"Raw Value: {rec.value}")
            continue

        host = self._map_id_to_host.get(map_id)
        if not host:
            # Can't resolve the map id to a host - keep the record anyway.
            self._orphans.append((ss_key, SessionStoreValue(value, None, rec.seq)))
        else:
            self._host_lookup.setdefault(host, {})
            self._host_lookup[host].setdefault(ss_key, [])
            self._host_lookup[host][ss_key].append(SessionStoreValue(value, None, rec.seq))
def __init__(self, in_dir: pathlib.Path):
    """Index a Chromium Local Storage LevelDB directory.

    Collects metadata records and data records into lookup dictionaries, then
    reconstructs write "batches" by pairing each StorageMetadata with the run
    of seq-contiguous LocalStorageRecords (same storage key) that follows it.

    Raises:
        IOError: if in_dir is not a directory.
        ValueError: if an unexpected item type appears while batching
            (should be impossible given what is appended above).
    """
    if not in_dir.is_dir():
        raise IOError("Input directory is not a directory")
    self._ldb = ccl_leveldb.RawLevelDb(in_dir)

    self._storage_details = {}  # storage_key: {seq_number: StorageMetadata}
    self._flat_items = []  # [StorageMetadata|LocalStorageRecord] - used to batch items up
    self._records = {}  # storage_key: {script_key: {seq_number: LocalStorageRecord}}

    for record in self._ldb.iterate_records_raw():
        if record.user_key.startswith(_META_PREFIX) and record.state == ccl_leveldb.KeyState.Live:
            # Only live records for metadata - not sure what we can reliably infer from deleted keys
            storage_key = record.user_key.removeprefix(_META_PREFIX).decode(EIGHT_BIT_ENCODING)
            self._storage_details.setdefault(storage_key, {})
            metadata = StorageMetadata.from_protobuff(storage_key, record.value, record.seq)
            self._storage_details[storage_key][record.seq] = metadata
            self._flat_items.append(metadata)
        elif record.user_key.startswith(_RECORD_KEY_PREFIX):
            # We include deleted records here because we need them to build batches
            # Key layout after the prefix: <storage_key> 0x00 <script_key>
            storage_key_raw, script_key_raw = record.user_key.removeprefix(_RECORD_KEY_PREFIX).split(b"\x00", 1)
            storage_key = storage_key_raw.decode(EIGHT_BIT_ENCODING)
            script_key = decode_string(script_key_raw)
            try:
                # Deleted records have no meaningful value; store None for them.
                value = decode_string(record.value) if record.state == ccl_leveldb.KeyState.Live else None
            except UnicodeDecodeError as e:
                # Some sites play games to test the browser's capabilities like encoding half of a surrogate pair
                print(f"Error decoding record value at seq no {record.seq}; "
                      f"{storage_key} {script_key}: {record.value}")
                continue
            self._records.setdefault(storage_key, {})
            self._records[storage_key].setdefault(script_key, {})
            ls_record = LocalStorageRecord(storage_key, script_key, value, record.seq, record.state == ccl_leveldb.KeyState.Live)
            self._records[storage_key][script_key][record.seq] = ls_record
            self._flat_items.append(ls_record)

    # Freeze the lookups so later code cannot mutate them accidentally.
    self._storage_details = types.MappingProxyType(self._storage_details)
    self._records = types.MappingProxyType(self._records)

    self._all_storage_keys = frozenset(self._storage_details.keys() | self._records.keys())  # because deleted data.
    self._flat_items.sort(key=lambda x: x.leveldb_seq_number)

    # organise batches - this is made complex and slow by having to account for missing/deleted data
    # we're looking for a StorageMetadata followed by sequential (in terms of seq number) LocalStorageRecords
    # with the same storage key. Everything that falls within that chain can safely be considered a batch.
    # Any break in sequence numbers or storage key is a fail and can't be considered part of a batch.
    self._batches = {}  # keyed by the batch's StorageMetadata seq number
    current_meta: typing.Optional[StorageMetadata] = None  # metadata opening the current chain
    current_end = 0  # seq number of the last item accepted into the chain
    for item in self._flat_items:  # pre-sorted
        if isinstance(item, LocalStorageRecord):
            if current_meta is None:
                # no currently valid metadata so we can't attribute this record to anything
                continue
            elif item.leveldb_seq_number - current_end != 1 or item.storage_key != current_meta.storage_key:
                # this record breaks a chain, so bundle up what we have and clear everything out
                # NOTE(review): the breaking record itself is not attributed to any batch.
                self._batches[current_meta.leveldb_seq_number] = LocalStorageBatch(current_meta, current_end)
                current_meta = None
                current_end = 0
            else:
                # contiguous and right storage key, include in the current chain
                current_end = item.leveldb_seq_number
        elif isinstance(item, StorageMetadata):
            if current_meta is not None:
                # this record breaks a chain, so bundle up what we have, set new start
                self._batches[current_meta.leveldb_seq_number] = LocalStorageBatch(current_meta, current_end)
            current_meta = item
            current_end = item.leveldb_seq_number
        else:
            # _flat_items only ever receives the two types above.
            raise ValueError

    # Close out the final chain, if one was still open at the end of the data.
    if current_meta is not None:
        self._batches[current_meta.leveldb_seq_number] = LocalStorageBatch(current_meta, current_end)

    # Sorted batch start points, for binary-search style lookup by seq number.
    self._batch_starts = tuple(sorted(self._batches.keys()))