def __init__(self, leveldb_dir: os.PathLike, leveldb_blob_dir: os.PathLike = None):
        """Open the IndexedDB leveldb store and eagerly parse its metadata.

        Args:
            leveldb_dir: path of the leveldb folder backing the IndexedDB store.
            leveldb_blob_dir: optional path of the accompanying blob folder.
        """
        self._db = ccl_leveldb.RawLevelDb(leveldb_dir)
        self._blob_dir = leveldb_blob_dir

        # Parse the three metadata layers up front, outermost first.
        raw_global = self._get_raw_global_metadata()
        self.global_metadata = GlobalMetadata(raw_global)
        raw_database = self._get_raw_database_metadata()
        self.database_metadata = DatabaseMetadata(raw_database)
        raw_object_store = self._get_raw_object_store_metadata()
        self.object_store_meta = ObjectStoreMetadata(raw_object_store)

        # Cache for blob lookups, filled lazily elsewhere.
        self._blob_lookup_cache = {}
    def __init__(self,
                 leveldb_dir: os.PathLike,
                 leveldb_blob_dir: os.PathLike = None):
        self._db = ccl_leveldb.RawLevelDb(leveldb_dir)
        self._blob_dir = leveldb_blob_dir
        self.global_metadata = None
        self.database_metadata = None
        self.object_store_meta = None
        self._fetch_meta_data()

        self._blob_lookup_cache = {}
# Beispiel #3 (Example #3) — scraped-example separator; the following "0" was a vote count: 0
def main(args):
    """Dump every raw record in a leveldb folder to a quoted CSV file.

    Args:
        args: argv tail; args[0] is the leveldb directory, optional args[1]
              overrides the output path (default "leveldb_dump.csv").
    """
    input_path = args[0]
    output_path = args[1] if len(args) > 1 else "leveldb_dump.csv"

    db = ccl_leveldb.RawLevelDb(input_path)

    header = [
        "key-hex", "key-text", "value-hex", "value-text", "origin_file",
        "file_type", "offset", "seq", "state", "was_compressed"
    ]

    with open(output_path, "w", encoding="utf-8", newline="") as out:
        csv_out = csv.writer(out, quoting=csv.QUOTE_ALL)
        csv_out.writerow(header)

        for rec in db.iterate_records_raw():
            # Keys/values are raw bytes: emit both a hex view and a lossy
            # text decode for each.
            row = [
                rec.user_key.hex(" ", 1),
                rec.user_key.decode(ENCODING, "replace"),
                rec.value.hex(" ", 1),
                rec.value.decode(ENCODING, "replace"),
                str(rec.origin_file),
                rec.file_type.name,
                rec.offset,
                rec.seq,
                rec.state.name,
                rec.was_compressed,
            ]
            csv_out.writerow(row)
    def __init__(self, in_dir: pathlib.Path):
        """Read a Chromium Session Storage leveldb folder and index its records.

        Two passes over the raw records: the first maps map-ids to hosts from
        the namespace records; the second attributes the actual key/value
        records to hosts via those map-ids. Records whose map-id cannot be
        resolved are kept in self._orphans.

        Args:
            in_dir: path of the Session Storage leveldb folder.

        Raises:
            IOError: if in_dir is not a directory.
            ValueError: if one map-id resolves to two different hosts.
        """
        if not in_dir.is_dir():
            raise IOError("Input directory is not a directory")

        self._ldb = ccl_leveldb.RawLevelDb(in_dir)

        # If performance is a concern we should refactor this, but slow and steady for now

        # First collect the namespace (session/tab guid  + host) and map-ids together
        self._map_id_to_host = {}  # map_id: host
        self._deleted_keys = set()

        for rec in self._ldb.iterate_records_raw():
            if rec.user_key.startswith(_NAMESPACE_PREFIX):
                if rec.user_key == _NAMESPACE_PREFIX:
                    continue  # bogus entry near the top usually
                try:
                    key = rec.user_key.decode("utf-8")
                except UnicodeDecodeError:
                    print(f"Invalid namespace key: {rec.user_key}")
                    continue

                # Namespace keys look like: prefix-guid-host
                split_key = key.split("-", 2)
                if len(split_key) != 3:
                    print(f"Invalid namespace key: {key}")
                    continue

                _, guid, host = split_key

                if not host:
                    continue  # TODO investigate why this happens

                # normalize host to lower just in case
                host = host.lower()
                guid_host_pair = guid, host

                if rec.state == ccl_leveldb.KeyState.Deleted:
                    self._deleted_keys.add(guid_host_pair)
                else:
                    try:
                        map_id = rec.value.decode("utf-8")
                    except UnicodeDecodeError:
                        print(f"Invalid namespace value: {key}")
                        continue

                    if not map_id:
                        continue  # TODO: investigate why this happens/do we want to keep the host around somewhere?

                    if map_id in self._map_id_to_host and self._map_id_to_host[
                            map_id] != host:
                        print("Map ID Collision!")
                        print(f"map_id: {map_id}")
                        print(f"Old host: {self._map_id_to_host[map_id]}")
                        print(f"New host: {guid_host_pair}")
                        raise ValueError("map_id collision")
                    else:
                        self._map_id_to_host[map_id] = host

        # freeze stuff
        self._map_id_to_host = MappingProxyType(self._map_id_to_host)
        self._deleted_keys = frozenset(self._deleted_keys)

        self._host_lookup = {}  # {host: {ss_key: [SessionStoreValue, ...]}}
        self._orphans = [
        ]  #  list of tuples of key, value where we can't get the host
        for rec in self._ldb.iterate_records_raw():
            if rec.user_key.startswith(_MAP_ID_PREFIX):
                try:
                    key = rec.user_key.decode("utf-8")
                except UnicodeDecodeError:
                    print(f"Invalid map id key: {rec.user_key}")
                    continue

                if rec.state == ccl_leveldb.KeyState.Deleted:
                    continue  # TODO: do we want to keep the key around because the presence is important?

                # Map keys look like: prefix-map_id-ss_key
                split_key = key.split("-", 2)
                if len(split_key) != 3:
                    print(f"Invalid map id key: {key}")
                    continue

                _, map_id, ss_key = split_key

                # Fix: this guard previously tested `split_key`, which is always
                # a 3-item list at this point, so it could never trigger. Test
                # the actual session-storage key instead.
                if not ss_key:
                    # TODO what does it mean when there is no key here?
                    #      The value will also be a single number (encoded utf-8)
                    continue

                try:
                    value = rec.value.decode("UTF-16-LE")
                except UnicodeDecodeError:
                    print(f"Error decoding value for {key}")
                    print(f"Raw Value: {rec.value}")
                    continue

                host = self._map_id_to_host.get(map_id)
                if not host:
                    # Couldn't resolve the map-id to a host: keep the record as an orphan.
                    self._orphans.append(
                        (ss_key, SessionStoreValue(value, None, rec.seq)))
                else:
                    self._host_lookup.setdefault(host, {})
                    self._host_lookup[host].setdefault(ss_key, [])
                    self._host_lookup[host][ss_key].append(
                        SessionStoreValue(value, None, rec.seq))
    def __init__(self, in_dir: pathlib.Path):
        """Index a Chromium Local Storage leveldb folder.

        Collects StorageMetadata and LocalStorageRecord objects keyed by
        storage key / script key / leveldb sequence number, then reconstructs
        write "batches": a metadata record followed by a contiguous run of
        records (by sequence number) with the same storage key.

        Args:
            in_dir: path of the Local Storage leveldb folder.

        Raises:
            IOError: if in_dir is not a directory.
        """
        if not in_dir.is_dir():
            raise IOError("Input directory is not a directory")

        self._ldb = ccl_leveldb.RawLevelDb(in_dir)

        self._storage_details = {
        }  # storage_key: {seq_number: StorageMetadata}
        self._flat_items = [
        ]  # [StorageMetadata|LocalStorageRecord]   - used to batch items up
        self._records = {
        }  # storage_key: {script_key: {seq_number: LocalStorageRecord}}

        for record in self._ldb.iterate_records_raw():
            if record.user_key.startswith(
                    _META_PREFIX
            ) and record.state == ccl_leveldb.KeyState.Live:
                # Only live records for metadata - not sure what we can reliably infer from deleted keys
                storage_key = record.user_key.removeprefix(
                    _META_PREFIX).decode(EIGHT_BIT_ENCODING)
                self._storage_details.setdefault(storage_key, {})
                metadata = StorageMetadata.from_protobuff(
                    storage_key, record.value, record.seq)
                self._storage_details[storage_key][record.seq] = metadata
                self._flat_items.append(metadata)
            elif record.user_key.startswith(_RECORD_KEY_PREFIX):
                # We include deleted records here because we need them to build batches
                # Record keys are: prefix + storage_key + NUL + script_key
                storage_key_raw, script_key_raw = record.user_key.removeprefix(
                    _RECORD_KEY_PREFIX).split(b"\x00", 1)
                storage_key = storage_key_raw.decode(EIGHT_BIT_ENCODING)
                script_key = decode_string(script_key_raw)

                # Deleted records carry no value; keep them with value=None.
                try:
                    value = decode_string(
                        record.value
                    ) if record.state == ccl_leveldb.KeyState.Live else None
                except UnicodeDecodeError as e:
                    # Some sites play games to test the browser's capabilities like encoding half of a surrogate pair
                    print(
                        f"Error decoding record value at seq no {record.seq}; "
                        f"{storage_key} {script_key}:  {record.value}")
                    continue

                self._records.setdefault(storage_key, {})
                self._records[storage_key].setdefault(script_key, {})

                ls_record = LocalStorageRecord(
                    storage_key, script_key, value, record.seq,
                    record.state == ccl_leveldb.KeyState.Live)
                self._records[storage_key][script_key][record.seq] = ls_record
                self._flat_items.append(ls_record)

        # Freeze the lookups so later accessors can't mutate them.
        self._storage_details = types.MappingProxyType(self._storage_details)
        self._records = types.MappingProxyType(self._records)

        self._all_storage_keys = frozenset(
            self._storage_details.keys()
            | self._records.keys())  # because deleted data.
        # Sort by leveldb sequence number so batch chains can be walked in order.
        self._flat_items.sort(key=lambda x: x.leveldb_seq_number)

        # organise batches - this is made complex and slow by having to account for missing/deleted data
        # we're looking for a StorageMetadata followed by sequential (in terms of seq number) LocalStorageRecords
        # with the same storage key. Everything that falls within that chain can safely be considered a batch.
        # Any break in sequence numbers or storage key is a fail and can't be considered part of a batch.
        self._batches = {}
        current_meta: typing.Optional[StorageMetadata] = None
        current_end = 0  # seq number of the last item accepted into the current chain
        for item in self._flat_items:  # pre-sorted
            if isinstance(item, LocalStorageRecord):
                if current_meta is None:
                    # no currently valid metadata so we can't attribute this record to anything
                    continue
                elif item.leveldb_seq_number - current_end != 1 or item.storage_key != current_meta.storage_key:
                    # this record breaks a chain, so bundle up what we have and clear everything out
                    self._batches[
                        current_meta.leveldb_seq_number] = LocalStorageBatch(
                            current_meta, current_end)
                    current_meta = None
                    current_end = 0
                else:
                    # contiguous and right storage key, include in the current chain
                    current_end = item.leveldb_seq_number
            elif isinstance(item, StorageMetadata):
                if current_meta is not None:
                    # this record breaks a chain, so bundle up what we have, set new start
                    self._batches[
                        current_meta.leveldb_seq_number] = LocalStorageBatch(
                            current_meta, current_end)
                current_meta = item
                current_end = item.leveldb_seq_number
            else:
                # _flat_items should only ever hold the two types above.
                raise ValueError

        # Close out the final chain, if one was still open.
        if current_meta is not None:
            self._batches[current_meta.leveldb_seq_number] = LocalStorageBatch(
                current_meta, current_end)

        # Sorted tuple of batch start seq numbers for bisect-style lookups.
        self._batch_starts = tuple(sorted(self._batches.keys()))