Beispiel #1
0
    def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
        """
        Serialize the index to a parquet file and write it to ``store``.

        The key held in the attribute `index_storage_key` is reused when it is set
        and contains ``dataset_uuid``. In every other case a new key is generated
        of the format

            `{dataset_uuid}/indices/{column}/{timestamp}{suffix}`

        where ``column`` and ``timestamp`` are passed through ``quote``, the
        timestamp is the ISO-formatted ``creation_time`` of this index and
        ``suffix`` is ``naming.EXTERNAL_INDEX_SUFFIX``.

        Parameters
        ----------
        store:
            Store the serialized index is written to via ``store.put``.
        dataset_uuid:
            Identifier of the dataset; used as the prefix of a newly generated
            key and to decide whether the existing key can be reused.

        Returns
        -------
        str
            The key under which the index was written.
        """
        existing_key = self.index_storage_key
        reuse_existing = (existing_key is not None and dataset_uuid
                          and dataset_uuid in existing_key)

        if reuse_existing:
            target_key = existing_key
        else:
            key_template = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}"
            target_key = key_template.format(
                dataset_uuid=dataset_uuid,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
                column=quote(self.column),
                timestamp=quote(self.creation_time.isoformat()),
            )

        # Write the whole index as a single parquet table into an in-memory buffer.
        index_table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
        sink = pa.BufferOutputStream()
        pq.write_table(index_table, sink)

        payload = sink.getvalue().to_pybytes()
        store.put(target_key, payload)
        return target_key
    def store(self, store: StoreInput, dataset_uuid: str) -> str:
        """
        Serialize the index chunk-wise to a parquet file and write it to ``store``.

        The key held in the attribute `index_storage_key` is reused when it is set
        and contains ``dataset_uuid``. In every other case a new key is generated
        of the format

            `{dataset_uuid}/indices/{column}/{timestamp}{suffix}`

        where ``column`` and ``timestamp`` are passed through ``quote``, the
        timestamp is the ISO-formatted ``creation_time`` of this index and
        ``suffix`` is ``naming.EXTERNAL_INDEX_SUFFIX``.

        Parameters
        ----------
        store:
            Store input; it is normalized with ``ensure_store`` and the serialized
            index is written to the result via ``put``.
        dataset_uuid:
            Identifier of the dataset; used as the prefix of a newly generated
            key and to decide whether the existing key can be reused.

        Returns
        -------
        str
            The key under which the index was written.
        """
        kv_store = ensure_store(store)

        existing_key = self.index_storage_key
        reuse_existing = (existing_key is not None and dataset_uuid
                          and dataset_uuid in existing_key)

        if reuse_existing:
            target_key = existing_key
        else:
            key_template = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}"
            target_key = key_template.format(
                dataset_uuid=dataset_uuid,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
                column=quote(self.column),
                timestamp=quote(self.creation_time.isoformat()),
            )

        def _chunk_to_table(entries):
            # Convert one batch of (value, partitions) items into an arrow table.
            return _index_dct_to_table(dict(entries), self.column, self.dtype)

        # Converting index_dct to arrow is memory hungry since strings get
        # duplicated and flattened into the buffer. Converting it in batches,
        # one batch at a time, keeps the peak memory usage low.
        chunks = partition_all(10_000, self.index_dct.items())

        # The first table is built up front: ParquetWriter needs its schema.
        head = next(chunks, None)
        if head is None:
            # index_dct was empty, so hand it over as a whole
            first_table = _index_dct_to_table(self.index_dct, self.column,
                                              self.dtype)
        else:
            first_table = _chunk_to_table(head)

        sink = pa.BufferOutputStream()
        with pq.ParquetWriter(sink, schema=first_table.schema) as writer:
            writer.write_table(first_table)
            # Release the first table before the remaining batches are converted.
            del first_table

            for chunk in chunks:
                writer.write_table(_chunk_to_table(chunk))

        payload = sink.getvalue().to_pybytes()
        kv_store.put(target_key, payload)
        return target_key