def store(self, store: StoreInput, dataset_uuid: str) -> str:
    """
    Store the index as a parquet file.

    If compatible, the new key name will be the name stored under the attribute
    `index_storage_key`. If this attribute is None, a new key will be generated of the
    format `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`,
    where the timestamp has nanosecond accuracy and is created upon Index object
    initialization.

    Parameters
    ----------
    store:
        Store to write the index file to.
    dataset_uuid:
        UUID of the dataset the index belongs to.

    Returns
    -------
    str
        The key the index was stored under.
    """
    storage_key = None

    store = ensure_store(store)

    if (
        self.index_storage_key is not None
        and dataset_uuid
        and dataset_uuid in self.index_storage_key
    ):
        storage_key = self.index_storage_key
    if storage_key is None:
        storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
            dataset_uuid=dataset_uuid,
            suffix=naming.EXTERNAL_INDEX_SUFFIX,
            column=quote(self.column),
            timestamp=quote(self.creation_time.isoformat()),
        )

    # The arrow representation of index_dct requires a large amount of memory because
    # strings are duplicated and flattened into the buffer. To avoid a high peak memory
    # usage, split the index_dct into chunks and convert only one chunk at a time to arrow.
    parts_iter = partition_all(10_000, self.index_dct.items())

    # Fetch the first table explicitly because its schema is required for the ParquetWriter.
    try:
        table = _index_dct_to_table(dict(next(parts_iter)), self.column, self.dtype)
    except StopIteration:
        # index_dct was empty, just pass it entirely
        table = _index_dct_to_table(self.index_dct, self.column, self.dtype)

    buf = pa.BufferOutputStream()
    with pq.ParquetWriter(buf, schema=table.schema) as writer:
        writer.write_table(table)
        del table
        for part in parts_iter:
            writer.write_table(
                _index_dct_to_table(dict(part), self.column, self.dtype)
            )

    store.put(storage_key, buf.getvalue().to_pybytes())
    return storage_key
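# A minimal usage sketch for the `store` method above, assuming kartothek's
# `ExplicitSecondaryIndex` class (which provides this method) and an in-memory
# storefact store. The column name, index contents, and dataset UUID are made-up
# example values, not anything prescribed by the method itself.
from storefact import get_store_from_url

from kartothek.core.index import ExplicitSecondaryIndex

example_store = get_store_from_url("hmemory://")  # simple in-memory key-value store
example_index = ExplicitSecondaryIndex(
    column="country",
    index_dct={"DE": ["part_1"], "US": ["part_2"]},  # index value -> partition labels
)
# Writes a single parquet file and returns its storage key, e.g.
# "example_uuid/indices/country/<timestamp>.by-dataset-index.parquet"
index_key = example_index.store(store=example_store, dataset_uuid="example_uuid")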
def store_dataset_from_partitions(
    partition_list,
    store: StoreInput,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = ensure_store(store)

    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
    else:
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )

        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    dataset_builder.explicit_partitions = True

    dataset_builder.table_meta = persist_common_metadata(
        partition_list, update_dataset, store, dataset_uuid
    )

    # We can only check for non-unique partition labels here, and if they occur we will
    # fail hard. The resulting dataset may be corrupted, or files may be left in the
    # store without dataset metadata.
    partition_labels = partition_labels_from_mps(partition_list)
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".format(
                ", ".join(non_unique_labels)
            )
        )

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(
        dataset_builder, metadata_merger, partition_list, dataset_metadata
    )
    dataset_builder = update_partitions(
        dataset_builder, partition_list, remove_partitions
    )
    dataset_builder = update_indices(
        dataset_builder, store, partition_list, remove_partitions
    )

    # Persist the dataset metadata in the requested storage format.
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )

    dataset = dataset_builder.to_dataset()
    return dataset
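# A small illustrative sketch of the `store: StoreInput` parameter used above, assuming
# that `ensure_store` (from kartothek.core.utils) accepts a storefact URL string, a
# zero-argument store factory, or an already constructed simplekv store and normalises
# each of them to a `KeyValueStore`. The "hmemory://" URL is only an example value.
from functools import partial

from storefact import get_store_from_url

from kartothek.core.utils import ensure_store

store_from_url = ensure_store("hmemory://")  # URL string
store_from_factory = ensure_store(partial(get_store_from_url, "hmemory://"))  # factory
store_from_object = ensure_store(get_store_from_url("hmemory://"))  # store instance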
def store_dataset_from_partitions(
    partition_list,
    store: StoreInput,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = ensure_store(store)

    schemas = set()
    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
        table_name = update_dataset.table_name
        schemas.add(update_dataset.schema)
    else:
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )

        table_name = mp.table_name
        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    # Collect the schemas of all new partitions so a common schema can be persisted.
    for mp in partition_list:
        if mp.schema:
            schemas.add(mp.schema)

    dataset_builder.schema = persist_common_metadata(
        schemas=schemas,
        update_dataset=update_dataset,
        store=store,
        dataset_uuid=dataset_uuid,
        table_name=table_name,
    )

    # We can only check for non-unique partition labels here, and if they occur we will
    # fail hard. The resulting dataset may be corrupted, or files may be left in the
    # store without dataset metadata.
    partition_labels = partition_labels_from_mps(partition_list)

    # This could be safely removed since we no longer allow the user to set the labels.
    # It has implications on tests if mocks are used.
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".format(
                ", ".join(non_unique_labels)
            )
        )

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(dataset_builder, metadata_merger, dataset_metadata)
    dataset_builder = update_partitions(
        dataset_builder, partition_list, remove_partitions
    )
    dataset_builder = update_indices(
        dataset_builder, store, partition_list, remove_partitions
    )

    # Persist the dataset metadata in the requested storage format.
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )

    dataset = dataset_builder.to_dataset()
    return dataset
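# An end-to-end sketch for the second, single-table variant of
# `store_dataset_from_partitions` above. It assumes kartothek's `MetaPartition` API
# with `store_dataframes` for writing the payload files first; the dataset UUID,
# partition label, and DataFrame contents are made-up example values.
import pandas as pd
from storefact import get_store_from_url

from kartothek.io_components.metapartition import MetaPartition

sketch_store = get_store_from_url("hmemory://")

# Write the parquet payload first; store_dataset_from_partitions only persists the
# dataset metadata, common schema, and indices for already materialised partitions.
sketch_mp = MetaPartition(
    label="part_1",
    data=pd.DataFrame({"country": ["DE", "US"], "value": [1, 2]}),
    metadata_version=4,
)
sketch_mp = sketch_mp.store_dataframes(store=sketch_store, dataset_uuid="sketch_uuid")

sketch_dataset = store_dataset_from_partitions(
    partition_list=[sketch_mp],
    store=sketch_store,
    dataset_uuid="sketch_uuid",
    metadata_storage_format="json",
)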