Example 1
def test_store_dataframes_as_dataset(store_factory, metadata_version,
                                     bound_store_dataframes):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })

    df_helper = pd.DataFrame({
        "P": np.arange(0, 10),
        "info": string.ascii_lowercase[:10]
    })

    df_list = [
        {
            "label": "cluster_1",
            "data": [("core", df.copy(deep=True)), ("helper", df_helper)],
        },
        {
            "label": "cluster_2",
            "data": [("core", df.copy(deep=True)), ("helper", df_helper)],
        },
    ]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        secondary_indices=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2

    assert "P" in dataset.indices

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    index_dct = stored_dataset.indices["P"].load(store).index_dct
    assert sorted(index_dct.keys()) == list(range(0, 10))
    assert any(
        [sorted(p) == ["cluster_1", "cluster_2"] for p in index_dct.values()])

    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["core"], store=store)
    pdt.assert_frame_equal(df, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["core"], store=store)
    pdt.assert_frame_equal(df, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["helper"], store=store)
    pdt.assert_frame_equal(df_helper, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["helper"], store=store)
    pdt.assert_frame_equal(df_helper, df_stored)
Example 2
def test_dataframe_roundtrip_empty(serialiser, store):
    df = pd.DataFrame({})
    key = serialiser.store(store, "prefix", df)
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key),
                           df)

    # Restore again; the empty frame has no columns to select, so a full restore stands in for the partial-restore check
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key),
                           df)
Example 3
def test_dataframe_roundtrip_no_rows(serialiser, store):
    df = pd.DataFrame({"a": [], "b": [], "c": []}).astype(object)
    key = serialiser.store(store, "prefix", df)
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key),
                           df)

    # Test partial restore
    pdt.assert_frame_equal(
        DataFrameSerializer.restore_dataframe(store, key, columns=["a", "c"]),
        df[["a", "c"]],
    )
Example 4
def test_missing_column(serialiser, store):
    df = pd.DataFrame({
        "a": [1, 2],
        "b": [3.0, 4.0],
        "c": ["∆", "€"],
        "d": ["#", ";"]
    })
    key = serialiser.store(store, "prefix", df)

    with pytest.raises(ValueError):
        DataFrameSerializer.restore_dataframe(store, key, columns=["a", "x"])
Example 5
def test_dataframe_roundtrip(serialiser, store):
    if serialiser in TYPE_STABLE_SERIALISERS:
        df = pd.DataFrame({
            "a": [1, 2],
            "b": [3.0, 4.0],
            "c": ["∆", "€"],
            b"d": ["#", ";"]
        })
        key = serialiser.store(store, "prefix", df)
        df.columns = [ensure_unicode_string_type(col) for col in df.columns]
    else:
        df = pd.DataFrame({
            "a": [1, 2],
            "b": [3.0, 4.0],
            "c": ["∆", "€"],
            "d": ["#", ";"]
        })
        key = serialiser.store(store, "prefix", df)

    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key),
                           df)

    # Test partial restore
    pdt.assert_frame_equal(
        DataFrameSerializer.restore_dataframe(store, key, columns=["a", "c"]),
        df[["a", "c"]],
    )

    # Test that all serialisers can ingest predicate_pushdown_to_io
    pdt.assert_frame_equal(
        DataFrameSerializer.restore_dataframe(store,
                                              key,
                                              columns=["a", "c"],
                                              predicate_pushdown_to_io=False),
        df[["a", "c"]],
    )

    # Test that all serialisers can deal with categories
    expected = df[["c", "d"]].copy()
    expected["c"] = expected["c"].astype("category")
    # Check that the dtypes match but don't care about the order of the categoricals.
    pdt.assert_frame_equal(
        DataFrameSerializer.restore_dataframe(store,
                                              key,
                                              columns=["c", "d"],
                                              categories=["c"]),
        expected,
        check_categorical=False,
    )

    # Test restore w/ empty col list
    pdt.assert_frame_equal(
        DataFrameSerializer.restore_dataframe(store, key, columns=[]), df[[]])
Example 6
def test_store_multiple_dataframes_as_partition(store, metadata_storage_format,
                                                metadata_version):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })
    df_2 = pd.DataFrame({
        "P": np.arange(0, 10),
        "info": string.ascii_lowercase[:10]
    })
    mp = MetaPartition(
        label="cluster_1",
        data={
            "core": df,
            "helper": df_2
        },
        metadata_version=metadata_version,
    )
    meta_partition = mp.store_dataframes(
        store=store,
        df_serializer=None,
        dataset_uuid="dataset_uuid",
        store_metadata=True,
        metadata_storage_format=metadata_storage_format,
    )

    expected_file = "dataset_uuid/core/cluster_1.parquet"
    expected_file_helper = "dataset_uuid/helper/cluster_1.parquet"

    assert meta_partition.files == {
        "core": expected_file,
        "helper": expected_file_helper,
    }
    assert meta_partition.label == "cluster_1"

    files_in_store = list(store.keys())
    assert len(files_in_store) == 2

    stored_df = DataFrameSerializer.restore_dataframe(store=store,
                                                      key=expected_file)
    pdt.assert_frame_equal(df, stored_df)
    files_in_store.remove(expected_file)

    stored_df = DataFrameSerializer.restore_dataframe(store=store,
                                                      key=expected_file_helper)
    pdt.assert_frame_equal(df_2, stored_df)
    files_in_store.remove(expected_file_helper)
Example 7
def test_store_single_dataframe_as_partition(store, metadata_version):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(label="test_label", data=df, metadata_version=metadata_version)

    meta_partition = mp.store_dataframes(
        store=store, df_serializer=ParquetSerializer(), dataset_uuid="dataset_uuid",
    )

    assert meta_partition.data is None

    expected_key = "dataset_uuid/table/test_label.parquet"

    assert meta_partition.file == expected_key
    assert meta_partition.label == "test_label"

    files_in_store = list(store.keys())

    expected_num_files = 1
    assert len(files_in_store) == expected_num_files
    stored_df = DataFrameSerializer.restore_dataframe(store=store, key=expected_key)
    pdt.assert_frame_equal(df, stored_df)
    files_in_store.remove(expected_key)
    assert len(files_in_store) == expected_num_files - 1
Example 8
def test_store_dataframes_as_dataset_empty_dataframe(store_factory,
                                                     metadata_version,
                                                     df_all_types,
                                                     bound_store_dataframes):
    """
    Test that writing an empty dataframe (zero rows) succeeds.
    In particular, this may fail due to overly strict schema validation.
    """
    df_empty = df_all_types.drop(0)

    assert df_empty.empty
    df_list = [df_empty]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 1

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    df_stored = DataFrameSerializer.restore_dataframe(key=next(
        iter(dataset.partitions.values())).files["table"],
                                                      store=store)
    pdt.assert_frame_equal(df_empty, df_stored)
Example 9
def test_store_single_dataframe_as_partition(store, metadata_storage_format,
                                             metadata_version, expected_key):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })
    mp = MetaPartition(label="test_label",
                       data={"core": df},
                       metadata_version=metadata_version)

    meta_partition = mp.store_dataframes(
        store=store,
        df_serializer=ParquetSerializer(),
        dataset_uuid="dataset_uuid",
        store_metadata=True,
        metadata_storage_format=metadata_storage_format,
    )

    assert len(meta_partition.data) == 0

    assert meta_partition.files == {"core": expected_key}
    assert meta_partition.label == "test_label"

    files_in_store = list(store.keys())

    expected_num_files = 1
    assert len(files_in_store) == expected_num_files
    stored_df = DataFrameSerializer.restore_dataframe(store=store,
                                                      key=expected_key)
    pdt.assert_frame_equal(df, stored_df)
    files_in_store.remove(expected_key)
    assert len(files_in_store) == expected_num_files - 1
Example 10
def test_store_single_dataframe_as_partition_no_metadata(
        store, metadata_version):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })
    mp = MetaPartition(label="test_label",
                       data={"core": df},
                       metadata_version=metadata_version)
    partition = mp.store_dataframes(
        store=store,
        df_serializer=ParquetSerializer(),
        dataset_uuid="dataset_uuid",
        store_metadata=False,
    )

    assert len(partition.data) == 0

    expected_file = "dataset_uuid/core/test_label.parquet"

    assert partition.files == {"core": expected_file}
    assert partition.label == "test_label"

    # Only the actual data file is written; no metadata file, since store_metadata=False
    files_in_store = list(store.keys())
    assert len(files_in_store) == 1

    stored_df = DataFrameSerializer.restore_dataframe(store=store,
                                                      key=expected_file)
    pdt.assert_frame_equal(df, stored_df)
Example 11
def test_store_df_to_store(store):
    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["∆", "€"]})
    dataframe_format = default_serializer()
    assert isinstance(dataframe_format, ParquetSerializer)
    key = dataframe_format.store(store, "prefix", df)
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key),
                           df)
Example 12
def test_timestamp_us(store):
    # test that a df with us precision round-trips using parquet
    ts = datetime(2000, 1, 1, 15, 23, 24, 123456)
    df = pd.DataFrame({"ts": [ts]})
    serialiser = ParquetSerializer()
    key = serialiser.store(store, "prefix", df)
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key),
                           df)
Example 13
def test_predicate_eval_string_types(serialiser, store,
                                     predicate_pushdown_to_io):
    df = pd.DataFrame({b"a": [1, 2], "b": [3.0, 4.0]})
    key = serialiser.store(store, "prefix", df)
    df.columns = [ensure_unicode_string_type(col) for col in df.columns]
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key),
                           df)

    for col in ["a", b"a", "a"]:
        predicates = [[(col, "==", 1)]]
        result_df = serialiser.restore_dataframe(
            store,
            key,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            predicates=predicates,
        )

        expected_df = df.iloc[[0], :].copy()

        pdt.assert_frame_equal(result_df.reset_index(drop=True),
                               expected_df.reset_index(drop=True))

    for col in ["b", b"b", "b"]:
        predicates = [[(col, "==", 3.0)]]
        result_df = serialiser.restore_dataframe(
            store,
            key,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            predicates=predicates,
        )

        expected_df = df.iloc[[0], :].copy()

        pdt.assert_frame_equal(result_df.reset_index(drop=True),
                               expected_df.reset_index(drop=True))

    for preds in (
        [[("a", "==", 1), ("b", "==", 3.0)]],
        [[("a", "==", 1), (b"b", "==", 3.0)]],
        [[(b"a", "==", 1), ("b", "==", 3.0)]],
    ):
        result_df = serialiser.restore_dataframe(
            store,
            key,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            predicates=preds,
        )

        expected_df = df.iloc[[0], :].copy()

        pdt.assert_frame_equal(result_df.reset_index(drop=True),
                               expected_df.reset_index(drop=True))
Example 14
def test_index_metadata(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key),
                           df)
Example 15
def test_store_dataframes_as_dataset(store_factory, metadata_version,
                                     bound_store_dataframes):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })

    df_list = [df.copy(deep=True), df.copy(deep=True)]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        secondary_indices=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2

    assert "P" in dataset.indices

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    index_dct = stored_dataset.indices["P"].load(store).index_dct
    assert sorted(index_dct.keys()) == list(range(0, 10))

    counter = 0
    for k in store.keys():
        if "parquet" in k and "indices" not in k:
            counter += 1
            df_stored = DataFrameSerializer.restore_dataframe(key=k,
                                                              store=store)
            pdt.assert_frame_equal(df, df_stored)
    assert counter == 2
Example 16
def test_store_dataframes_as_dataset_empty_dataframe(
    store_factory, metadata_version, df_all_types, bound_store_dataframes
):
    """
    Test that writing an empty dataframe (zero rows) succeeds.
    In particular, this may fail due to overly strict schema validation.
    """
    df_empty = df_all_types.drop(0)

    # Store a second table with shared columns. All shared columns must be of the same type.
    # This may fail in the presence of empty partitions if the schema validation doesn't account for it.
    df_shared_cols = df_all_types.loc[:, df_all_types.columns[:3]]
    df_shared_cols["different_col"] = "a"

    assert df_empty.empty
    df_list = [
        {
            "label": "cluster_1",
            "data": [("tableA", df_empty), ("tableB", df_shared_cols.copy(deep=True))],
        },
        {
            "label": "cluster_2",
            "data": [
                ("tableA", df_all_types),
                ("tableB", df_shared_cols.copy(deep=True)),
            ],
        },
    ]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["tableA"], store=store
    )
    pdt.assert_frame_equal(df_empty, df_stored)

    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["tableA"], store=store
    )
    # Roundtrips for type date are not type preserving
    df_stored["date"] = df_stored["date"].dt.date
    pdt.assert_frame_equal(df_all_types, df_stored)

    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["tableB"], store=store
    )
    pdt.assert_frame_equal(df_shared_cols, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["tableB"], store=store
    )
    pdt.assert_frame_equal(df_shared_cols, df_stored)
Example 17
def test_store_table_to_store(serialiser, store):
    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["∆", "€"]})
    table = pa.Table.from_pandas(df)
    key = serialiser.store(store, "prefix", table)
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key),
                           df)
Example 18
def test_load_df_from_store_unsupported_format(store):
    with pytest.raises(ValueError):
        DataFrameSerializer.restore_dataframe(store, "test.unknown")
Example 19
def test_filter_query_predicate_exclusion(store):
    with pytest.raises(ValueError):
        DataFrameSerializer.restore_dataframe(store,
                                              "test.parquet",
                                              predicates=[[("a", "==", 1)]],
                                              filter_query="True")
Example 20
    def load_dataframes(
        self,
        store: KeyValueStore,
        columns: Optional[Sequence[str]] = None,
        predicate_pushdown_to_io: bool = True,
        categoricals: Optional[Sequence[str]] = None,
        dates_as_object: bool = True,
        predicates: PredicatesType = None,
    ) -> "MetaPartition":
        """
        Load the dataframes of the partitions from store into memory.

        Parameters
        ----------
        tables
            If a list is supplied, only the given tables of the partition are
            loaded. If the given table does not exist it is ignored.

            Examples

            .. code::

                >>> part = MetaPartition(
                ...     label='part_label',
                ...     files={
                ...         'core': 'core_key_in_store',
                ...         'helper': 'helper_key_in_store'
                ...     }
                ...  )
                >>> part.data
                    {}
                >>> part = part.load_dataframes(store, ['core'])
                >>> part.data
                    {
                        'core': pd.DataFrame()
                    }

        """

        if categoricals is None:
            categoricals = []
        if not dates_as_object:
            warnings.warn(
                "The argument `date_as_object` is set to False. This argument will be deprecated and the future behaviour will be as if the paramere was set to `True`. Please migrate your code accordingly ahead of time.",
                DeprecationWarning,
            )

        LOGGER.debug("Loading internal dataframes of %s", self.label)
        if not self.file:
            # This used to raise, but the specs do not require this, so simply do a no op
            LOGGER.debug("Partition %s is empty and has no data.", self.label)
            return self
        predicates = _combine_predicates(predicates, self.logical_conjunction)
        predicates = _predicates_to_named(predicates)

        dataset_uuid, _, indices, _ = decode_key(self.file)

        # In case the columns only refer to the partition indices, we need to load at least a single column to
        # determine the length of the required dataframe.
        table_columns_to_io = columns

        filtered_predicates = predicates

        self = self.load_schema(dataset_uuid=dataset_uuid, store=store)

        # Filter predicates that would apply to this partition and remove the partition columns
        if predicates:
            # Check if there are predicates that match the partition columns.
            # For these we need to check if the partition columns already falsify
            # the condition.
            #
            # We separate these predicates into their index and their Parquet part.
            (
                split_predicates,
                has_index_condition,
            ) = self._split_predicates_in_index_and_content(predicates)

            filtered_predicates = []
            if has_index_condition:
                filtered_predicates = self._apply_partition_key_predicates(
                    indices, split_predicates)
            else:
                filtered_predicates = [
                    pred.content_part for pred in split_predicates
                ]

        # Remove partition_keys from table_columns_to_io
        if self.partition_keys and table_columns_to_io is not None:
            keys_to_remove = set(
                self.partition_keys) & set(table_columns_to_io)
            # This is done to not change the ordering of the list
            table_columns_to_io = [
                c for c in table_columns_to_io if c not in keys_to_remove
            ]

        start = time.time()
        df = DataFrameSerializer.restore_dataframe(
            key=self.file,
            store=store,
            columns=table_columns_to_io,
            categories=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            predicates=filtered_predicates,
            date_as_object=dates_as_object,
        )
        LOGGER.debug("Loaded dataframe %s in %s seconds.", self.file,
                     time.time() - start)
        # Metadata version >=4 parse the index columns and add them back to the dataframe

        df = self._reconstruct_index_columns(
            df=df,
            key_indices=indices,
            columns=columns,
            categories=categoricals,
            date_as_object=dates_as_object,
        )

        df.columns = df.columns.map(ensure_string_type)
        if columns is not None:
            # TODO: When the write-path ensures that all partitions have the same column set, this check can be
            #       moved before `DataFrameSerializer.restore_dataframe`. At the position of the current check we
            #       may want to double check the columns of the loaded DF and raise an exception indicating an
            #       inconsistent dataset state instead.
            missing_cols = set(columns).difference(df.columns)
            if missing_cols:
                raise ValueError(
                    "Columns cannot be found in stored dataframe: {}".format(
                        ", ".join(sorted(missing_cols))))

            if list(df.columns) != columns:
                df = df.reindex(columns=columns, copy=False)

        return self.copy(data=df)
Example 21
def test_store_dataframes_as_dataset_batch_mode(
    store_factory, metadata_version, bound_store_dataframes
):
    values_p1 = [1, 2, 3]
    values_p2 = [4, 5, 6]
    df = pd.DataFrame({"P": values_p1})
    df2 = pd.DataFrame({"P": values_p2})

    df_list = [
        [
            {
                "label": "cluster_1",
                "data": [("core", df)],
                "indices": {
                    "P": ExplicitSecondaryIndex(
                        "P", {v: ["cluster_1"] for v in values_p1}
                    )
                },
            },
            {
                "label": "cluster_2",
                "data": [("core", df2)],
                "indices": {
                    "P": ExplicitSecondaryIndex(
                        "P", {v: ["cluster_2"] for v in values_p2}
                    )
                },
            },
        ]
    ]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store(
        "dataset_uuid", store
    ).load_all_indices(store)
    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["core"], store=store
    )
    pdt.assert_frame_equal(df, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["core"], store=store
    )
    pdt.assert_frame_equal(df2, df_stored)

    assert stored_dataset.indices["P"].to_dict() == {
        1: np.array(["cluster_1"], dtype=object),
        2: np.array(["cluster_1"], dtype=object),
        3: np.array(["cluster_1"], dtype=object),
        4: np.array(["cluster_2"], dtype=object),
        5: np.array(["cluster_2"], dtype=object),
        6: np.array(["cluster_2"], dtype=object),
    }