Example #1
0
def compare(first_data_set: DataSet, second_data_set: DataSet) -> tuple:
    """
    Count the metadata entries that are
    exclusive to each of the given data sets.
    The metadata structures of all data points
    of a data set are pooled into a single set
    first, so repeated entries count only once.
    Both counts are 0 if the pooled metadata
    of the two data sets is identical.
    :param first_data_set: DataSet - Data points
        and corresponding metadata structures.
    :param second_data_set: DataSet - Data points
        and corresponding metadata structures.
    :return: (int, int) - Number of metadata
        entries found only in the first data
        set, followed by the number of entries
        found only in the second data set.
    :raise ValueError: In case either argument
        is not a DataSet instance.
    """
    # Reject the call unless both
    # arguments are DataSet instances.
    if not (isinstance(first_data_set, DataSet)
            and isinstance(second_data_set, DataSet)):
        raise ValueError

    def pool_metadata(data_set):
        # Flatten the metadata structures of
        # all data points into one set.
        return {
            entry
            for index in range(len(data_set))
            for entry in data_set.get_metadata_at_index(index)
        }

    entries_of_first = pool_metadata(first_data_set)
    entries_of_second = pool_metadata(second_data_set)

    # Entries that show up on one side only.
    only_in_first = entries_of_first - entries_of_second
    only_in_second = entries_of_second - entries_of_first

    return len(only_in_first), len(only_in_second)
Example #2
0
def _get_clustered_data_set(data_set: DataSet, clusters: list) -> DataSet:
    """
    Build a new data set holding one data point
    and one metadata structure per cluster.
    Each cluster is represented by the mean of its
    member vectors, and its metadata is obtained by
    merging the members' metadata structures one by
    one via DataSet.merge_metadata, starting from
    an empty set.
    :param data_set: DataSet - Data points
        and corresponding metadata structures.
    :param clusters: list(list) - Data point
        indices contained in each cluster.
    :return: DataSet - Clustered data points
        and corresponding metadata structures.
    """
    # One entry per cluster, kept in cluster order.
    representatives = []
    merged_metadata = []

    for member_indices in clusters:
        # The mean of the member vectors
        # acts as the cluster's data point.
        member_vectors = [
            data_set.get_vector_at_index(member)
            for member in member_indices
        ]
        representatives.append(calculation.get_mean(member_vectors))

        # Fold the metadata of every member
        # into a single structure.
        combined = set()
        for member in member_indices:
            member_metadata = data_set.get_metadata_at_index(member)
            combined = DataSet.merge_metadata(combined, member_metadata)
        merged_metadata.append(combined)

    return DataSet(representatives, merged_metadata)