Example #1
    def setUpClass(cls):
        # Create folders.
        os.makedirs(os.path.join(source.RETRIEVAL_TESTING_BASE_PATH),
                    exist_ok=True)
        os.makedirs(os.path.join(source.RETRIEVAL_DATA_SUBFOLDER_PATH),
                    exist_ok=True)

        # Write test data.
        storage.write_data_set(
            os.path.join(source.RETRIEVAL_DATA_BASE_PATH,
                         source.FILE_DATA_NAME),
            DataSet(source.FILE_MATRIX, source.FILE_METADATA))

        storage.write_data_set(
            os.path.join(source.RETRIEVAL_DATA_FOLDER_PATH,
                         source.FOLDER_DATA_NAME),
            DataSet(source.FOLDER_MATRIX, source.FOLDER_METADATA))

        storage.write_data_set(
            os.path.join(source.RETRIEVAL_DATA_SUBFOLDER_PATH,
                         source.SUBFOLDER_FIRST_DATA_NAME),
            DataSet(source.SUBFOLDER_FIRST_MATRIX,
                    source.SUBFOLDER_FIRST_METADATA))
        storage.write_data_set(
            os.path.join(source.RETRIEVAL_DATA_SUBFOLDER_PATH,
                         source.SUBFOLDER_SECOND_DATA_NAME),
            DataSet(source.SUBFOLDER_SECOND_MATRIX,
                    source.SUBFOLDER_SECOND_METADATA))
Example #2
    def _add_second_batch_of_data_points(self, handler):
        # Execute appending operations in random order.
        operations = [0, 1, 2]
        numpy.random.shuffle(operations)
        for i in operations:
            if i == 0:
                # Append whole directory.
                handler.append_folder(source.RETRIEVAL_DATA_FOLDER_PATH)
            elif i == 1:
                # Append empty data set.
                handler.append(DataSet([], []))
            elif i == 2:
                j = numpy.random.randint(2)
                if j == 0:
                    # Append subdirectory file once again.
                    handler.append_file(
                        os.path.join(source.RETRIEVAL_DATA_SUBFOLDER_PATH,
                                     source.SUBFOLDER_FIRST_DATA_NAME))
                elif j == 1:
                    # Append whole directory once again.
                    handler.append_folder(source.RETRIEVAL_DATA_FOLDER_PATH)

        # Try to append data of wrong dimension.
        self.assertRaises(
            ValueError, handler.append,
            DataSet([source.VECTOR_3D_NUMPY], [source.VECTOR_3D_METADATA]))
Example #3
    def _query_first_batch_of_data_points(self, handler):
        query_vector = numpy.array([1.5, 1],
                                   dtype=settings.VECTOR_ENTRY_DATA_TYPE)

        expected_data_points = [
            source.FIRST_VECTOR_2D_NUMPY, source.FIRST_VECTOR_2D_REVERSED_NUMPY
        ]
        expected_metadata = [
            source.FIRST_VECTOR_2D_METADATA,
            source.FIRST_VECTOR_2D_REVERSED_METADATA
        ]
        expected_result = DataSet(expected_data_points, expected_metadata)
        retrieved_result = handler.query(query_vector, (numpy.sqrt(5) / 2))
        self.assertTrue(comparison.is_equal(expected_result, retrieved_result))

        expected_data_points = [source.FIRST_VECTOR_2D_REVERSED_NUMPY]
        expected_metadata = [source.FIRST_VECTOR_2D_REVERSED_METADATA]
        expected_result = DataSet(expected_data_points, expected_metadata)
        retrieved_result = handler.query(query_vector,
                                         numpy.sqrt(5) / 2 - 0.001)
        self.assertTrue(comparison.is_equal(expected_result, retrieved_result))

        query_vector = numpy.array([5.5, 5.5],
                                   dtype=settings.VECTOR_ENTRY_DATA_TYPE)

        expected_data_points = source.FILE_MATRIX
        expected_metadata = source.FILE_METADATA
        expected_result = DataSet(expected_data_points, expected_metadata)
        retrieved_result = handler.query(query_vector, (numpy.sqrt(2) / 2))
        self.assertTrue(comparison.is_equal(expected_result, retrieved_result))

        expected_data_points = []
        expected_metadata = []
        expected_result = DataSet(expected_data_points, expected_metadata)
        retrieved_result = handler.query(query_vector,
                                         (numpy.sqrt(2) / 2) - 0.001)
        self.assertTrue(comparison.is_equal(expected_result, retrieved_result))

        retrieved_result = handler.query(query_vector, 0)
        self.assertTrue(comparison.is_equal(expected_result, retrieved_result))
        retrieved_result = handler.query(query_vector, -1)
        self.assertTrue(comparison.is_equal(expected_result, retrieved_result))

        query_vector = source.FIRST_VECTOR_2D_NUMPY

        expected_data_points = [source.FIRST_VECTOR_2D_NUMPY]
        expected_metadata = [source.FIRST_VECTOR_2D_METADATA]
        expected_result = DataSet(expected_data_points, expected_metadata)
        retrieved_result = handler.query(query_vector, 0)
        self.assertTrue(comparison.is_equal(expected_result, retrieved_result))

        self.assertRaises(ValueError, handler.query, source.CUSTOM_STR, 2)
        self.assertRaises(ValueError, handler.query, source.CUSTOM_DICT, 2)
        self.assertRaises(ValueError, handler.query, "", 2)
        self.assertRaises(ValueError, handler.query, None, 2)
        self.assertRaises(ValueError, handler.query, [], 2)
        self.assertRaises(ValueError, handler.query, numpy.empty(0), 2)
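The queries above select every stored vector whose Euclidean distance to the query vector is at most the given radius, which is why shrinking the radius by a small epsilon drops points lying exactly on the boundary. A brute-force sketch of that semantics, independent of the handler implementation (all names and values below are illustrative):

import numpy


def radius_query(matrix, query_vector, radius):
    # Return the indices of all rows within the given radius of the query.
    return [i for i, row in enumerate(matrix)
            if numpy.linalg.norm(numpy.asarray(row) - query_vector) <= radius]


matrix = [[1.0, 2.0], [2.0, 1.0], [6.0, 6.0]]
print(radius_query(matrix, numpy.array([1.5, 1.0]), 1.2))  # prints: [0, 1]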
Example #4
    def test_add_data_set(self):
        # Create empty data set.
        data_set = DataSet([], [])

        # Test invalid data on empty data set.
        self._test_add_data_set_invalid(data_set)

        # Add data sets.
        data_set.add_data_set(
            DataSet(source.MATRIX_LIST_OF_NUMPYS, source.MATRIX_METADATA))
        data_set.add_data_set(
            DataSet(source.MATRIX_WITH_DUPLICATES_LIST_OF_NUMPYS,
                    source.MATRIX_WITH_DUPLICATES_METADATA_UNEQUAL))
        self.assertIsInstance(data_set.data_points, list)
        self.assertIsInstance(data_set.metadata, list)
        result_matrix = (source.MATRIX_LIST_OF_NUMPYS +
                         source.MATRIX_WITH_DUPLICATES_LIST_OF_NUMPYS)
        result_metadata = (source.MATRIX_METADATA +
                           source.MATRIX_WITH_DUPLICATES_METADATA_UNEQUAL)
        numpy.testing.assert_allclose(data_set.data_points, result_matrix)
        self.assertListEqual(data_set.metadata, result_metadata)

        # Test invalid data on non-empty data set.
        self._test_add_data_set_invalid(data_set)

        # Add data set of wrong shape.
        new_data_set = DataSet([source.VECTOR_3D_NUMPY], [source.VECTOR_3D_METADATA])
        self.assertRaises(ValueError, data_set.add_data_set, new_data_set)
Example #5
    def _test_init_valid(self, new_data_points, new_metadata):
        # Test with metadata.
        data_set = DataSet(new_data_points, new_metadata)
        self.assertIsInstance(data_set.data_points, list)
        self.assertIsInstance(data_set.metadata, list)
        numpy.testing.assert_allclose(data_set.data_points, new_data_points)

        # Test without metadata.
        data_set = DataSet(new_data_points, [])
        self.assertIsInstance(data_set.data_points, list)
        self.assertIsInstance(data_set.metadata, list)
        self.assertListEqual(data_set.metadata,
                             [set() for _i in range(len(new_data_points))])
Example #6
    def process_file(self,
                     file_path: str,
                     num_skipped_frames: int = -1) -> DataSet:
        """
        Extract all faces in given image or video file
        and compute corresponding description vectors.
        :param file_path: str - Path to image or video file.
        :param num_skipped_frames: int - Number of skipped
            frames between two consecutive samples. In case
            of a negative number, this value is adjusted to
            ensure a sample rate of one sample per second.
            Is ignored in case of an image (default: -1).
        :return: DataSet - Data points and
            corresponding metadata structures.
            Return empty data set if no vectors
            could be retrieved.
        """
        # Make sure that given file path is an
        # absolute path and not a relative one.
        file_path = os.path.abspath(file_path)
        # Check if given file is an image or a video.
        if image.is_valid_image_file(file_path):
            vector_list, metadata_list = self._process_image(file_path)
            new_data = DataSet(vector_list, metadata_list)
            return new_data
        elif video.is_valid_video_file(file_path):
            # A negative number of skipped frames
            # between two consecutive samples is
            # invalid. In this case, use current
            # frame rate to sample once a second.
            if num_skipped_frames < 0:
                frame_rate = video.get_frame_rate(file_path)
                # Check if frame rate could be retrieved.
                if frame_rate > 0:
                    # Frame rate could be retrieved.
                    num_skipped_frames = frame_rate - 1
                else:
                    # Frame rate is invalid, so fall back
                    # to default defined in settings module.
                    settings.LOGGER.warning(
                        "Invalid number of frames between two samples. Use {} instead."
                        .format(settings.NUM_SKIPPED_FRAMES))
                    num_skipped_frames = settings.NUM_SKIPPED_FRAMES
            vector_list, metadata_list = self._process_video(
                file_path, num_skipped_frames)
            new_data_set = DataSet(vector_list, metadata_list)
            return new_data_set
        else:
            # Error: file is invalid.
            settings.LOGGER.warning("Invalid file: {}".format(file_path))
            return DataSet(list(), list())
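A negative num_skipped_frames is translated into one sample per second: for a video with frame rate f, skipping f - 1 frames between two consecutive reads yields roughly one processed frame per second. A minimal sketch of that relationship (the helper name and fallback value are illustrative, not part of the module above):

def skipped_frames_for_one_sample_per_second(frame_rate, fallback=24):
    # One sample per second means skipping (frame_rate - 1) frames between reads.
    if frame_rate <= 0:
        # Frame rate could not be determined; fall back to an assumed default.
        return fallback
    return int(frame_rate) - 1


assert skipped_frames_for_one_sample_per_second(30) == 29
assert skipped_frames_for_one_sample_per_second(0) == 24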
Example #7
def compare(first_data_set: DataSet, second_data_set: DataSet) -> tuple:
    """
    Compare given data sets by computing the
    number of metadata entries present in first
    but not in second data set and vice versa.
    Return both values as a tuple of integers.
    Intuitively, both numbers are 0 in case
    of strictly identical metadata structures.
    :param first_data_set: DataSet - Data points
        and corresponding metadata structures.
    :param second_data_set: DataSet - Data points
        and corresponding metadata structures.
    :return: (int, int) - Number of metadata
        elements present in first but not
        in second data set (and vice versa).
    :raise ValueError: In case one
        DataSet instance is missing.
    """
    # Check if first data structure is
    # actually not a DataSet instance.
    if not isinstance(first_data_set, DataSet):
        raise ValueError

    # Check if second data structure is
    # actually not a DataSet instance.
    if not isinstance(second_data_set, DataSet):
        raise ValueError

    # Collect metadata entries of first
    # data set in a single set structure.
    first_metadata = set()
    for i in range(len(first_data_set)):
        metadata = first_data_set.get_metadata_at_index(i)
        first_metadata.update(metadata)

    # Collect metadata entries of second
    # data set in a single set structure.
    second_metadata = set()
    for i in range(len(second_data_set)):
        metadata = second_data_set.get_metadata_at_index(i)
        second_metadata.update(metadata)

    num_unique_elements_in_first_metadata = len(
        first_metadata.difference(second_metadata))
    num_unique_elements_in_second_metadata = len(
        second_metadata.difference(first_metadata))

    return num_unique_elements_in_first_metadata, num_unique_elements_in_second_metadata
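Stripped of the DataSet wrapper, the two return values are simply the sizes of the set differences over all collected metadata entries. A small illustration with plain Python sets (the entries here are made-up strings):

first_metadata = {"a.jpg", "b.jpg", "c.jpg"}
second_metadata = {"b.jpg", "d.jpg"}

# 2 entries occur only in the first set, 1 entry only in the second one.
print(len(first_metadata - second_metadata),
      len(second_metadata - first_metadata))  # prints: 2 1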
Example #8
    def _add_first_batch_of_data_points(self, handler):
        # Execute appending operations in random order.
        operations = [0, 1, 2, 3, 4]
        numpy.random.shuffle(operations)
        for i in operations:
            if i == 0:
                # Append single file.
                handler.append_file(
                    os.path.join(source.RETRIEVAL_DATA_BASE_PATH,
                                 source.FILE_DATA_NAME))
            elif i == 1:
                # Append data points currently in memory.
                handler.append(
                    DataSet(source.MATRIX_LIST_OF_NUMPYS,
                            source.MATRIX_METADATA))
            elif i == 2:
                # Append empty data set.
                handler.append(DataSet([], []))
            elif i == 3:
                # Append invalid data.
                self.assertRaises(ValueError, handler.append_file,
                                  source.CUSTOM_DICT)
                self.assertRaises(ValueError, handler.append_file,
                                  source.CUSTOM_STR)
                self.assertRaises(ValueError, handler.append_file, "")
                self.assertRaises(ValueError, handler.append_file, None)
                self.assertRaises(ValueError, handler.append_folder,
                                  source.CUSTOM_DICT)
                self.assertRaises(ValueError, handler.append_folder,
                                  source.CUSTOM_STR)
                self.assertRaises(ValueError, handler.append_folder, "")
                self.assertRaises(ValueError, handler.append_folder, None)
                self.assertRaises(ValueError, handler.append,
                                  source.CUSTOM_DICT)
                self.assertRaises(ValueError, handler.append,
                                  source.CUSTOM_STR)
                self.assertRaises(ValueError, handler.append, "")
                self.assertRaises(ValueError, handler.append, None)
            elif i == 4:
                # Append single file once again.
                handler.append_file(
                    os.path.join(source.RETRIEVAL_DATA_BASE_PATH,
                                 source.FILE_DATA_NAME))

        # Try to append data of wrong dimension.
        self.assertRaises(
            ValueError, handler.append,
            DataSet([source.VECTOR_3D_NUMPY], [source.VECTOR_3D_METADATA]))
Example #9
    def _test_add_data_set_invalid(self, data_set: DataSet):
        # Store old data set.
        old_data_points = numpy.copy(data_set.data_points)
        old_metadata = copy.deepcopy(data_set.metadata)

        # Test empty matrices.
        data_set.add_data_set(DataSet([], []))
        self.assertIsInstance(data_set.data_points, list)
        self.assertIsInstance(data_set.metadata, list)
        numpy.testing.assert_allclose(data_set.data_points, old_data_points)
        self.assertListEqual(data_set.metadata, old_metadata)

        # Test other data types.
        self.assertRaises(ValueError, data_set.add_data_set, source.CUSTOM_DICT)
        self.assertRaises(ValueError, data_set.add_data_set, source.CUSTOM_STR)
        self.assertRaises(ValueError, data_set.add_data_set, None)
Example #10
def get_all_data_sets(root_dir: str) -> DataSet:
    """
    Retrieve all data points in specified
    directory (or its subdirectories).
    :param root_dir: str - Path to directory.
    :return: DataSet - Retrieved data points
        and corresponding metadata structures.
    """
    # Get all data set file paths in
    # specified directory (or its subdirectories).
    data_set_file_paths = get_all_data_set_file_paths(root_dir)
    # Retrieve corresponding data sets and
    # concatenate them to a single DataSet instance.
    new_data_set = DataSet([], [])
    for file_path in data_set_file_paths:
        current_data_set = read_data_set(file_path)
        new_data_set.add_data_set(current_data_set)
    return new_data_set
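A typical call simply points the function at a root directory and receives one merged DataSet back; a short usage sketch (the directory path is made up):

combined = get_all_data_sets("/tmp/retrieval_data")
print(len(combined), "data points loaded from all stored data sets")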
Example #11
    def _test_cluster(self,
                      source_matrix,
                      source_metadata,
                      threshold,
                      target_matrix,
                      target_metadata):
        # Test data as it is.
        given_data_set = DataSet(source_matrix, source_metadata)
        received_data_set = clustering.cluster(given_data_set, threshold)
        self.assertIsInstance(received_data_set, DataSet)
        numpy.testing.assert_allclose(received_data_set.data_points, target_matrix)
        for i in range(len(received_data_set)):
            self.assertSetEqual(received_data_set.get_metadata_at_index(i),
                                target_metadata[i])

        # Test empty metadata.
        received_data_set = clustering.cluster(DataSet(source_matrix, []), threshold)
        self.assertIsInstance(received_data_set, DataSet)
        numpy.testing.assert_allclose(received_data_set.data_points, target_matrix)
        for i in range(len(received_data_set)):
            self.assertSetEqual(received_data_set.get_metadata_at_index(i), set())
Example #12
    def _test_deduplicate(self, source_matrix, source_metadata, target_matrix,
                          target_metadata, sort_by_memory_address):
        source_data_set = DataSet(source_matrix, source_metadata)
        copy_source_data_set = copy.deepcopy(source_data_set)

        target_data_set = DataSet(target_matrix, target_metadata)

        result_data_set = deduplication.deduplicate(
            copy_source_data_set,
            sort_by_memory_address=sort_by_memory_address)

        if len(source_data_set) > 0:
            self.assertIsNot(copy_source_data_set, result_data_set)
            self.assertIsNot(copy_source_data_set.data_points,
                             result_data_set.data_points)
            self.assertIsNot(copy_source_data_set.metadata,
                             result_data_set.metadata)
            self.assertTrue(
                comparison.is_equal(target_data_set, result_data_set))
        else:
            self.assertEqual(len(result_data_set), 0)
Example #13
def _deduplicate_by_sorting(data_set: DataSet, callable_comparator) -> DataSet:
    """
    Find and remove duplicates in given data
    set merging corresponding metadata structures.
    Sort data set with specified comparator in
    order to identify duplicated entries.
    :param data_set: DataSet - Data points and
        corresponding metadata structures.
    :param callable_comparator: Callable
        comparator function.
    :return: DataSet - Data set without duplicates.
    """
    # Use Python's zip() function to concatenate the vector
    # and metadata lists to a single list of tuples
    # mapping each vector to its metadata structure.
    data = zip(data_set.data_points, data_set.metadata)
    # Sort this list using specified comparator.
    sorted_data = sorted(data, key=functools.cmp_to_key(callable_comparator))
    # Create new lists to store unique data
    # points and corresponding metadata structures.
    new_data_points = list()
    new_metadata = list()
    # Iterate through sorted list
    # considering equal entries only once.
    last_entry = None
    for entry in sorted_data:
        if last_entry is None or callable_comparator(entry, last_entry) != 0:
            # Current data point was never seen before,
            # so append it to list of unique data points.
            new_data_points.append(entry[0])
            new_metadata.append(entry[1])
            last_entry = entry
        else:
            # Current data point is already present,
            # therefore only merge its metadata.
            new_metadata[-1] = DataSet.merge_metadata(new_metadata[-1], entry[1])
    # Return data set without duplicates.
    return DataSet(new_data_points, new_metadata)
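The callable_comparator argument receives two (vector, metadata) tuples and must return a negative, zero, or positive number, so that identical vectors end up adjacent after sorting and the loop above can merge their metadata in a single pass. A sketch of one possible comparator, ordering entries lexicographically by their vector entries (illustrative only, not necessarily the comparator used elsewhere in the code base):

import functools

import numpy


def _compare_entries(first_entry, second_entry):
    # Compare the vectors of two (vector, metadata) tuples element-wise.
    for a, b in zip(first_entry[0], second_entry[0]):
        if a < b:
            return -1
        if a > b:
            return 1
    return 0


entries = [(numpy.array([1.0, 2.0]), {"x"}),
           (numpy.array([0.0, 5.0]), {"y"}),
           (numpy.array([1.0, 2.0]), {"z"})]
ordered = sorted(entries, key=functools.cmp_to_key(_compare_entries))
# The two [1.0, 2.0] entries are now adjacent and would be merged above.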
Example #14
def _get_clustered_data_set(data_set: DataSet, clusters: list) -> DataSet:
    """
    Combine vectors within each cluster to a single
    data point and corresponding metadata structure.
    Use mean vector as the cluster's representative
    and all metadata structures excluding duplicates.
    :param data_set: DataSet - Data points
        and corresponding metadata structures.
    :param clusters: list(list) - Data point
        indices contained in each cluster.
    :return: DataSet - Clustered data points
        and corresponding metadata structures.
    """
    # Create lists to store compressed vectors
    # and corresponding metadata structures.
    new_data_points = list()
    new_metadata = list()

    for index_list in clusters:
        # Compress data points in current cluster
        # by computing corresponding mean vector.
        current_data_points = [
            data_set.get_vector_at_index(i) for i in index_list
        ]
        mean_vector = calculation.get_mean(current_data_points)
        new_data_points.append(mean_vector)

        # Compress metadata structures in current
        # cluster by using corresponding merge function.
        current_metadata_structure = set()
        for i in index_list:
            current_metadata_structure = DataSet.merge_metadata(
                current_metadata_structure, data_set.get_metadata_at_index(i))
        new_metadata.append(current_metadata_structure)

    # Return compressed data.
    return DataSet(new_data_points, new_metadata)
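Each cluster is represented by the element-wise mean of its member vectors. Assuming calculation.get_mean behaves like numpy.mean over axis 0, the core of the compression step looks like this standalone sketch:

import numpy

# Two clusters over four 2D points, given as lists of point indices.
points = [numpy.array([1.0, 1.0]), numpy.array([1.0, 3.0]),
          numpy.array([9.0, 9.0]), numpy.array([9.0, 11.0])]
clusters = [[0, 1], [2, 3]]

representatives = [numpy.mean([points[i] for i in cluster], axis=0)
                   for cluster in clusters]
# representatives == [array([1., 2.]), array([9., 10.])]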
Example #15
    def _query_second_batch_of_data_points(self, handler):
        query_vector = numpy.array([7.5, 7.5],
                                   dtype=settings.VECTOR_ENTRY_DATA_TYPE)

        expected_data_points = source.FOLDER_MATRIX
        expected_metadata = source.FOLDER_METADATA
        expected_result = DataSet(expected_data_points, expected_metadata)
        retrieved_result = handler.query(query_vector, (numpy.sqrt(2) / 2))
        self.assertTrue(comparison.is_equal(expected_result, retrieved_result))

        expected_data_points = []
        expected_metadata = []
        expected_result = DataSet(expected_data_points, expected_metadata)
        retrieved_result = handler.query(query_vector,
                                         (numpy.sqrt(2) / 2) - 0.001)
        self.assertTrue(comparison.is_equal(expected_result, retrieved_result))

        query_vector = source.SUBFOLDER_SECOND_MATRIX[0]

        expected_data_points = source.SUBFOLDER_SECOND_MATRIX
        expected_metadata = source.SUBFOLDER_SECOND_METADATA
        expected_result = DataSet(expected_data_points, expected_metadata)
        retrieved_result = handler.query(query_vector, 0)
        self.assertTrue(comparison.is_equal(expected_result, retrieved_result))
Example #16
    def _process_image(self, img_file_path: str) -> tuple:
        """
        Extract all faces in given image file and
        compute corresponding description vectors.
        :param img_file_path: str - Path to image file.
        :return: (list, list) - List of data
            points and another list of
            corresponding metadata structures.
            Return empty lists in case no
            description vectors could be retrieved.
        """
        # Read specified image file.
        img_bgr = image.read_image_from_file(img_file_path)
        if img_bgr.size < 1:
            # Error: image file could not be
            # read, therefore return empty lists
            # to indicate that no description
            # vectors could be retrieved.
            return list(), list()

        # Convert retrieved image to RGB color encoding.
        img_rgb = image.swap_color_encoding(img_bgr)
        # Get description vector and
        # bounding box coordinates of all
        # detected faces in current image.
        face_descriptions = self._get_face_descriptions(img_rgb)
        # Create empty list to store description vectors.
        vector_list = list()
        # Create empty list to store metadata structures.
        metadata_list = list()
        # Fill both lists with corresponding data of detected faces.
        for face_data_tuple in face_descriptions:
            # Extract and store description vector of current face.
            vector = face_data_tuple[0]
            vector_list.append(vector)
            # Extract and store metadata of current face.
            top_left = face_data_tuple[1]
            bottom_right = face_data_tuple[2]
            # Create metadata structure for current
            # face in image file. In this case, this
            # is a tuple containing file name and top
            # left / bottom right bounding box corners.
            img_metadata = DataSet.create_metadata(
                os.path.basename(img_file_path), top_left, bottom_right)
            metadata_list.append(img_metadata)
        return vector_list, metadata_list
Example #17
def read_data_set(storage_file_path: str) -> DataSet:
    """
    Read DataSet instance from storage folder.
    Use given file path and append ".npy" for
    matrix data and ".dat" for corresponding
    metadata before reading it from defined
    directory.
    :param storage_file_path: str - Path to storage
        file (ignoring file name extension).
    :return: DataSet - Retrieved data points
        and corresponding metadata.
        Return empty DataSet instance
        in case of an error.
    """
    # Check if specified path is not a string.
    if not isinstance(storage_file_path, str):
        return DataSet(list(), list())

    # Check if specified path is actually a directory.
    if os.path.isdir(storage_file_path):
        return DataSet(list(), list())

    # Create path to data point file.
    data_point_file_path = storage_file_path + DATA_POINT_FILE_NAME_EXTENSION
    # Check if specified file exists.
    if not os.path.isfile(data_point_file_path):
        # Error: data points file does not exist,
        # therefore return empty DataSet instance.
        return DataSet(list(), list())
    # Read data points.
    data_points = read_data_structure_from_file(data_point_file_path,
                                                use_numpy=True)
    if data_points is None:
        # Error: data point file could not be read.
        return DataSet([], [])
    else:
        # Data points could be read,
        # so convert them to a list.
        data_points = list(data_points)

    # Create path to metadata file.
    metadata_file_path = storage_file_path + METADATA_FILE_NAME_EXTENSION
    # Read metadata (if possible).
    metadata = read_data_structure_from_file(metadata_file_path)
    if metadata is None:
        # Error: metadata file could not be read.
        return DataSet(data_points, list())

    # Return retrieved data set.
    return DataSet(data_points, metadata)
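According to the docstring, a stored data set occupies two sibling files sharing one base path: "<base>.npy" for the matrix and "<base>.dat" for the metadata. A usage sketch (the base path is made up; errors yield an empty DataSet rather than an exception):

# Looks for "/tmp/retrieval_data/faces.npy" and "/tmp/retrieval_data/faces.dat".
data_set = read_data_set("/tmp/retrieval_data/faces")
if len(data_set) == 0:
    print("Nothing stored at that path (or the files could not be read).")
else:
    print("Loaded", len(data_set), "data points.")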
Example #18
def _chinese_whispers_clustering(data_set: DataSet,
                                 distance_threshold: float,
                                 num_iterations: int = 30) -> DataSet:
    """
    Cluster given data set in a way that only a
    single data point within groups of nearby points
    is kept. Use a temporary graph structure with
    one node per data point and add an unweighted
    edge between every pair of nodes with specified
    maximum distance. Search for groups of nodes
    with many pairwise edges using Chinese Whispers
    graph clustering. This algorithm works in three
    steps:
        1. Assign each node a different label.
        2. Iterate through all nodes assigning
            each node the label used for the
            majority of its neighbors.
        3. Repeat second step a specified number
            of times (use 30 iterations as
            indicated in original paper).
    See Chris Biemann: "Chinese Whispers - an Efficient Graph
    Clustering Algorithm and its Application to Natural
    Language Processing Problems" (2006) for details.
    :param data_set: DataSet - Data points
        and corresponding metadata structures.
    :param distance_threshold: float - Maximum
        distance between two data points in
        order to consider them as candidates
        for the same cluster.
    :param num_iterations: int - Maximum number of
        passes through all data points (default: 30).
    :return: DataSet - Clustered data points
        and corresponding metadata structures.

    Compare with dlib implementation:
    http://dlib.net/python/index.html#dlib.chinese_whispers_clustering
    """
    # Create adjacency list representation
    # of given data set only considering
    # edges connecting data points within
    # specified distance threshold.
    edges = [[] for _i in range(len(data_set))]
    for i in range(len(data_set)):
        for j in range(i + 1, len(data_set)):
            # Get distance from vector at
            # index i to vector at index j.
            distance = calculation.get_distance(
                data_set.get_vector_at_index(i),
                data_set.get_vector_at_index(j))
            # Compare current distance to specified threshold.
            if numpy.less_equal(distance, distance_threshold):
                edges[i].append(j)
                edges[j].append(i)

    # Initialize clustering labels so that every
    # data point is assigned a different cluster.
    labels = [i for i in range(len(data_set))]

    # Initialize list to specify current data
    # point sequence, i.e. the order in which
    # data point labels are adjusted.
    sequence = [i for i in range(len(data_set))]

    for _i in range(num_iterations):
        # Create flag to track if
        # any label was adjusted.
        label_adjusted = False
        # Create random ordering of data points.
        numpy.random.shuffle(sequence)
        # Iterate through data points in this order.
        label_counter = dict()
        for current_index in sequence:
            # Count all occurrences of the
            # same label in direct neighborhood
            # of currently chosen data point.
            label_counter.clear()
            for neighbor_index in edges[current_index]:
                neighbor_label = labels[neighbor_index]
                if neighbor_label in label_counter:
                    label_counter[neighbor_label] += 1
                else:
                    label_counter[neighbor_label] = 1
            # Get the most common label.
            best_label = -1
            best_count = -1
            for label, count in label_counter.items():
                if count > best_count:
                    best_label = label
                    best_count = count
            # Assign most common label to current data
            # point only if it exists and is different.
            if (best_label >= 0) and (best_label != labels[current_index]):
                labels[current_index] = best_label
                label_adjusted = True
        # Check if any label was adjusted. If not, the
        # process has converged and can be stopped.
        if not label_adjusted:
            break

    # Reconstruct corresponding clusters.
    clusters = dict()
    for i, label in enumerate(labels):
        if label in clusters:
            clusters[label].append(i)
        else:
            clusters[label] = [i]
    clusters = list(clusters.values())

    # Return data set containing
    # one representative description
    # vector with corresponding metadata
    # structures for each cluster.
    return _get_clustered_data_set(data_set, clusters)
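Reduced to the three steps from the docstring, the label propagation itself fits in a few lines. The following standalone sketch runs Chinese Whispers on a plain adjacency list and returns one label per node (illustrative only; ties are broken by dictionary order rather than at random):

import random


def chinese_whispers(edges, num_iterations=30):
    # edges[i] lists the neighbors of node i; each node starts in its own cluster.
    labels = list(range(len(edges)))
    order = list(range(len(edges)))
    for _ in range(num_iterations):
        random.shuffle(order)
        changed = False
        for node in order:
            counts = {}
            for neighbor in edges[node]:
                counts[labels[neighbor]] = counts.get(labels[neighbor], 0) + 1
            if counts:
                # Adopt the label used by the majority of the neighbors.
                best = max(counts, key=counts.get)
                if best != labels[node]:
                    labels[node] = best
                    changed = True
        if not changed:
            break
    return labels


# Two obvious groups: nodes 0-2 are fully connected, nodes 3-4 form a pair.
print(chinese_whispers([[1, 2], [0, 2], [0, 1], [4], [3]]))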
Example #19
    def _process_video(self, video_file_path: str,
                       num_skipped_frames: int) -> tuple:
        """
        Extract all faces in given video file and
        compute corresponding description vectors.
        :param video_file_path: str - Path to video file.
        :param num_skipped_frames: int - Number of skipped
            frames between two consecutive samples.
        :return: (list, list) - List of data
            points and another list of
            corresponding metadata structures.
            Return empty lists in case no
            description vectors could be retrieved.
        """
        # Open video file.
        cap = cv2.VideoCapture(video_file_path)
        # Adjust given number of frames between
        # two samples to indicate the number of
        # necessary forward steps in video file.
        num_skipped_frames += 1
        # Create empty list to store description vectors.
        vector_list = list()
        # Create empty list to store metadata structures.
        metadata_list = list()
        # Create list to store description vectors
        # of last sample. Should be used as cache
        # to avoid multiple insertions of description
        # vectors corresponding to the same person.
        vectors_in_last_sample = list()
        # Read frames with specified sample rate.
        next_frame_index = 0
        ret = True
        while ret:
            # Jump to next frame index.
            cap.set(cv2.CAP_PROP_POS_FRAMES, next_frame_index)
            # Read frame.
            ret, frame_bgr = cap.read()
            # Check if retrieved pixel data is valid.
            if frame_bgr is None:
                # Error: current frame is invalid,
                # therefore skip it and continue
                # with next sample (if possible).
                continue

            # Convert retrieved frame to RGB color encoding.
            frame_rgb = image.swap_color_encoding(frame_bgr)
            # Get current frame number and convert
            # it to corresponding frame index.
            frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
            frame_index = frame_number - 1
            # Get description vector of all
            # detected faces in current frame.
            face_descriptions = self._get_face_descriptions(frame_rgb)
            # Fill description vector and metadata lists
            # with corresponding data of detected faces.
            for face_tuple in face_descriptions:
                # Extract description vector of current face.
                vector = face_tuple[0]
                # Compare current vector to those detected in last
                # frame. When its distance exceeds a certain threshold,
                # assume a new face which needs to be stored.
                present_in_last_sample = False
                for last_vector in vectors_in_last_sample:
                    distance = calculation.get_distance(last_vector, vector)
                    if numpy.less_equal(
                            distance, settings.DISTANCE_THRESHOLD_RECOGNITION):
                        present_in_last_sample = True
                        break
                if not present_in_last_sample:
                    # Store description vector of current face.
                    vector_list.append(vector)
                    # Extract and store corresponding metadata.
                    top_left = face_tuple[1]
                    bottom_right = face_tuple[2]
                    # Create metadata structure for current face in
                    # specified video file frame. In this case, this
                    # is a tuple containing file name, frame index
                    # and top left / bottom right bounding box corners.
                    video_metadata = DataSet.create_metadata(
                        os.path.basename(video_file_path), frame_index,
                        top_left, bottom_right)
                    metadata_list.append(video_metadata)
            # Update description vectors of last
            # frame to those currently retrieved.
            vectors_in_last_sample.clear()
            for vector, _top_left, _bottom_right in face_descriptions:
                vectors_in_last_sample.append(vector)
            # Update frame index.
            next_frame_index += num_skipped_frames
        # Close video file.
        cap.release()
        return vector_list, metadata_list
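The cache of vectors from the previous sample keeps a face from being stored again for every consecutive frame it appears in: a new vector is only appended if it is farther than the recognition threshold from everything seen in the last sample. A minimal sketch of that check with Euclidean distance (the threshold value is illustrative, not the one from the settings module):

import numpy


def seen_in_last_sample(vector, last_vectors, threshold=0.6):
    # True if any vector from the previous sample lies within the threshold.
    return any(numpy.linalg.norm(vector - last) <= threshold
               for last in last_vectors)


last_sample = [numpy.array([0.0, 0.0]), numpy.array([5.0, 5.0])]
print(seen_in_last_sample(numpy.array([0.1, 0.1]), last_sample))  # True
print(seen_in_last_sample(numpy.array([2.5, 2.5]), last_sample))  # False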
Example #20
    def test_cluster(self):
        # Test valid matrices and metadata.
        self._test_cluster(source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_METADATA,
                           5,
                           source.MATRIX_ONE_CLUSTER_NUMPY,
                           source.MATRIX_ONE_CLUSTER_METADATA)
        self._test_cluster(source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_METADATA,
                           2,
                           source.MATRIX_TWO_CLUSTER_NUMPY,
                           source.MATRIX_TWO_CLUSTER_METADATA)
        self._test_cluster(source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_METADATA,
                           0.5,
                           source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_METADATA)

        # Test matrix with duplicates.
        self._test_cluster(source.MATRIX_WITH_DUPLICATES_LIST_OF_NUMPYS,
                           source.MATRIX_WITH_DUPLICATES_METADATA_EQUAL,
                           5,
                           source.MATRIX_WITH_DUPLICATES_ONE_CLUSTER_NUMPY,
                           source.MATRIX_ONE_CLUSTER_METADATA)
        self._test_cluster(source.MATRIX_WITH_DUPLICATES_LIST_OF_NUMPYS,
                           source.MATRIX_WITH_DUPLICATES_METADATA_EQUAL,
                           2,
                           source.MATRIX_WITH_DUPLICATES_TWO_CLUSTER_NUMPY,
                           source.MATRIX_TWO_CLUSTER_METADATA)
        self._test_cluster(source.MATRIX_WITH_DUPLICATES_LIST_OF_NUMPYS,
                           source.MATRIX_WITH_DUPLICATES_METADATA_EQUAL,
                           0.5,
                           source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_METADATA)

        self._test_cluster(source.MATRIX_WITH_DUPLICATES_LIST_OF_NUMPYS,
                           source.MATRIX_WITH_DUPLICATES_METADATA_UNEQUAL,
                           5,
                           source.MATRIX_WITH_DUPLICATES_ONE_CLUSTER_NUMPY,
                           source.MATRIX_WITH_DUPLICATES_ONE_CLUSTER_METADATA_UNEQUAL)
        self._test_cluster(source.MATRIX_WITH_DUPLICATES_LIST_OF_NUMPYS,
                           source.MATRIX_WITH_DUPLICATES_METADATA_UNEQUAL,
                           2,
                           source.MATRIX_WITH_DUPLICATES_TWO_CLUSTER_NUMPY,
                           source.MATRIX_WITH_DUPLICATES_TWO_CLUSTER_METADATA_UNEQUAL)
        self._test_cluster(source.MATRIX_WITH_DUPLICATES_LIST_OF_NUMPYS,
                           source.MATRIX_WITH_DUPLICATES_METADATA_UNEQUAL,
                           0.5,
                           source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_WITH_DUPLICATES_FOUR_CLUSTER_METADATA_UNEQUAL)

        # Test matrix containing only one vector.
        self._test_cluster(source.MATRIX_ONE_ENTRY_LIST_OF_NUMPYS,
                           source.MATRIX_ONE_ENTRY_METADATA,
                           0.5,
                           source.MATRIX_ONE_ENTRY_LIST_OF_NUMPYS,
                           source.MATRIX_ONE_ENTRY_METADATA)

        # Test empty matrix.
        self._test_cluster([], [], 0.5, [], [])

        # Test case of invalid distance threshold.
        self._test_cluster(source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_METADATA,
                           0,
                           source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_METADATA)
        self._test_cluster(source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_METADATA,
                           -1,
                           source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_METADATA)
        self._test_cluster(source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_METADATA,
                           numpy.nan,
                           source.MATRIX_LIST_OF_NUMPYS,
                           source.MATRIX_METADATA)

        # Test other data types and None values.
        self.assertRaises(TypeError, clustering.cluster,
                          source.CUSTOM_DICT,
                          0.5)
        self.assertRaises(TypeError, clustering.cluster,
                          source.CUSTOM_STR,
                          0.5)
        self.assertRaises(TypeError, clustering.cluster,
                          None,
                          0.5)
        self.assertRaises(TypeError, clustering.cluster,
                          DataSet(source.MATRIX_LIST_OF_NUMPYS, source.MATRIX_METADATA),
                          source.CUSTOM_DICT)
        self.assertRaises(TypeError, clustering.cluster,
                          DataSet(source.MATRIX_LIST_OF_NUMPYS, source.MATRIX_METADATA),
                          source.CUSTOM_STR)
        self.assertRaises(TypeError, clustering.cluster,
                          DataSet(source.MATRIX_LIST_OF_NUMPYS, source.MATRIX_METADATA),
                          None)
Example #21
    def test_read_and_write_data_set(self):
        # Define test file name.
        test_file_path = os.path.join(source.BASE_PATH, "data_set_test.txt")

        # Write DataSet instance and read it again.
        res = storage.write_data_set(
            test_file_path,
            DataSet(source.MATRIX_LIST_OF_NUMPYS, source.MATRIX_METADATA))
        self.assertTrue(res)
        data_set = storage.read_data_set(test_file_path)
        numpy.testing.assert_allclose(data_set.data_points,
                                      source.MATRIX_LIST_OF_NUMPYS)
        self.assertListEqual(data_set.metadata, source.MATRIX_METADATA)

        # Write DataSet instance with only one entry and read it again.
        res = storage.write_data_set(
            test_file_path,
            DataSet(source.MATRIX_ONE_ENTRY_LIST_OF_NUMPYS,
                    source.MATRIX_ONE_ENTRY_METADATA))
        self.assertTrue(res)
        data_set = storage.read_data_set(test_file_path)
        numpy.testing.assert_allclose(data_set.data_points,
                                      source.MATRIX_ONE_ENTRY_LIST_OF_NUMPYS)
        self.assertListEqual(data_set.metadata,
                             source.MATRIX_ONE_ENTRY_METADATA)

        # Write empty DataSet and read it again.
        res = storage.write_data_set(test_file_path, DataSet([], []))
        self.assertTrue(res)
        data_set = storage.read_data_set(test_file_path)
        numpy.testing.assert_allclose(data_set.data_points, numpy.empty(0))
        self.assertListEqual(data_set.metadata, [])

        # Read non-existing file.
        data_set = storage.read_data_set(
            os.path.join(source.BASE_PATH, source.NON_EXISTING_FILE))
        numpy.testing.assert_allclose(data_set.data_points, numpy.empty(0))
        self.assertListEqual(data_set.metadata, [])

        # Read non-readable file.
        data_set = storage.read_data_set(
            os.path.join(source.BASE_PATH, source.NON_READABLE_FILE))
        numpy.testing.assert_allclose(data_set.data_points, numpy.empty(0))
        self.assertListEqual(data_set.metadata, [])

        # Read directory.
        data_set = storage.read_data_set(source.BASE_PATH)
        numpy.testing.assert_allclose(data_set.data_points, numpy.empty(0))
        self.assertListEqual(data_set.metadata, [])

        # Write non-writable file.
        if os.geteuid() == 0:
            # Root is always able to read and write data sets.
            res = storage.write_data_set(
                os.path.join(source.BASE_PATH, source.NON_WRITABLE_FILE),
                DataSet(source.MATRIX_LIST_OF_NUMPYS, source.MATRIX_METADATA))
            self.assertTrue(res)
            data_set = storage.read_data_set(
                os.path.join(source.BASE_PATH, source.NON_WRITABLE_FILE))
            numpy.testing.assert_allclose(data_set.data_points,
                                          source.MATRIX_LIST_OF_NUMPYS)
            self.assertListEqual(data_set.metadata, source.MATRIX_METADATA)
        else:
            res = storage.write_data_set(
                os.path.join(source.BASE_PATH, source.NON_WRITABLE_FILE),
                DataSet(source.MATRIX_LIST_OF_NUMPYS, source.MATRIX_METADATA))
            self.assertFalse(res)

        # Write directory.
        res = storage.write_data_set(
            source.BASE_PATH,
            DataSet(source.MATRIX_LIST_OF_NUMPYS, source.MATRIX_METADATA))
        self.assertFalse(res)

        # Write None value.
        res = storage.write_data_set(test_file_path, None)
        self.assertFalse(res)

        # Read invalid file paths.
        data_set = storage.read_data_set(source.CUSTOM_DICT)
        numpy.testing.assert_allclose(data_set.data_points, numpy.empty(0))
        self.assertListEqual(data_set.metadata, [])
        data_set = storage.read_data_set(source.CUSTOM_STR)
        numpy.testing.assert_allclose(data_set.data_points, numpy.empty(0))
        self.assertListEqual(data_set.metadata, [])
        data_set = storage.read_data_set("")
        numpy.testing.assert_allclose(data_set.data_points, numpy.empty(0))
        self.assertListEqual(data_set.metadata, [])
        data_set = storage.read_data_set(None)
        numpy.testing.assert_allclose(data_set.data_points, numpy.empty(0))
        self.assertListEqual(data_set.metadata, [])

        # Write invalid file paths.
        res = storage.write_data_set(
            source.CUSTOM_DICT,
            DataSet(source.MATRIX_LIST_OF_NUMPYS, source.MATRIX_METADATA))
        self.assertFalse(res)
        res = storage.write_data_set(
            "", DataSet(source.MATRIX_LIST_OF_NUMPYS, source.MATRIX_METADATA))
        self.assertFalse(res)
        res = storage.write_data_set(
            None, DataSet(source.MATRIX_LIST_OF_NUMPYS,
                          source.MATRIX_METADATA))
        self.assertFalse(res)

        # Delete test file.
        os.remove(test_file_path + storage.DATA_POINT_FILE_NAME_EXTENSION)
        os.remove(test_file_path + storage.METADATA_FILE_NAME_EXTENSION)
Example #22
    def test_add_data_point(self):
        # Create empty data set.
        data_set_2d = DataSet([], [])

        # Test invalid data on empty data set.
        self._test_add_data_point_invalid(data_set_2d)

        # Add 2D data points to initially empty DataSet.
        data_set_2d.add_data_point(source.FIRST_VECTOR_2D_NUMPY,
                                   source.FIRST_VECTOR_2D_METADATA)
        data_set_2d.add_data_point(source.FIRST_VECTOR_2D_REVERSED_NUMPY,
                                   source.FIRST_VECTOR_2D_REVERSED_METADATA)
        data_set_2d.add_data_point(source.SECOND_VECTOR_2D_NUMPY,
                                   source.SECOND_VECTOR_2D_METADATA)
        data_set_2d.add_data_point(source.SECOND_VECTOR_2D_REVERSED_NUMPY,
                                   source.SECOND_VECTOR_2D_REVERSED_METADATA)
        self.assertIsInstance(data_set_2d.data_points, list)
        self.assertIsInstance(data_set_2d.metadata, list)
        numpy.testing.assert_allclose(data_set_2d.data_points,
                                      source.MATRIX_LIST_OF_NUMPYS)
        self.assertListEqual(data_set_2d.metadata, source.MATRIX_METADATA)

        # Test invalid data on non-empty data set.
        self._test_add_data_point_invalid(data_set_2d)

        # Add data point of wrong shape.
        self.assertRaises(ValueError, data_set_2d.add_data_point,
                          source.VECTOR_3D_NUMPY, source.VECTOR_3D_METADATA)

        # Add the same 3D data point multiple
        # times to initially empty data set.
        data_set_3d = DataSet([], [])
        data_set_3d.add_data_point(source.VECTOR_3D_NUMPY,
                                   source.VECTOR_3D_METADATA)
        data_set_3d.add_data_point(source.VECTOR_3D_NUMPY,
                                   source.VECTOR_3D_METADATA)
        data_set_3d.add_data_point(source.VECTOR_3D_NUMPY,
                                   source.VECTOR_3D_METADATA)
        self.assertIsInstance(data_set_3d.data_points, list)
        self.assertIsInstance(data_set_3d.metadata, list)
        result_matrix = [source.VECTOR_3D_NUMPY, source.VECTOR_3D_NUMPY,
                         source.VECTOR_3D_NUMPY]
        result_metadata = [source.VECTOR_3D_METADATA,
                           source.VECTOR_3D_METADATA,
                           source.VECTOR_3D_METADATA]
        numpy.testing.assert_allclose(data_set_3d.data_points, result_matrix)
        self.assertListEqual(data_set_3d.metadata, result_metadata)