Example #1
0
def test_select_k_nearest_neighbors_numpy_matrix():
    """Check select_k_nearest_neighbors on numpy array inputs.

    Results are compared as sets, so the order in which the neighbours
    are returned is not checked.
    """
    # One-dimensional points, query located on the first point.
    points = numpy.array([(0,), (1,), (2,)])
    query = numpy.array([0])
    for k, expected in ((2, {0, 1}), (1, {0})):
        assert set(knn.select_k_nearest_neighbors(points, query, k)) == expected

    # Two-dimensional points along the diagonal, query on the first point.
    points = numpy.array([(0, 0), (1, 1), (2, 2)])
    query = numpy.array([0, 0])
    for k, expected in ((2, {0, 1}), (3, {0, 1, 2})):
        assert set(knn.select_k_nearest_neighbors(points, query, k)) == expected
Example #2
0
def test_select_k_nearest_neighbors():
    """Check select_k_nearest_neighbors on plain list/tuple inputs.

    Results are compared as sets, so the order in which the neighbours
    are returned is not checked.
    """
    # One-dimensional points, query located on the first point.
    points = [(0,), (1,), (2,)]
    query = [0]
    for k, expected in ((2, {0, 1}), (1, {0})):
        assert set(knn.select_k_nearest_neighbors(points, query, k)) == expected

    # Two-dimensional points along the diagonal, query on the first point.
    points = [(0, 0), (1, 1), (2, 2)]
    query = [0, 0]
    for k, expected in ((2, {0, 1}), (3, {0, 1, 2})):
        assert set(knn.select_k_nearest_neighbors(points, query, k)) == expected
Example #3
0
def clean_dataset_depuration(input_matrix, target_matrix, k=3, k_prime=2):
    """Clean a dataset with the Depuration procedure.

    See section 3.1 of "Analysis of new techniques to obtain quality training sets".

    For every pattern, its k nearest other patterns are looked up. If some
    class has at least ``k_prime`` representatives among them, the pattern is
    kept and labelled with that class; otherwise the pattern is discarded.

    Args:
        input_matrix: Sequence of input vectors; converted to a numpy array
            if it is not one already.
        target_matrix: Sequence of targets, one per input; converted to a
            numpy array if it is not one already.
        k: Number of neighbours inspected for each pattern.
        k_prime: Minimum number of neighbours that must share a class for
            the pattern to be kept. Must satisfy (k + 1) / 2 <= k_prime <= k.

    Returns:
        A tuple ``((kept_inputs, kept_targets), changed_patterns,
        removed_patterns)``. The first element holds numpy arrays of the
        kept inputs and their (possibly relabelled) targets. The other two
        are lists of indices into the original dataset: patterns whose
        target was changed, and patterns that were removed.

    Raises:
        ValueError: If k_prime is not between (k + 1) / 2 and k.
    """
    if not isinstance(input_matrix, numpy.ndarray):
        input_matrix = numpy.array(input_matrix)
    if not isinstance(target_matrix, numpy.ndarray):
        target_matrix = numpy.array(target_matrix)

    # Compare 2 * k_prime against k + 1 rather than dividing, so the check
    # gives the same answer under Python 2 (floor division) and Python 3
    # (true division). With k_prime a strict majority of k neighbours, at
    # most one class can reach the threshold, so the outcome does not
    # depend on the iteration order of the class counts.
    if not (k + 1 <= 2 * k_prime and k_prime <= k):
        raise ValueError('k_prime must be between (k + 1) / 2 and k')

    kept_inputs = []
    kept_targets = []
    changed_patterns = []
    removed_patterns = []
    for i in range(len(input_matrix)):
        # Find k-NN of pattern_i in patterns - {pattern_i}.
        # We do this by finding k+1 nearest indices, and ignoring index i.
        nearest = knn.select_k_nearest_neighbors(input_matrix,
                                                 input_matrix[i], k + 1)
        # Filter instead of list.remove(i): when many patterns coincide with
        # pattern_i, the search may legitimately not return i itself, and
        # remove() would raise ValueError.
        neighbors = [j for j in nearest if j != i]
        # If i was not among the results we still hold k + 1 candidates;
        # keep k of them. NOTE(review): this drops the farthest one only if
        # select_k_nearest_neighbors returns nearest-first -- confirm.
        del neighbors[k:]

        # If a class has at least k_prime representatives among the
        # k neighbours, keep the input and label it with that class.
        class_counts = _count_classes(target_matrix[neighbors])
        for class_, count in class_counts.items():
            if count >= k_prime:
                kept_inputs.append(input_matrix[i])
                kept_targets.append(class_)

                # Track patterns whose label differs from the original one.
                # NOTE(review): assumes _count_classes keys are tuples of
                # the target rows -- confirm.
                if class_ != tuple(target_matrix[i]):
                    changed_patterns.append(i)
                break
        else:
            # No class reached k_prime: discard pattern
            # (do not add to kept lists).
            removed_patterns.append(i)

    return ((numpy.array(kept_inputs), numpy.array(kept_targets)),
            changed_patterns, removed_patterns)