Example #1
import numpy as np
from skmultiflow.data import RandomTreeGenerator
from skmultiflow.lazy import KDTree  # import paths may differ across scikit-multiflow releases


def test_kdd_tree_mixed():
    stream = RandomTreeGenerator(tree_random_state=1,
                                 sample_random_state=1,
                                 n_num_features=0)
    stream.prepare_for_use()  # needed by this (older) scikit-multiflow API

    X, _ = stream.next_sample(1000)
    X_test, _ = stream.next_sample(10)

    # Build tree, declaring every one of the 25 columns categorical
    cat_features = list(range(25))
    kdtree = KDTree(X,
                    metric='mixed',
                    return_distance=True,
                    categorical_list=cat_features)

    # Query tree
    dist, idx = kdtree.query(X_test, 4)

    expected_idx = [[123, 234, 707, 654], [688, 429, 216, 627],
                    [463, 970, 566, 399], [18, 895, 640, 996],
                    [396, 612, 897, 232], [328, 54, 138, 569],
                    [253, 501, 82, 273], [38, 146, 752, 923],
                    [946, 808, 271, 363], [951, 111, 708, 5]]
    expected_dist = [[2, 2, 2, 2], [2, 2, 2, 2], [2, 2, 2, 2], [2, 2, 2, 0],
                     [2, 2, 2, 0], [2, 2, 2, 0], [2, 2, 2, 2], [2, 2, 0, 0],
                     [2, 2, 2, 0], [2, 2, 2, 2]]
    assert np.all(idx == expected_idx)  # np.alltrue was removed in NumPy 2.0

    assert np.allclose(dist, expected_dist)

    expected_info = 'KDTree: - leaf_size: 40 - metric: mixed - return_distance: True'
    assert kdtree.get_info() == expected_info

    assert kdtree.get_class_type() == 'data_structure'
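Example #1 targets an older scikit-multiflow release, in which streams had to be primed with prepare_for_use() before sampling; the next example is the same test against a newer release that drops that step. A version-tolerant setup could look like the sketch below (the hasattr guard is illustrative, not part of the original tests):

stream = RandomTreeGenerator(tree_random_state=1, sample_random_state=1, n_num_features=0)
if hasattr(stream, 'prepare_for_use'):
    # only older scikit-multiflow releases define (and require) this
    stream.prepare_for_use()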
Example #2
import numpy as np
from skmultiflow.data import RandomTreeGenerator
from skmultiflow.lazy import KDTree


def test_kdd_tree_mixed():
    stream = RandomTreeGenerator(tree_random_state=1, sample_random_state=1, n_num_features=0)

    X, _ = stream.next_sample(1000)
    X_test, _ = stream.next_sample(10)

    # Build tree
    cat_features = list(range(25))
    kdtree = KDTree(X, metric='mixed', return_distance=True, categorical_list=cat_features)

    # Query tree
    dist, idx = kdtree.query(X_test, 4)

    expected_idx = [[123, 234, 707, 654],
                    [688, 429, 216, 627],
                    [463, 970, 566, 399],
                    [18, 895, 640, 996],
                    [396, 612, 897, 232],
                    [328, 54, 138, 569],
                    [253, 501, 82, 273],
                    [38, 146, 752, 923],
                    [946, 808, 271, 363],
                    [951, 111, 708, 5]]
    expected_dist = [[2, 2, 2, 2],
                     [2, 2, 2, 2],
                     [2, 2, 2, 2],
                     [2, 2, 2, 0],
                     [2, 2, 2, 0],
                     [2, 2, 2, 0],
                     [2, 2, 2, 2],
                     [2, 2, 0, 0],
                     [2, 2, 2, 0],
                     [2, 2, 2, 2]]
    assert np.all(idx == expected_idx)

    assert np.allclose(dist, expected_dist)

    expected_info = 'KDTree(categorical_list=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ' \
                    '11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], ' \
                    'leaf_size=40, metric=mixed, return_distance=True)'
    assert kdtree.get_info() == expected_info

    assert kdtree._estimator_type == 'data_structure'
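None of the tests show how the 'mixed' metric is actually computed. As intuition only, not scikit-multiflow's implementation, a mixed distance of this kind typically charges a 0/1 mismatch per categorical column plus a numeric term over the remaining columns; mixed_distance and its arguments below are hypothetical names:

import numpy as np

def mixed_distance(a, b, cat_idx):
    # Toy mixed distance: 0/1 mismatch per categorical column plus
    # squared Euclidean over the remaining numeric columns.
    # Illustration only; not scikit-multiflow's actual definition.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    cat = np.zeros(a.shape[0], dtype=bool)
    cat[cat_idx] = True
    mismatches = np.count_nonzero(a[cat] != b[cat])
    numeric = np.sum((a[~cat] - b[~cat]) ** 2)
    return mismatches + numeric

With n_num_features=0 the generator emits only one-hot encoded categorical columns, so a single differing category flips two columns at once, which would explain why every entry in expected_dist above is 0 or 2.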
Example #3
import numpy as np
from skmultiflow.data import RandomTreeGenerator
from skmultiflow.lazy import KDTree


def test_kdd_tree_euclidean():
    stream = RandomTreeGenerator(tree_random_state=1, sample_random_state=1)

    X, _ = stream.next_sample(1000)
    X_test, _ = stream.next_sample(10)

    # Build tree
    kdtree = KDTree(X, metric='euclidean', return_distance=True)

    # Query tree
    dist, idx = kdtree.query(X_test, 4)

    expected_idx = [[855, 466, 348, 996],
                    [829, 654, 92, 333],
                    [227, 364, 183, 325],
                    [439, 482, 817, 501],
                    [886, 173, 279, 470],
                    [98, 30, 34, 580],
                    [959, 773, 374, 819],
                    [819, 685, 59, 992],
                    [624, 665, 209, 239],
                    [524, 807, 506, 191]]
    expected_dist = [[1.6366216258724973, 1.631437068636607, 1.5408182139320563, 1.4836054196064452],
                     [1.7839579422032452, 1.7694587302438618, 1.5339920309706585, 1.5228981881653287],
                     [1.6512443805072872, 1.637456923425164, 1.61736766513639, 1.5776532815820448],
                     [1.5843121606184263, 1.571918014408251, 1.5038147281265382, 0.7058569455034059],
                     [2.052148026638031, 2.0157953468214007, 1.8012794130725434, 1.6572756455115591],
                     [1.5844032729792423, 1.5688736638121885, 1.55893121879858, 1.4609657517960262],
                     [1.6819916227667229, 1.6186557774269037, 1.5815309744477162, 1.5720184136312232],
                     [1.7302164693989817, 1.5964713159009083, 1.4897849225874815, 1.1629448414734906],
                     [1.6511813695220574, 1.6454651930288255, 1.5926685577827064, 1.4973008307362947],
                     [1.5982346741983797, 1.5875900895982191, 1.4702209684850878, 1.4676217546305874]]

    assert np.all(idx == expected_idx)

    assert np.allclose(dist, expected_dist)

    expected_info = 'KDTree(categorical_list=None, leaf_size=40, metric=euclidean, return_distance=True)'
    assert kdtree.get_info() == expected_info

    assert kdtree._estimator_type == 'data_structure'
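For the Euclidean case the result can be cross-checked against scikit-learn's KDTree. A sketch, reusing X, X_test and expected_dist from the test above; note that scikit-learn returns neighbors nearest-first, while the expected_dist rows above run farthest-first:

import numpy as np
from sklearn.neighbors import KDTree as SKLearnKDTree

sk_tree = SKLearnKDTree(X, leaf_size=40, metric='euclidean')
sk_dist, sk_idx = sk_tree.query(X_test, k=4)

# Same distances, opposite ordering convention
assert np.allclose(sk_dist[:, ::-1], expected_dist)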
Example #4
import warnings
from timeit import default_timer as timer

import numpy as np
from scipy import spatial
from sklearn import neighbors as ng

from skmultiflow.data import FileStream
from skmultiflow.lazy import KDTree
from skmultiflow.transform import OneHotToCategorical


def demo():
    """ _test_kdtree_compare
    
    This demo compares creation and query speed for different kd tree 
    implementations. They are fed with instances from the covtype dataset. 
    
    Three kd tree implementations are compared: SciPy's KDTree, NumPy's 
    KDTree and scikit-multiflow's KDTree. For each of them the demo will 
    time the construction of the tree on 1000 instances, and then measure 
    the time to query 100 instances. The results are displayed in the 
    terminal.
    
    """
    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    stream = FileStream('../data/datasets/covtype.csv', -1, 1)
    if hasattr(stream, 'prepare_for_use'):
        # only older scikit-multiflow releases define (and require) this
        stream.prepare_for_use()

    # Collapse the one-hot encoded column groups back into single categorical columns
    one_hot_to_cat = OneHotToCategorical([[10, 11, 12, 13],
                                          [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                                           28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
                                           42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])

    X, _ = stream.next_sample(1000)
    X = one_hot_to_cat.transform(X)

    X_find, _ = stream.next_sample(100)
    X_find = one_hot_to_cat.transform(X_find)
    # SciPy kd-tree
    start = timer()
    scipy_tree = spatial.KDTree(X, leafsize=40)
    end = timer()
    print("\nSciPy KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(100):
        dist, ind = scipy_tree.query(X_find[i], 8)  # SciPy returns (distances, indices)
    end = timer()
    print("SciPy KDTree query time: " + str(end - start))

    del scipy_tree

    # scikit-multiflow kd-tree
    start = timer()
    opt = KDTree(X, metric='euclidean', return_distance=True)
    end = timer()
    print("\nOptimal KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(100):
        dist, ind = opt.query(X_find[i], 8)  # (distances, indices), as in the tests above
    end = timer()
    print("Optimal KDTree query time: " + str(end - start))

    del opt

    # scikit-learn kd-tree
    start = timer()
    sk = ng.KDTree(X, metric='euclidean')
    end = timer()
    print("\nSklearn KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(100):
        # scikit-learn returns (distances, indices) when return_distance=True
        dist, ind = sk.query(np.asarray(X_find[i]).reshape(1, -1), 8, return_distance=True)
    end = timer()
    print("Sklearn KDTree query time: " + str(end - start) + "\n")

    del sk
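To run the comparison as a script (the covtype.csv path above is relative to the original repository layout, so adjust it for your checkout):

if __name__ == '__main__':
    demo()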