Esempio n. 1
0
def test_euclidean_distances():
    """Compare ``dm.euclidean_distances`` against ``sm.euclidean_distances``.

    Three variants are checked on the same pair of random dask arrays:
    no precomputed norms, a precomputed ``X_norm_squared`` column vector,
    and a precomputed ``Y_norm_squared`` row vector.
    """
    X = da.random.uniform(size=(100, 4), chunks=50)
    Y = da.random.uniform(size=(100, 4), chunks=50)

    def check(**norm_kwargs):
        # Run both implementations with identical arguments and compare.
        got = dm.euclidean_distances(X, Y, **norm_kwargs)
        want = sm.euclidean_distances(X, Y, **norm_kwargs)
        assert_eq(got, want)

    check()

    # Squared row norms of X as a column: shape (100, 1).
    check(X_norm_squared=(X**2).sum(axis=1).compute()[:, np.newaxis])

    # Squared row norms of Y as a row: shape (1, 100).
    check(Y_norm_squared=(Y**2).sum(axis=1).compute()[np.newaxis, :])
Esempio n. 2
0
def test_euclidean_distances_same():
    """Compare ``dm`` and ``sm`` euclidean distances when X and Y are the same.

    Checks the plain ``euclidean_distances(X, X)`` call and the call with
    precomputed squared row norms passed as ``Y_norm_squared``. Results are
    compared with ``atol=1e-4``.
    """
    X = da.random.uniform(size=(100, 4), chunks=50)
    a = dm.euclidean_distances(X, X)
    b = sm.euclidean_distances(X, X)
    assert_eq(a, b, atol=1e-4)

    # Squared row norms as a row vector, shape (1, 100), the same layout the
    # sibling test uses for ``Y_norm_squared``.
    x_norm_squared = (X**2).sum(axis=1).compute()[np.newaxis, :]
    # The norms must be passed to euclidean_distances itself; previously they
    # were handed to assert_eq(X, X, ...), which only compared X with itself.
    a = dm.euclidean_distances(X, X, Y_norm_squared=x_norm_squared)
    b = sm.euclidean_distances(X, X, Y_norm_squared=x_norm_squared)
    assert_eq(a, b, atol=1e-4)
Esempio n. 3
0
from dask.distributed import Client

if __name__ == "__main__":
    # Benchmark driver: connects to a running Dask scheduler, loads one HDF5
    # dataset as a row-chunked dask array, and times repeated pairwise
    # euclidean-distance computations.
    parser = argparse.ArgumentParser(
        description="Dask chunks distance matrix cpu benchmark")
    # --file and --dataset are mandatory: without them the script previously
    # continued with None and failed later inside h5py.
    parser.add_argument("--file",
                        type=str,
                        required=True,
                        help="file to benchmark")
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="dataset within file to benchmark")
    # A default avoids range(None) raising TypeError when --trials is omitted.
    parser.add_argument("--trials",
                        type=int,
                        default=1,
                        help="number of benchmark trials (default: 1)")
    args = parser.parse_args()

    # The scheduler file is expected in the current working directory.
    client = Client(scheduler_file=os.path.join(os.getcwd(), "scheduler.json"))

    workers = len(client.scheduler_info()["workers"])
    if workers == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise SystemExit("No workers are connected to the scheduler")

    # flush=True so the progress text is visible while loading is in progress
    # (the line has no trailing newline, so it would otherwise sit in the
    # stdout buffer).
    print("Loading data... {}[{}]".format(args.file, args.dataset),
          end="",
          flush=True)
    with h5py.File(args.file, "r") as handle:
        # One block of rows per worker; never let the chunk size drop to 0
        # when the dataset has fewer rows than there are workers.
        ch = max(1, handle[args.dataset].shape[0] // workers)
        # NOTE(review): persist() on a distributed client presumably returns
        # before loading has finished, so the first trial may include load
        # time -- confirm, and consider waiting on `data` here.
        data = da.from_array(handle[args.dataset], chunks=(ch, -1)).persist()
    print("\t[OK]")

    for trial in range(args.trials):
        print("Trial {}...".format(trial), end="", flush=True)
        start = time.perf_counter()
        # The result is computed only for timing; its value is not used.
        dist = dmm.euclidean_distances(data, data).compute()
        end = time.perf_counter()
        print("\t{}s".format(end - start))