Example #1
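# Shards a 28x28 matrix into 7x7 blocks, runs the lambdapack QR program with
# three local workers, and compares the trailing R block against numpy's QR
# factor (up to the sign of each row).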
def test_qr():
    N = 28
    shard_size = 7
    shard_sizes = (shard_size, shard_size)
    X = np.random.randn(N, N)
    X_sharded = BigMatrix("QR_input_X",
                          shape=X.shape,
                          shard_sizes=shard_sizes,
                          write_header=True)
    N_blocks = X_sharded.num_blocks(0)
    shard_matrix(X_sharded, X)
    program, meta = qr(X_sharded)
    executor = fs.ProcessPoolExecutor(2)
    program.start()
    print("starting program...")
    futures = [executor.submit(job_runner.lambdapack_run,
                               program,
                               timeout=60,
                               idle_timeout=6,
                               pipeline_width=1)
               for _ in range(3)]
    program.wait()
    program.free()
    Rs = meta["outputs"][0]
    R_remote = Rs.get_block(N_blocks - 1, N_blocks - 1, 0)
    R_local = np.linalg.qr(X)[1][-shard_size:, -shard_size:]
    sign_matrix_local = np.eye(R_local.shape[0])
    sign_matrix_remote = np.eye(R_local.shape[0])
    sign_matrix_local[np.where(np.diag(R_local) <= 0)] *= -1
    sign_matrix_remote[np.where(np.diag(R_remote) <= 0)] *= -1
    # make the signs match
    R_remote *= np.diag(sign_matrix_remote)[:, np.newaxis]
    R_local *= np.diag(sign_matrix_local)[:, np.newaxis]
    assert (np.allclose(R_local, R_remote))
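# A minimal standalone sketch (not part of the original test): the R factor of a
# QR decomposition is unique only up to the sign of each diagonal entry, so the
# comparison above flips rows until both diagonals are nonnegative. The helper
# below performs the same normalization in one place; it assumes numpy is
# imported as np, as in the surrounding code.
def normalize_r_signs(R):
    # Flip every row of R whose diagonal entry is negative.
    signs = np.sign(np.diag(R))
    signs[signs == 0] = 1
    return R * signs[:, np.newaxis]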
def test_cholesky_lambda():
    X = np.random.randn(64, 64)
    A = X.dot(X.T) + np.eye(X.shape[0])
    shard_size = 16
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("cholesky_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    futures = run_program_in_pywren(program)
    program.start()
    program.wait()
    program.free()
    L_sharded = meta["outputs"][0]
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    assert(np.allclose(L_npw, L))
    print("great success!")
Example #3
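 # Applies an elementwise unary op f to a single-shard BigMatrix through pywren
 # and checks the result against the corresponding numpy function f_numpy.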
 def test_elemwise_uop(self, f, f_numpy):
     X = np.random.randn(16, 16)
     pwex = pywren.default_executor()
     X_sharded = BigMatrix("{0}_uop_test".format(f),
                           shape=X.shape,
                           shard_sizes=X.shape)
     shard_matrix(X_sharded, X)
     res_sharded = f(pwex, X_sharded)
     res = res_sharded.numpy()
     res_sharded.free()
     assert (np.all(np.isclose(f_numpy(X), res)))
Example #4
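 # Computes X^T X with binops.gemm (local mode) on an (8, 16)-sharded matrix and
 # checks the result against X.T.dot(X).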
 def test_multiple_shard_matrix_multiply_symmetric_2(self):
     X = np.random.randn(16, 16)
     shard_sizes = [8, 16]
     X_sharded = BigMatrix("gemm_test_1",
                           shape=X.shape,
                           shard_sizes=shard_sizes)
     shard_matrix(X_sharded, X)
     pwex = pywren.lambda_executor()
     XTX_sharded = binops.gemm(pwex,
                               X_sharded.T,
                               X_sharded,
                               X_sharded.bucket,
                               1,
                               local=True)
     XTX_sharded_local = XTX_sharded.numpy()
     XTX = X.T.dot(X)
     X_sharded.free()
     XTX_sharded.free()
     assert (np.all(np.isclose(XTX, XTX_sharded_local)))
     os.system("rm -rf /dev/shm/*")
Example #5
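 # Multiplies a single-shard matrix by a single-shard vector with binops.gemv
 # and compares the result against X.dot(Y).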
 def test_single_shard_gemv(self):
     X = np.random.randn(16, 16)
     Y = np.random.randn(16)
     X_sharded = BigMatrix("gemv_test_0",
                           shape=X.shape,
                           shard_sizes=X.shape)
     Y_sharded = BigMatrix("gemv_test_2",
                           shape=Y.shape,
                           shard_sizes=Y.shape)
     shard_matrix(X_sharded, X)
     shard_matrix(Y_sharded, Y)
     pwex = pywren.default_executor()
     XY_sharded = binops.gemv(pwex, X_sharded, Y_sharded, X_sharded.bucket,
                              1)
     XY_sharded_local = XY_sharded.numpy()
     XY = X.dot(Y)
     print(XY)
     print(XY_sharded_local)
     X_sharded.free()
     XY_sharded.free()
     assert (np.all(np.isclose(XY, XY_sharded_local)))
Example #6
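    # Runs binops.gemm on a single-shard matrix through a lithops FunctionExecutor
    # and checks X X^T against the local numpy product.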
    def test_single_shard_matrix_multiply(self):
        fexec = lithops.FunctionExecutor(runtime='jsampe/numpy-lithops:04',
                                         log_level='DEBUG')

        X = np.random.randn(16, 16)
        X_sharded = BigMatrix("gemm_test_0",
                              shape=X.shape,
                              shard_sizes=X.shape,
                              storage=fexec.storage)
        shard_matrix(X_sharded, X)

        XX_sharded = binops.gemm(fexec, X_sharded, X_sharded.T,
                                 X_sharded.bucket, 1)

        XX_sharded_local = XX_sharded.numpy()
        XX = X.dot(X.T)
        X_sharded.free()
        XX_sharded.free()

        assert (np.all(np.isclose(XX, XX_sharded_local)))
        os.system("rm -rf /dev/shm/*")
Example #7
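 # Compiles a lambdapack kernel with nested if-branches, runs it with one local
 # worker, and checks each output block against the branch its block index takes.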
 def test_nested_if_run(self):
     X = np.random.randn(64)
     shard_sizes = (int(X.shape[0] / 8), )
     X_sharded = BigMatrix("if_test",
                           shape=X.shape,
                           shard_sizes=shard_sizes,
                           write_header=True)
     O_sharded = BigMatrix("if_test_output",
                           shape=X.shape,
                           shard_sizes=shard_sizes,
                           write_header=True)
     X_sharded.free()
     shard_matrix(X_sharded, X)
     f = frontend.lpcompile(f1_if_nested)
     p = f(X_sharded, O_sharded, X_sharded.num_blocks(0))
     num_cores = 1
     executor = fs.ProcessPoolExecutor(num_cores)
     config = npw.config.default()
     p_ex = lp.LambdaPackProgram(p, config=config)
     p_ex.start()
     all_futures = []
     for i in range(num_cores):
         all_futures.append(
             executor.submit(job_runner.lambdapack_run,
                             p_ex,
                             pipeline_width=1,
                             idle_timeout=5,
                             timeout=60))
     p_ex.wait()
     time.sleep(5)
     p_ex.free()
     for i in range(X_sharded.num_blocks(0)):
         Ob = O_sharded.get_block(i)
         Xb = X_sharded.get_block(i)
         if ((i % 2) == 0 and ((i % 3) == 0)):
             assert (np.allclose(Ob, 3 * Xb))
         elif ((i % 2) == 0):
             assert (np.allclose(Ob, Xb))
         else:
             assert (np.allclose(Ob, 2 * Xb))
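# Runs the lambdapack Cholesky program on a 64x64 SPD matrix with a single local
# worker and compares the result to np.linalg.cholesky.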
def test_cholesky():
    X = np.random.randn(64, 64)
    A = X.dot(X.T) + np.eye(X.shape[0])
    shard_size = 8
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("cholesky_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    executor = fs.ProcessPoolExecutor(1)
    print("starting program")
    program.start()
    future = executor.submit(job_runner.lambdapack_run,
                             program, timeout=30, idle_timeout=6)
    program.wait()
    program.free()
    L_sharded = meta["outputs"][0]
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    assert(np.allclose(L_npw, L))
    print("great success!")
def test_gemm_lambda():
    size = 32
    A = np.random.randn(size, size)
    B = np.random.randn(size, size)
    C = np.dot(A, B)
    shard_sizes = (8, 8)
    A_sharded = BigMatrix("Gemm_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    B_sharded = BigMatrix("Gemm_test_B", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    shard_matrix(A_sharded, A)
    shard_matrix(B_sharded, B)
    program, meta = gemm(A_sharded, B_sharded)
    executor = fs.ProcessPoolExecutor(1)
    program.start()
    run_program_in_pywren(program)
    program.wait()
    program.free()
    C_sharded = meta["outputs"][0]
    C_npw = C_sharded.numpy()
    assert(np.allclose(C_npw, C))
    return
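# Checks the block bidiagonal factorization: first against a local bdfac_python
# reference via singular values, then by running the truncated lambdapack program.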
def test_bdfac_truncated():
    N = 16
    shard_size = 4
    shard_sizes = (shard_size, shard_size)
    np.random.seed(0)
    X = np.random.randn(N, N)
    U, S, V = bdfac_python(X, block_size=shard_size)
    svd_bdfac = np.linalg.svd(S, compute_uv=False)
    svd_local = np.linalg.svd(X, compute_uv=False)
    print(svd_bdfac)
    print(svd_local)
    assert(np.allclose(svd_bdfac, svd_local))
    X_sharded = BigMatrix("BDFAC_input_X", shape=X.shape,
                          shard_sizes=shard_sizes, write_header=True)
    N_blocks = X_sharded.num_blocks(0)
    shard_matrix(X_sharded, X)
    program, meta = bdfac(X_sharded, truncate=2)
    executor = fs.ProcessPoolExecutor(1)
    program.start()
    executor.submit(job_runner.lambdapack_run, program,
                    timeout=200, idle_timeout=200, pipeline_width=1)
    program.wait()
    print("returned..")
def test_gemm():
    size = 64
    # np.random.seed(0)
    A = np.random.randn(size, size)
    B = np.random.randn(size, size)
    C = np.dot(A, B)
    shard_sizes = (16, 16)
    A_sharded = BigMatrix("Gemm_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    B_sharded = BigMatrix("Gemm_test_B", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    shard_matrix(A_sharded, A)
    shard_matrix(B_sharded, B)
    program, meta = gemm(A_sharded, B_sharded)
    program.start()
    job_runner.lambdapack_run(
        program, timeout=60, idle_timeout=6, pipeline_width=3)
    program.wait()
    program.free()
    C_sharded = meta["outputs"][0]
    C_npw = C_sharded.numpy()
    assert(np.allclose(C_npw, C))
    return
Example #12
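# Builds the lambdapack Cholesky program for a sharded input: allocates the
# intermediate and output BigMatrix objects, compiles the CHOLESKY kernel, and
# returns the program together with its outputs and compile time.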
def cholesky(X, truncate=0):
    S = BigMatrix("Cholesky.Intermediate({0})".format(X.key),
                  shape=(X.num_blocks(1) + 1, X.shape[0], X.shape[0]),
                  shard_sizes=(1, X.shard_sizes[0], X.shard_sizes[0]),
                  bucket=X.bucket,
                  write_header=True)
    O = BigMatrix("Cholesky({0})".format(X.key),
                  shape=(X.shape[0], X.shape[0]),
                  shard_sizes=(X.shard_sizes[0], X.shard_sizes[0]),
                  write_header=True,
                  parent_fn=constant_zeros)
    t = time.time()
    p0 = lpcompile_for_execution(CHOLESKY, inputs=["I"], outputs=["O"])
    p1 = p0(O, X, S, int(np.ceil(X.shape[0] / X.shard_sizes[0])), truncate)
    e = time.time()
    c_time = e - t
    config = npw.config.default()
    program = lp.LambdaPackProgram(p1, config=config)
    return program, {
        "outputs": [O],
        "intermediates": [S],
        "compile_time": c_time
    }
Example #13
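# Builds the lambdapack TSQR program: allocates the R, T, and V tree matrices
# sized by the number of reduction levels, compiles the TSQR kernel, and returns
# the program plus metadata.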
def tsqr(X, truncate=0):
    b_fac = 2
    assert (X.shard_sizes[1] == X.shape[1])
    shard_size = X.shard_sizes[0]
    shard_sizes = X.shard_sizes
    num_tree_levels = max(
        int(np.ceil(np.log2(X.num_blocks(0)) / np.log2(b_fac))), 1)
    R_sharded = BigMatrix("tsqr_R({0})".format(X.key),
                          shape=(num_tree_levels * shard_size, X.shape[0]),
                          shard_sizes=shard_sizes,
                          write_header=True,
                          safe=False)
    T_sharded = BigMatrix("tsqr_T({0})".format(X.key),
                          shape=(num_tree_levels * shard_size * b_fac,
                                 X.shape[0]),
                          shard_sizes=(shard_size * b_fac, shard_size),
                          write_header=True,
                          safe=False)
    V_sharded = BigMatrix("tsqr_V({0})".format(X.key),
                          shape=(num_tree_levels * shard_size * b_fac,
                                 X.shape[0]),
                          shard_sizes=(shard_size * b_fac, shard_size),
                          write_header=True,
                          safe=False)
    t = time.time()
    p0 = lpcompile_for_execution(TSQR, inputs=["A"], outputs=["Rs"])
    config = npw.config.default()
    N_blocks = X.num_blocks(0)
    p1 = p0(X, V_sharded, T_sharded, R_sharded, N_blocks)
    e = time.time()
    c_time = e - t
    program = lp.LambdaPackProgram(p1, config=config)
    return program, {
        "outputs": [R_sharded, V_sharded, T_sharded],
        "intermediates": [],
        "compile_time": c_time
    }
Example #14
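# Starts a Cholesky program with a worker whose timeout is shorter than the job
# and checks that no workers are reported as up once the timeout has passed.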
def test_cholesky_timeouts():
    X = np.random.randn(64, 64)
    A = X.dot(X.T) + np.eye(X.shape[0])
    shard_size = 8
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("job_runner_test",
                          shape=A.shape,
                          shard_sizes=shard_sizes,
                          write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    executor = fs.ProcessPoolExecutor(1)
    print("starting program")
    program.start()
    future = executor.submit(job_runner.lambdapack_run,
                             program,
                             timeout=10,
                             idle_timeout=6)
    time.sleep(15)
    print("poop")
    assert (int(program.get_up()) == 0)
    program.free()
    print("great success!")
 def test_simple_slices(self):
     X = np.random.randn(128, 128)
     shard_sizes = [32, 32]
     X_sharded = BigMatrix("test_3", shape=X.shape, shard_sizes=shard_sizes)
     shard_matrix(X_sharded, X)
     assert(np.all(X[0:64] == X_sharded.submatrix([2]).numpy()))
     assert(np.all(X[64:128] == X_sharded.submatrix([2, None]).numpy()))
     assert(np.all(X[:, 0:96] == X_sharded.submatrix(None, [0, 3]).numpy()))
     assert(np.all(X[:, 96:128] == X_sharded.submatrix(
         None, [3, None]).numpy()))
 def test_multiple_shard_index_get(self):
     X = np.random.randn(128, 128)
     shard_sizes = [64, 64]
     X_sharded = BigMatrix("test_2", shape=X.shape, shard_sizes=shard_sizes)
     shard_matrix(X_sharded, X)
     assert (np.all(X[0:64, 0:64] == X_sharded.submatrix(0).get_block(0)))
     assert (np.all(X[64:128,
                      64:128] == X_sharded.submatrix(1, 1).get_block()))
     assert (np.all(X[0:64,
                      64:128] == X_sharded.submatrix(0, 1).get_block()))
     assert (np.all(X[64:128,
                      0:64] == X_sharded.submatrix(None, 0).get_block(1)))
 def test_step_slices(self):
     X = np.random.randn(128, 128)
     shard_sizes = [16, 16]
     X_sharded = BigMatrix("test_4", shape=X.shape, shard_sizes=shard_sizes)
     shard_matrix(X_sharded, X)
     assert (np.all(
         X[::32] == X_sharded.submatrix([None, None, 2]).numpy()[::16]))
     assert (np.all(
         X[16::32] == X_sharded.submatrix([1, None, 2]).numpy()[::16]))
     assert (np.all(X[:, 0:96:64] == X_sharded.submatrix(
         None, [0, 6, 4]).numpy()[:, ::16]))
     assert (np.all(X[:, 96:128:64] == X_sharded.submatrix(
         None, [6, 8, 4]).numpy()[:, ::16]))
Example #18
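 # Computes X Y with binops.gemm on 2x2-blocked matrices via the pywren lambda
 # executor and compares against X.dot(Y).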
 def test_multiple_shard_matrix_multiply(self):
     X = np.random.randn(16, 16)
     Y = np.random.randn(16, 16)
     shard_sizes = tuple(map(int, np.array(X.shape) / 2))
     X_sharded = BigMatrix("gemm_test_1",
                           shape=X.shape,
                           shard_sizes=shard_sizes)
     Y_sharded = BigMatrix("gemm_test_2",
                           shape=X.shape,
                           shard_sizes=shard_sizes)
     shard_matrix(X_sharded, X)
     shard_matrix(Y_sharded, Y)
     pwex = pywren.lambda_executor()
     XY_sharded = binops.gemm(pwex, X_sharded, Y_sharded, X_sharded.bucket,
                              1)
     XY_sharded_local = XY_sharded.numpy()
     XY = X.dot(Y)
     X_sharded.free()
     Y_sharded.free()
     XY_sharded.free()
     assert (np.all(np.isclose(XY, XY_sharded_local)))
     os.system("rm -rf /dev/shm/*")
Example #19
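 # Runs binops.gemv with a 2x2-blocked matrix and a single-block column vector
 # and compares against X.dot(Y).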
 def test_multiple_shard_matrix_gemv(self):
     X = np.random.randn(16, 16)
     Y = np.random.randn(16, 1)
     shard_sizes_0 = tuple(map(int, np.array(X.shape) / 2))
     shard_sizes_1 = (Y.shape[0], 1)
     X_sharded = BigMatrix("gemv_test_1",
                           shape=X.shape,
                           shard_sizes=shard_sizes_0)
     Y_sharded = BigMatrix("gemv_test_2",
                           shape=Y.shape,
                           shard_sizes=shard_sizes_1)
     shard_matrix(X_sharded, X)
     shard_matrix(Y_sharded, Y)
     pwex = pywren.default_executor()
     XY_sharded = binops.gemv(pwex, X_sharded, Y_sharded, X_sharded.bucket,
                              1)
     XY_sharded_local = XY_sharded.numpy()
     XY = X.dot(Y)
     X_sharded.free()
     Y_sharded.free()
     XY_sharded.free()
     assert (np.all(np.isclose(XY, XY_sharded_local)))
Example #20
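 # Shards a 128x128 SPD matrix into 64x64 blocks, runs uops.chol through pywren,
 # and compares against a locally computed Cholesky factor.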
 def test_multiple_shard_cholesky(self):
     np.random.seed(1)
     size = 128
     shard_size = 64
     np.random.seed(1)
     print("Generating X")
     executor = fs.ProcessPoolExecutor(cpu_count)
     X = np.random.randn(size, 128)
     print("Generating A")
     A = X.dot(X.T) + np.eye(X.shape[0])
     y = np.random.randn(size)
     pwex = pywren.default_executor()
     print("sharding A")
     shard_sizes = (shard_size, shard_size)
     A_sharded = BigSymmetricMatrix("cholesky_test_A",
                                    shape=A.shape,
                                    shard_sizes=shard_sizes)
     y_sharded = BigMatrix("cholesky_test_y",
                           shape=y.shape,
                           shard_sizes=shard_sizes[:1])
     A_sharded.free()
     y_sharded.free()
     A_sharded = BigSymmetricMatrix("cholesky_test_A",
                                    shape=A.shape,
                                    shard_sizes=shard_sizes)
     y_sharded = BigMatrix("cholesky_test_y",
                           shape=y.shape,
                           shard_sizes=shard_sizes[:1])
     t = time.time()
     shard_matrix(A_sharded, A, executor=executor)
     e = time.time()
     print("A_sharded", e - t)
     t = time.time()
     shard_matrix(y_sharded, y, executor=executor)
     e = time.time()
     print("y_sharded time", e - t)
     print("Computing LL^{T}")
     L = cholesky(A)
     print(L)
     L_sharded = uops.chol(pwex, A_sharded)
     L_sharded_local = L_sharded.numpy()
     print(L_sharded_local)
     print(L)
     print("L_{infty} difference ", np.max(np.abs(L_sharded_local - L)))
     assert (np.allclose(L, L_sharded_local))
     os.system("rm -rf /dev/shm/*")
Example #21
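 # Compiles a kernel containing a static if-branch and checks that the compiled
 # program's starter nodes match its terminators and land on the expected branch.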
 def test_if_static(self):
     X = np.random.randn(64, 64)
     shard_sizes = (int(X.shape[0]/8), X.shape[1])
     X_sharded = BigMatrix("if_test", shape=X.shape,
                           shard_sizes=shard_sizes, write_header=True)
     O_sharded = BigMatrix("if_test_output", shape=X.shape,
                           shard_sizes=shard_sizes, write_header=True)
     X_sharded.free()
     shard_matrix(X_sharded, X)
     f = frontend.lpcompile(f1_if)
     p = f(X_sharded, O_sharded, X_sharded.num_blocks(0))
     assert(p.starters == p.find_terminators())
     for s, var_values in p.starters:
         if(var_values['i'] % 2 == 0):
             assert s == 0
         else:
             assert s == 1
Example #22
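# Runs the lambdapack TSQR program on a tall-skinny 256x32 matrix and compares
# the final R block against numpy's QR factor (up to row signs).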
def test_tsqr():
    np.random.seed(1)
    size = 256
    shard_size = 32
    X = np.random.randn(size, shard_size)
    Q, R = np.linalg.qr(X)
    q0, r0 = np.linalg.qr(X[:2, :2])
    q1, r1 = np.linalg.qr(X[2:, :2])
    r2 = np.linalg.qr(np.vstack((r0, r1)))[1]
    shard_sizes = (shard_size, X.shape[1])
    X_sharded = BigMatrix("tsqr_test_X",
                          shape=X.shape,
                          shard_sizes=shard_sizes,
                          write_header=True)
    shard_matrix(X_sharded, X)
    program, meta = tsqr(X_sharded)
    executor = fs.ProcessPoolExecutor(1)
    print("starting program")
    program.start()
    future = executor.submit(job_runner.lambdapack_run,
                             program,
                             timeout=10,
                             idle_timeout=6)
    program.wait()
    program.free()
    R_sharded = meta["outputs"][0]
    num_tree_levels = int(np.log(np.ceil(size / shard_size)) / np.log(2))
    print("num_tree_levels", num_tree_levels)
    R_npw = R_sharded.get_block(max(num_tree_levels, 0), 0)
    sign_matrix_local = np.eye(R.shape[0])
    sign_matrix_remote = np.eye(R.shape[0])
    sign_matrix_local[np.where(np.diag(R) <= 0)] *= -1
    sign_matrix_remote[np.where(np.diag(R_npw) <= 0)] *= -1
    # make the signs match
    R_npw *= np.diag(sign_matrix_remote)[:, np.newaxis]
    R *= np.diag(sign_matrix_local)[:, np.newaxis]
    assert (np.allclose(R_npw, R))
 def test_single_shard_index_get(self):
     X = np.random.randn(128, 128)
     X_sharded = BigMatrix("test_0", shape=X.shape, shard_sizes=X.shape)
     shard_matrix(X_sharded, X)
     X_sharded_local = X_sharded.submatrix(0, 0).get_block()
     assert (np.all(X_sharded_local == X))
Example #24
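# Benchmark driver: shards (or reuses) a PSD input, compiles a BDFAC program,
# launches lambdapack workers through lithops, and polls SQS/Redis to log
# throughput, worker counts, and timeout statistics until the program finishes.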
def run_experiment(problem_size, shard_size, pipeline, num_priorities, lru,
                   eager, truncate, max_cores, start_cores, trial,
                   launch_granularity, timeout, log_granularity,
                   autoscale_policy, standalone, warmup, verify, matrix_exists,
                   read_limit, write_limit, compute_threads_per_worker):
    # set up logging
    invoke_executor = fs.ThreadPoolExecutor(1)
    logger = logging.getLogger()
    region = npw.config.default()["account"]["aws_region"]

    print("REGION", region)
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, num_priorities, lru, eager,
         truncate, max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, read_limit, write_limit))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    log_file = "{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))
    if standalone:
        extra_env = {
            "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"].strip(),
            "AWS_SECRET_ACCESS_KEY":
            os.environ["AWS_SECRET_ACCESS_KEY"].strip(),
            "OMP_NUM_THREADS": "1",
            "AWS_DEFAULT_REGION": region
        }
        config = npw.config.default()
        pwex = lithops.FunctionExecutor()
    else:
        extra_env = {"AWS_DEFAULT_REGION": region}
        config = npw.config.default()
        pwex = lithops.FunctionExecutor()

    if (not matrix_exists):
        X = np.random.randn(problem_size, 1)
        shard_sizes = [shard_size, 1]
        X_sharded = BigMatrix("qr_test_{0}_{1}".format(problem_size,
                                                       shard_size),
                              shape=X.shape,
                              shard_sizes=shard_sizes,
                              write_header=True,
                              autosqueeze=False,
                              bucket=config['s3']['bucket'])
        shard_matrix(X_sharded, X)
        print("Generating PSD matrix...")
        t = time.time()
        print(X_sharded.shape)
        XXT_sharded = binops.gemm(pwex,
                                  X_sharded,
                                  X_sharded.T,
                                  overwrite=False)
        e = time.time()
        print("GEMM took {0}".format(e - t))
    else:
        X_sharded = BigMatrix("qr_test_{0}_{1}".format(problem_size,
                                                       shard_size),
                              autosqueeze=False,
                              bucket="numpywrennsdi")
        key_name = binops.generate_key_name_binop(X_sharded, X_sharded.T,
                                                  "gemm")
        XXT_sharded = BigMatrix(key_name,
                                hash_keys=False,
                                bucket=config['s3']['bucket'])
    XXT_sharded.lambdav = problem_size * 10
    t = time.time()
    program, meta = bdfac(XXT_sharded, truncate=truncate)
    pipeline_width = pipeline
    if (lru):
        cache_size = 5
    else:
        cache_size = 0
    pywren_config = pwex.config
    e = time.time()
    print("Program compile took {0} seconds".format(e - t))
    print("program.hash", program.hash)
    REDIS_CLIENT = program.control_plane.client
    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    read_objects = []
    write_objects = []
    all_read_timeouts = []
    all_write_timeouts = []
    all_redis_timeouts = []
    times = [time.time()]
    flops = [0]
    reads = [0]
    writes = [0]
    print("LRU", lru)
    print("eager", eager)
    exp = {}
    exp["redis_done_counts"] = done_counts
    exp["redis_ready_counts"] = ready_counts
    exp["redis_post_op_counts"] = post_op_counts
    exp["redis_not_ready_counts"] = not_ready_counts
    exp["redis_running_counts"] = running_counts
    exp["sqs_invis_counts"] = sqs_invis_counts
    exp["sqs_vis_counts"] = sqs_vis_counts
    exp["busy_workers"] = busy_workers_counts
    exp["up_workers"] = up_workers_counts
    exp["times"] = times
    exp["lru"] = lru
    exp["priority"] = num_priorities
    exp["eager"] = eager
    exp["truncate"] = truncate
    exp["max_cores"] = max_cores
    exp["problem_size"] = problem_size
    exp["shard_size"] = shard_size
    exp["pipeline"] = pipeline
    exp["flops"] = flops
    exp["reads"] = reads
    exp["writes"] = writes
    exp["read_objects"] = read_objects
    exp["write_objects"] = write_objects
    exp["read_timeouts"] = all_read_timeouts
    exp["write_timeouts"] = all_write_timeouts
    exp["redis_timeouts"] = all_redis_timeouts
    exp["trial"] = trial
    exp["launch_granularity"] = launch_granularity
    exp["log_granularity"] = log_granularity
    exp["autoscale_policy"] = autoscale_policy
    exp["standalone"] = standalone
    exp["program"] = program
    exp["time_steps"] = 1
    exp["failed"] = False

    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    all_futures = pwex.map(
        lambda x: job_runner.lambdapack_run(program,
                                            pipeline_width=pipeline_width,
                                            cache_size=cache_size,
                                            timeout=timeout),
        range(start_cores),
        extra_env=extra_env)
    start_time = time.time()
    last_run_time = start_time
    print(program.program_status())
    print("QUEUE URLS", len(program.queue_urls))
    total_lambda_epochs = start_cores
    try:
        while (program.program_status() == lp.PS.RUNNING):
            time.sleep(log_granularity)
            curr_time = int(time.time() - start_time)
            p = program.get_progress()
            if (p is None):
                print("no progress...")
                continue
            else:
                p = int(p)
            times.append(int(time.time()))
            max_pc = p
            waiting = 0
            running = 0
            for i, queue_url in enumerate(program.queue_urls):
                client = boto3.client('sqs')
                attrs = client.get_queue_attributes(
                    QueueUrl=queue_url,
                    AttributeNames=[
                        'ApproximateNumberOfMessages',
                        'ApproximateNumberOfMessagesNotVisible'
                    ])['Attributes']
                waiting += int(attrs["ApproximateNumberOfMessages"])
                running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
            sqs_invis_counts.append(running)
            sqs_vis_counts.append(waiting)
            busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))

            repeated_compute = parse_int(
                REDIS_CLIENT.get("{0}_repeated_compute".format(program.hash)))
            repeated_post_op = parse_int(
                REDIS_CLIENT.get("{0}_repeated_post_op".format(program.hash)))
            repeated_finish = parse_int(
                REDIS_CLIENT.get("{0}_repeated_finish".format(program.hash)))
            not_ready = parse_int(
                REDIS_CLIENT.get("{0}_not_ready".format(program.hash)))
            if (busy_workers == None):
                busy_workers = 0
            else:
                busy_workers = int(busy_workers)
            up_workers = program.get_up()

            if (up_workers == None):
                up_workers = 0
            else:
                up_workers = int(up_workers)
            up_workers_counts.append(up_workers)
            busy_workers_counts.append(busy_workers)

            logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            if ((curr_time % INFO_FREQ) == 0):
                logger.info("Waiting: {0}, Currently Processing: {1}".format(
                    waiting, running))
                logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                    up_workers, busy_workers, curr_time))

            current_gflops = program.get_flops()
            if (current_gflops is None):
                current_gflops = 0
            else:
                current_gflops = int(current_gflops) / 1e9

            flops.append(current_gflops)
            current_gbytes_read = program.get_read()
            if (current_gbytes_read is None):
                current_gbytes_read = 0
            else:
                current_gbytes_read = int(current_gbytes_read) / 1e9

            reads.append(current_gbytes_read)
            current_gbytes_write = program.get_write()
            if (current_gbytes_write is None):
                current_gbytes_write = 0
            else:
                current_gbytes_write = int(current_gbytes_write) / 1e9
            writes.append(current_gbytes_write)

            gflops_rate = flops[-1] / (times[-1] - times[0])
            greads_rate = reads[-1] / (times[-1] - times[0])
            gwrites_rate = writes[-1] / (times[-1] - times[0])
            b = XXT_sharded.shard_sizes[0]
            current_objects_read = (current_gbytes_read * 1e9) / (b * b * 8)
            current_objects_write = (current_gbytes_write * 1e9) / (b * b * 8)
            read_objects.append(current_objects_read)
            write_objects.append(current_objects_write)
            read_rate = read_objects[-1] / (times[-1] - times[0])
            write_rate = write_objects[-1] / (times[-1] - times[0])

            avg_workers = np.mean(up_workers_counts)
            smooth_len = 10
            if (len(flops) > smooth_len + 5):
                gflops_rate_5_min_window = (flops[-1] - flops[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                gread_rate_5_min_window = (reads[-1] - reads[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                gwrite_rate_5_min_window = (
                    writes[-1] - writes[-smooth_len]) / (times[-1] -
                                                         times[-smooth_len])
                read_rate_5_min_window = (read_objects[-1] -
                                          read_objects[-smooth_len]) / (
                                              times[-1] - times[-smooth_len])
                write_rate_5_min_window = (write_objects[-1] -
                                           write_objects[-smooth_len]) / (
                                               times[-1] - times[-smooth_len])
                workers_5_min_window = np.mean(up_workers_counts[-smooth_len:])
            else:
                gflops_rate_5_min_window = "N/A"
                gread_rate_5_min_window = "N/A"
                gwrite_rate_5_min_window = "N/A"
                workers_5_min_window = "N/A"
                read_rate_5_min_window = "N/A"
                write_rate_5_min_window = "N/A"

            read_timeouts = int(parse_int(
                REDIS_CLIENT.get("s3.timeouts.read")))
            write_timeouts = int(
                parse_int(REDIS_CLIENT.get("s3.timeouts.write")))
            redis_timeouts = int(parse_int(REDIS_CLIENT.get("redis.timeouts")))
            all_read_timeouts.append(read_timeouts)
            all_write_timeouts.append(write_timeouts)
            all_redis_timeouts.append(redis_timeouts)
            read_timeouts_fraction = read_timeouts / (current_objects_read +
                                                      1e-8)
            write_timeouts_fraction = write_timeouts / (current_objects_write +
                                                        1e-8)
            print("=======================================")
            print(
                f"Progress is {p}, Repeated Compute is {repeated_compute}, Repeated POST OP is {repeated_post_op}, Repeated Finishes is {repeated_finish}, Not ready Nodes scheduled are {not_ready}"
            )
            print("Max PC is {0}".format(max_pc))
            print("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            print("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            print(
                "{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}"
                .format(curr_time, current_gflops, current_gbytes_read,
                        current_gbytes_write))
            print(
                "{0}: Average GFLOPS rate {1}, Average GBytes Read rate {2}, Average GBytes Write  rate {3}, Average Worker Count {4}"
                .format(curr_time, gflops_rate, greads_rate, gwrites_rate,
                        avg_workers))
            print("{0}: Average read txns/s {1}, Average write txns/s {2}".
                  format(curr_time, read_rate, write_rate))
            print(
                "{0}: smoothed GFLOPS rate {1}, smoothed GBytes Read rate {2}, smoothed GBytes Write  rate {3}, smoothed Worker Count {4}"
                .format(curr_time, gflops_rate_5_min_window,
                        gread_rate_5_min_window, gwrite_rate_5_min_window,
                        workers_5_min_window))
            print("{0}: smoothed read txns/s {1}, smoothed write txns/s {2}".
                  format(curr_time, read_rate_5_min_window,
                         write_rate_5_min_window))
            print(
                "{0}: Read timeouts: {1}, Write timeouts: {2}, Redis timeouts: {3}  "
                .format(curr_time, read_timeouts, write_timeouts,
                        redis_timeouts))
            print(
                "{0}: Read timeouts fraction: {1}, Write timeouts fraction: {2}"
                .format(curr_time, read_timeouts_fraction,
                        write_timeouts_fraction))
            print("=======================================")

            time_since_launch = time.time() - last_run_time
            if (time_since_launch > (0.85 * timeout)):
                cores_to_launch = max_cores
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                new_futures = pwex.map(lambda x: job_runner.lambdapack_run(
                    program,
                    pipeline_width=pipeline_width,
                    cache_size=cache_size,
                    timeout=timeout),
                                       range(cores_to_launch),
                                       extra_env=extra_env)
                # print("waiting for second result")
                # print("result..", new_futures[0].result())
                # print([x.result() for x in new_futures])

                last_run_time = time.time()
                all_futures.extend(new_futures)
            exp["time_steps"] += 1
    except KeyboardInterrupt:
        exp["failed"] = True
        program.stop()
        pass
    except Exception as e:
        traceback.print_exc()
        exp["failed"] = True
        program.stop()
        raise
    print(program.program_status())
    exp["all_futures"] = all_futures
    exp_bytes = dill.dumps(exp)
    client = boto3.client('s3')
    client.put_object(Key="lambdapack/{0}/runtime.pickle".format(program.hash),
                      Body=exp_bytes,
                      Bucket=program.bucket)
    print("=======================")
    print("=======================")
    print("Execution Summary:")
    print("Executed Program ID: {0}".format(program.hash))
    print("Program Success: {0}".format((not exp["failed"])))
    print("Problem Size: {0}".format(exp["problem_size"]))
    print("Shard Size: {0}".format(exp["shard_size"]))
    print("Total Execution time: {0}".format(times[-1] - times[0]))
    print("Average Flop Rate (GFlop/s): {0}".format(exp["flops"][-1] /
                                                    (times[-1] - times[0])))
    with open("/tmp/last_run", "w+") as f:
        f.write(program.hash)
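# parse_int is used above but not defined in this excerpt. A minimal sketch that
# is consistent with how it is called (Redis GET returns bytes or None) might be:
def parse_int(value):
    # Treat a missing key as zero, otherwise coerce the bytes/str value to int.
    return 0 if value is None else int(value)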
 def test_single_shard_index_put(self):
     X = np.random.randn(128, 128)
     X_sharded = BigMatrix("test_1", shape=X.shape, shard_sizes=X.shape)
     X_sharded.submatrix(0, 0).put_block(X)
     assert (np.all(X_sharded.numpy() == X))
Example #26
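    # Repeatedly re-enqueues already-scheduled nodes into the SQS queue while a
    # Cholesky program runs, then checks edge sums and the final factor to make
    # sure duplicate messages do not corrupt the result.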
    def test_cholesky_multi_repeats(self):
        ''' Insert repeated instructions into the PC queue to check that double increments are avoided '''

        print("RUNNING MULTI")
        np.random.seed(1)
        size = 256
        shard_size = 30
        repeats = 15
        total_repeats = 150
        np.random.seed(2)
        print("Generating X")
        X = np.random.randn(size, 128)
        print("Generating A")
        A = X.dot(X.T) + size*np.eye(X.shape[0])
        shard_sizes = (shard_size, shard_size)
        A_sharded = BigMatrix("cholesky_test_A_{0}".format(
            int(time.time())), shape=A.shape, shard_sizes=shard_sizes, write_header=True)
        A_sharded.free()
        shard_matrix(A_sharded, A)
        instructions, trailing, L_sharded = compiler._chol(A_sharded)
        all_nodes = instructions.unroll_program()
        L_sharded.free()
        pwex = pywren.default_executor()
        executor = pywren.lambda_executor
        config = npw.config.default()
        pywren_config = pwex.config
        program = lp.LambdaPackProgram(
            instructions, executor=executor, pywren_config=pywren_config, config=config, eager=True)
        print("PROGRAM HASH", program.hash)
        cores = 1
        program.start()
        jobs = []

        for c in range(cores):
            p = mp.Process(target=job_runner.lambdapack_run, args=(
                program,), kwargs={'timeout': 3600, 'pipeline_width': 5})
            jobs.append(p)
            p.start()

        np.random.seed(0)
        try:
            while(program.program_status() == lp.PS.RUNNING):
                sqs = boto3.resource(
                    'sqs', region_name=program.control_plane.region)
                time.sleep(0.5)
                waiting = 0
                running = 0
                for i, queue_url in enumerate(program.queue_urls):
                    client = boto3.client('sqs')
                    print("Priority {0}".format(i))
                    attrs = client.get_queue_attributes(QueueUrl=queue_url, AttributeNames=[
                                                        'ApproximateNumberOfMessages', 'ApproximateNumberOfMessagesNotVisible'])['Attributes']
                    print(attrs)
                    waiting += int(attrs["ApproximateNumberOfMessages"])
                    running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
                print("SQS QUEUE STATUS Waiting {0}, Running {1}".format(
                    waiting, running))
                for i in range(repeats):
                    p = program.get_progress()
                    if (p is None):
                        continue
                    else:
                        p = int(p)
                    pc = int(np.random.choice(min(p, len(all_nodes)), 1))
                    node = all_nodes[pc]
                    queue = sqs.Queue(program.queue_urls[0])
                    total_repeats -= 1
                    if (total_repeats > 0):
                        print("Malicilously enqueueing node ",
                              pc, node, total_repeats)
                        queue.send_message(MessageBody=json.dumps(node))
                    time.sleep(1)
        # for p in jobs:
        #    p.join()
        except:
            pass

        print("Program status")
        print(program.program_status())
        for node in all_nodes:
            edge_sum = lp.get(program.control_plane.client,
                              program._node_edge_sum_key(*node))
            if (edge_sum == None):
                edge_sum = 0
            edge_sum = int(edge_sum)
            parents = program.program.get_parents(*node)
            children = program.program.get_children(*node)
            indegree = len(parents)
            node_status = program.get_node_status(*node)
            redis_str = "Node: {0}, Edge Sum: {1}, Indegree: {2}, Node Status {3}".format(
                node, edge_sum, indegree, node_status)
            if (edge_sum != indegree):
                print(redis_str)
                for p in parents:
                    p_status = program.get_node_status(*p)
                    edge_key = program._edge_key(p[0], p[1], node[0], node[1])
                    edge_value = lp.get(program.control_plane.client, edge_key)
                    child_str = "Parent Node: {0}, Parent Status: {1}, Edge Key: {2}".format(
                        p, p_status, edge_value)
                    print(child_str)
            #assert(edge_sum == indegree)
        program.free()
        L_npw = L_sharded.numpy()
        L = np.linalg.cholesky(A)
        z = np.argmax(np.abs(L - L_npw))
        assert(np.allclose(L_npw, L))
Example #27
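    # Randomly kills and restarts local worker processes while a Cholesky program
    # runs to check that the result is still correct despite failures.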
    def test_cholesky_multi_failures(self):
        ''' Randomly kill and restart workers to check that the program still completes correctly '''

        print("RUNNING MULTI")
        np.random.seed(1)
        size = 256
        shard_size = 64
        failures = 4
        np.random.seed(1)
        print("Generating X")
        X = np.random.randn(size, 128)
        print("Generating A")
        A = X.dot(X.T) + size*np.eye(X.shape[0])
        shard_sizes = (shard_size, shard_size)
        A_sharded = BigMatrix("cholesky_test_A", shape=A.shape,
                              shard_sizes=shard_sizes, write_header=True)
        A_sharded.free()
        shard_matrix(A_sharded, A)
        instructions, trailing, L_sharded = compiler._chol(A_sharded)
        pwex = pywren.default_executor()
        executor = pywren.lambda_executor
        pywren_config = pwex.config
        config = npw.config.default()
        program = lp.LambdaPackProgram(
            instructions, executor=executor, pywren_config=pywren_config, config=config, eager=False)
        cores = 16
        program.start()
        jobs = []

        for c in range(cores):
            p = mp.Process(target=job_runner.lambdapack_run, args=(
                program,), kwargs={'timeout': 3600, 'pipeline_width': 4})
            jobs.append(p)
            p.start()

        np.random.seed(0)
        while(program.program_status() == lp.PS.RUNNING):
            sqs = boto3.resource(
                'sqs', region_name=program.control_plane.region)
            waiting = 0
            running = 0
            for i, queue_url in enumerate(program.queue_urls):
                client = boto3.client('sqs')
                print("Priority {0}".format(i))
                attrs = client.get_queue_attributes(QueueUrl=queue_url, AttributeNames=[
                                                    'ApproximateNumberOfMessages', 'ApproximateNumberOfMessagesNotVisible'])['Attributes']
                print(attrs)
                waiting += int(attrs["ApproximateNumberOfMessages"])
                running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
            print("SQS QUEUE STATUS Waiting {0}, Running {1}".format(
                waiting, running))
            time.sleep(10)
            if (np.random.random() > 0.65):
                for i in range(failures):
                    core = int(np.random.choice(cores, 1)[0])
                    print("Maliciously Killing a job!")
                    jobs[core].terminate()
                    p = mp.Process(target=job_runner.lambdapack_run, args=(
                        program,), kwargs={'timeout': 3600, 'pipeline_width': 4})
                    p.start()
                    jobs[core] = p

        for p in jobs:
            p.join()

        print("Program status")
        print(program.program_status())
        program.free()
        L_npw = L_sharded.numpy()
        L = np.linalg.cholesky(A)
        print(L_npw)
        print(L)
        print("MAX ", np.max(np.abs(L - L_npw)))
        assert(np.allclose(L_npw, L))
Example #28
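# Failure-injection benchmark: like the driver above, but runs Cholesky through
# pywren, periodically sets Redis failure keys to kill a fraction of workers, and
# records failure events alongside throughput statistics.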
def run_experiment(problem_size, shard_size, pipeline, priority, lru, eager,
                   truncate, max_cores, start_cores, trial, launch_granularity,
                   timeout, log_granularity, autoscale_policy,
                   failure_percentage, max_failure_events, failure_time):
    # set up logging
    logger = logging.getLogger()
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, priority, lru, eager, truncate,
         max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, failure_percentage,
         max_failure_events, failure_time))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    log_file = "failure_experiments/{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))

    X = np.random.randn(problem_size, 1)
    pwex = pywren.default_executor()
    shard_sizes = [shard_size, 1]
    X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(
        problem_size, shard_size),
                          shape=X.shape,
                          shard_sizes=shard_sizes,
                          write_header=True)
    shard_matrix(X_sharded, X)
    print("Generating PSD matrix...")
    XXT_sharded = binops.gemm(pwex, X_sharded, X_sharded.T, overwrite=False)
    XXT_sharded.lambdav = problem_size * 10
    instructions, L_sharded, trailing = lp._chol(XXT_sharded)
    pipeline_width = pipeline
    if (priority):
        num_priorities = 5
    else:
        num_priorities = 1
    if (lru):
        cache_size = 5
    else:
        cache_size = 0

    REDIS_CLIENT = redis.StrictRedis(REDIS_ADDR,
                                     port=REDIS_PORT,
                                     password=REDIS_PASS,
                                     db=0,
                                     socket_timeout=5)

    if (truncate is not None):
        instructions = instructions[:truncate]
    config = pwex.config

    program = lp.LambdaPackProgram(instructions,
                                   executor=pywren.lambda_executor,
                                   pywren_config=config,
                                   num_priorities=num_priorities,
                                   eager=eager)
    redis_env = {
        "REDIS_ADDR": os.environ.get("REDIS_ADDR", ""),
        "REDIS_PASS": os.environ.get("REDIS_PASS", "")
    }

    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    times = []
    flops = []
    reads = []
    writes = []
    failure_times = []
    exp = {}
    exp["redis_done_counts"] = done_counts
    exp["redis_ready_counts"] = ready_counts
    exp["redis_post_op_counts"] = post_op_counts
    exp["redis_not_ready_counts"] = not_ready_counts
    exp["redis_running_counts"] = running_counts
    exp["sqs_invis_counts"] = sqs_invis_counts
    exp["sqs_vis_counts"] = sqs_vis_counts
    exp["busy_workers"] = busy_workers_counts
    exp["up_workers"] = up_workers_counts
    exp["times"] = times
    exp["lru"] = lru
    exp["priority"] = priority
    exp["eager"] = eager
    exp["truncate"] = truncate
    exp["max_cores"] = max_cores
    exp["problem_size"] = problem_size
    exp["shard_size"] = shard_size
    exp["pipeline"] = pipeline
    exp["flops"] = flops
    exp["reads"] = reads
    exp["writes"] = writes
    exp["trial"] = trial
    exp["launch_granularity"] = launch_granularity
    exp["log_granularity"] = log_granularity
    exp["autoscale_policy"] = autoscale_policy
    exp["failure_times"] = failure_times

    logger.info("Longest Path: {0}".format(program.longest_path))
    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    failure_keys = [
        "{0}_failure_{1}_{2}".format(program.hash, i, 0)
        for i in range(start_cores)
    ]
    all_futures = pwex.map(lambda x: job_runner.lambdapack_run_with_failures(
        failure_keys[x],
        program,
        pipeline_width=pipeline_width,
        cache_size=cache_size,
        timeout=timeout),
                           range(start_cores),
                           extra_env=redis_env)
    start_time = time.time()
    last_run_time = start_time
    last_failure = time.time()
    num_failure_events = 0

    while (program.program_status() == lp.PS.RUNNING):
        curr_time = int(time.time() - start_time)
        max_pc = program.get_max_pc()
        times.append(int(time.time()))
        time.sleep(log_granularity)
        waiting = 0
        running = 0
        for i, queue_url in enumerate(program.queue_urls):
            client = boto3.client('sqs')
            attrs = client.get_queue_attributes(
                QueueUrl=queue_url,
                AttributeNames=[
                    'ApproximateNumberOfMessages',
                    'ApproximateNumberOfMessagesNotVisible'
                ])['Attributes']
            waiting += int(attrs["ApproximateNumberOfMessages"])
            running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
        sqs_invis_counts.append(running)
        sqs_vis_counts.append(waiting)
        busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
        if (busy_workers == None):
            busy_workers = 0
        else:
            busy_workers = int(busy_workers)
        up_workers = program.get_up()

        if (up_workers == None):
            up_workers = 0
        else:
            up_workers = int(up_workers)
        up_workers_counts.append(up_workers)
        busy_workers_counts.append(busy_workers)

        logger.debug("Waiting: {0}, Currently Processing: {1}".format(
            waiting, running))
        logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
            up_workers, busy_workers, curr_time))
        if ((curr_time % INFO_FREQ) == 0):
            logger.info("Max PC is {0}".format(max_pc))
            logger.info("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))

        #print("{5}: Not Ready: {0}, Ready: {1}, Running: {4}, Post OP: {2},  Done: {3}".format(not_ready_count, ready_count, post_op_count, done_count, running_count, curr_time))
        current_gflops = program.get_flops()
        if (current_gflops is None):
            current_gflops = 0
        else:
            current_gflops = int(current_gflops) / 1e9

        flops.append(current_gflops)
        current_gbytes_read = program.get_read()
        if (current_gbytes_read is None):
            current_gbytes_read = 0
        else:
            current_gbytes_read = int(current_gbytes_read) / 1e9

        reads.append(current_gbytes_read)
        current_gbytes_write = program.get_write()
        if (current_gbytes_write is None):
            current_gbytes_write = 0
        else:
            current_gbytes_write = int(current_gbytes_write) / 1e9
        writes.append(current_gbytes_write)
        #print("{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}".format(curr_time, current_gflops, current_gbytes_read, current_gbytes_write))

        time_since_launch = time.time() - last_run_time
        if (autoscale_policy == "dynamic"):
            if (time_since_launch > launch_granularity
                    and up_workers < np.ceil(waiting * 0.5 / pipeline_width)
                    and up_workers < max_cores):
                cores_to_launch = int(
                    min(
                        np.ceil(waiting / pipeline_width) - up_workers,
                        max_cores - up_workers))
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                _failure_keys = [
                    "{0}_failure_{1}_{2}".format(program.hash, i, curr_time)
                    for i in range(cores_to_launch)
                ]
                new_futures = pwex.map(
                    lambda x: job_runner.lambdapack_run_with_failures(
                        _failure_keys[x],
                        program,
                        pipeline_width=pipeline_width,
                        cache_size=cache_size,
                        timeout=timeout),
                    range(cores_to_launch),
                    extra_env=redis_env)
                last_run_time = time.time()
                # check if we OOM-erred
                # [x.result() for x in all_futures]
                all_futures.extend(new_futures)
        elif (autoscale_policy == "constant_timeout"):
            if (time_since_launch > (0.75 * timeout)):
                cores_to_launch = max_cores
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                _failure_keys = [
                    "{0}_failure_{1}_{2}".format(program.hash, i, curr_time)
                    for i in range(cores_to_launch)
                ]
                new_futures = pwex.map(
                    lambda x: job_runner.lambdapack_run_with_failures(
                        _failure_keys[x],
                        program,
                        pipeline_width=pipeline_width,
                        cache_size=cache_size,
                        timeout=timeout),
                    range(cores_to_launch),
                    extra_env=redis_env)
                last_run_time = time.time()
                failure_keys += _failure_keys
                # check if we OOM-erred
                # [x.result() for x in all_futures]
                all_futures.extend(new_futures)
        else:
            raise Exception("unknown autoscale policy")

        if ((time.time() - last_failure) > failure_time
                and num_failure_events < max_failure_events):
            logging.info("Killing some jobs")
            idxs = np.random.choice(len(failure_keys),
                                    int(failure_percentage *
                                        len(failure_keys)),
                                    replace=False)
            num_failure_events += 1
            last_failure = time.time()
            failure_times.append(last_failure)
            for i in idxs:
                logging.info("Killing: job {0}".format(i))
                REDIS_CLIENT.set(failure_keys[i], 1)

    exp["all_futures"] = all_futures
    for pc in range(program.num_inst_blocks):
        run_count = REDIS_CLIENT.get("{0}_{1}_start".format(program.hash, pc))
        if (run_count is None):
            run_count = 0
        else:
            run_count = int(run_count)

        if (run_count != 1):
            logger.info("PC: {0}, Run Count: {1}".format(pc, run_count))

    e = time.time()
    logger.info(program.program_status())
    logger.info("PROGRAM STATUS " + str(program.program_status()))
    logger.info("PROGRAM HASH " + str(program.hash))
    logger.info("Took {0} seconds".format(e - t))
    exp["total_runtime"] = e - t
    exp["num_failure_events"] = num_failure_events
    # collect per-instruction-block profiling info in parallel
    executor = fs.ThreadPoolExecutor(72)
    futures = []
    for i in range(0, program.num_inst_blocks, 1):
        futures.append(executor.submit(program.get_profiling_info, i))
    res = fs.wait(futures)
    profiled_blocks = [f.result() for f in futures]
    serializer = serialize.SerializeIndependent()
    byte_string = serializer([profiled_blocks])[0][0]
    exp["profiled_block_pickle_bytes"] = byte_string

    read, write, total_flops, bins, instructions, runtimes = lp.perf_profile(
        profiled_blocks, num_bins=100)
    flop_rate = sum(total_flops) / max(bins)
    exp["flop_rate"] = flop_rate
    print("Average Flop rate of {0}".format(flop_rate))
    # save other stuff
    try:
        os.mkdir("failure_experiments/")
    except FileExistsError:
        pass
    exp_bytes = pickle.dumps(exp)
    dump_path = "failure_experiments/{0}.pickle".format(arg_hash)
    print("Dumping experiment pickle to {0}".format(dump_path))
    with open(dump_path, "wb+") as f:
        f.write(exp_bytes)
Example #29
0
def run_experiment(problem_size, shard_size, pipeline, num_priorities, lru,
                   eager, truncate, max_cores, start_cores, trial,
                   launch_granularity, timeout, log_granularity,
                   autoscale_policy, standalone, warmup, verify, matrix_exists,
                   read_limit, write_limit):
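    """Run one Cholesky scaling experiment: build (or reuse) the sharded input
    matrix, compile it to a lambdapack program, launch pywren workers under the
    chosen autoscale policy, and record progress counters for later analysis."""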
    # set up logging
    invoke_executor = fs.ThreadPoolExecutor(1)
    logger = logging.getLogger()
    region = wc.default()["account"]["aws_region"]
    print("REGION", region)
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, num_priorities, lru, eager,
         truncate, max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, read_limit, write_limit))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
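    # the md5 of the argument tuple identifies this run configuration and names the log file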
    log_file = "{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))
    if standalone:
        extra_env = {
            "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
            "AWS_SECRET_ACCESS_KEY": os.environ["AWS_ACCESS_KEY_ID"],
            "OMP_NUM_THREADS": "1",
            "AWS_DEFAULT_REGION": region
        }
        config = wc.default()
        config['runtime']['s3_bucket'] = 'numpywrenpublic'
        key = "pywren.runtime/pywren_runtime-3.6-numpywren-standalone.tar.gz"
        config['runtime']['s3_key'] = key
        pwex = pywren.standalone_executor(config=config)
    else:
        extra_env = {"AWS_DEFAULT_REGION": region}
        config = wc.default()
        config['runtime']['s3_bucket'] = 'numpywrenpublic-us-east-1'
        key = "pywren.runtime/pywren_runtime-3.6-numpywren-08-25-2018.tar.gz"
        config['runtime']['s3_key'] = key
        pwex = pywren.default_executor(config=config)

    if (not matrix_exists):
        X = np.random.randn(problem_size, 1)
        shard_sizes = [shard_size, 1]
        X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(
            problem_size, shard_size),
                              shape=X.shape,
                              shard_sizes=shard_sizes,
                              write_header=True,
                              autosqueeze=False,
                              bucket="numpywrentop500test",
                              hash_keys=False)
        shard_matrix(X_sharded, X)
        print("Generating PSD matrix...")
        t = time.time()
        print(X_sharded.shape)
        XXT_sharded = binops.gemm(pwex,
                                  X_sharded,
                                  X_sharded.T,
                                  overwrite=False)
        e = time.time()
        print("GEMM took {0}".format(e - t))
    else:
        X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(
            problem_size, shard_size),
                              autosqueeze=False,
                              hash_keys=False,
                              bucket="numpywrentop500test")
        key_name = binops.generate_key_name_binop(X_sharded, X_sharded.T,
                                                  "gemm")
        XXT_sharded = BigMatrix(key_name,
                                hash_keys=False,
                                bucket="numpywrentop500test")
    XXT_sharded.lambdav = problem_size * 10
    if (verify):
        A = XXT_sharded.numpy()
        print("Computing local cholesky")
        L = np.linalg.cholesky(A)

    t = time.time()
    instructions, trailing, L_sharded = compiler._chol(XXT_sharded,
                                                       truncate=truncate)
    pipeline_width = pipeline
    if (lru):
        cache_size = 5
    else:
        cache_size = 0
    pywren_config = pwex.config
    config = npw.config.default()
    program = lp.LambdaPackProgram(instructions,
                                   executor=pywren.lambda_executor,
                                   pywren_config=pywren_config,
                                   num_priorities=num_priorities,
                                   eager=eager,
                                   config=config,
                                   write_limit=write_limit,
                                   read_limit=read_limit)
    warmup_start = time.time()
    if (warmup):
        warmup_sleep = 170
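        # pre-warm Lambda containers: each warmup task registers itself as "up",
        # sleeps, then deregisters; the loop below re-invokes a wave before the
        # previous one expires, until max_cores workers are alive simultaneously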

        def warmup_fn(x):
            program.incr_up(1)
            time.sleep(warmup_sleep)
            program.decr_up(1)

        print("Warming up...")
        futures = pwex.map(warmup_fn, range(max_cores))
        last_spinup = time.time()
        while (True):
            if ((time.time() - last_spinup) > 0.75 * warmup_sleep):
                print("Calling pwex.map..")
                futures = pwex.map(warmup_fn, range(max_cores))
                last_spinup = time.time()
            time.sleep(2)
            if (program.get_up() is None):
                up_workers = 0
            else:
                up_workers = int(program.get_up())
            print("{0} workers alive".format(up_workers))
            if (up_workers >= max_cores):
                time.sleep(warmup_sleep)
                break

    warmup_end = time.time()
    print("Warmup took {0} seconds".format(warmup_end - warmup_start))
    e = time.time()
    print("Program compile took {0} seconds".format(e - t))
    print("program.hash", program.hash)
    REDIS_CLIENT = program.control_plane.client
    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    read_objects = []
    write_objects = []
    all_read_timeouts = []
    all_write_timeouts = []
    all_redis_timeouts = []
    times = [time.time()]
    flops = [0]
    reads = [0]
    writes = [0]
    print("LRU", lru)
    print("eager", eager)
    exp = {}
    exp["redis_done_counts"] = done_counts
    exp["redis_ready_counts"] = ready_counts
    exp["redis_post_op_counts"] = post_op_counts
    exp["redis_not_ready_counts"] = not_ready_counts
    exp["redis_running_counts"] = running_counts
    exp["sqs_invis_counts"] = sqs_invis_counts
    exp["sqs_vis_counts"] = sqs_vis_counts
    exp["busy_workers"] = busy_workers_counts
    exp["up_workers"] = up_workers_counts
    exp["times"] = times
    exp["lru"] = lru
    exp["priority"] = num_priorities
    exp["eager"] = eager
    exp["truncate"] = truncate
    exp["max_cores"] = max_cores
    exp["problem_size"] = problem_size
    exp["shard_size"] = shard_size
    exp["pipeline"] = pipeline
    exp["flops"] = flops
    exp["reads"] = reads
    exp["writes"] = writes
    exp["read_objects"] = read_objects
    exp["write_objects"] = write_objects
    exp["read_timeouts"] = all_read_timeouts
    exp["write_timeouts"] = all_write_timeouts
    exp["redis_timeouts"] = all_redis_timeouts
    exp["trial"] = trial
    exp["launch_granularity"] = launch_granularity
    exp["log_granularity"] = log_granularity
    exp["autoscale_policy"] = autoscale_policy
    exp["standalone"] = standalone
    exp["program"] = program
    exp["time_steps"] = 1
    exp["failed"] = False

    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    invoker = fs.ThreadPoolExecutor(1)
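    # submit the initial pwex.map from a single background thread so the
    # monitoring loop below starts immediately instead of blocking on invocation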
    all_future_futures = invoker.submit(lambda: pwex.map(
        lambda x: job_runner.lambdapack_run(program,
                                            pipeline_width=pipeline_width,
                                            cache_size=cache_size,
                                            timeout=timeout),
        range(start_cores),
        extra_env=extra_env))
    # print(all_future_futures.result())
    all_futures = [all_future_futures]
    # print([f.result() for f in all_futures])
    start_time = time.time()
    last_run_time = start_time
    print(program.program_status())
    print("QUEUE URLS", len(program.queue_urls))
    total_lambda_epochs = start_cores
    try:
        while (program.program_status() == lp.PS.RUNNING):
            time.sleep(log_granularity)
            curr_time = int(time.time() - start_time)
            p = program.get_progress()
            if (p is None):
                print("no progress...")
                continue
            else:
                p = int(p)
            times.append(int(time.time()))
            max_pc = p
            waiting = 0
            running = 0
            for i, queue_url in enumerate(program.queue_urls):
                client = boto3.client('sqs')
                attrs = client.get_queue_attributes(
                    QueueUrl=queue_url,
                    AttributeNames=[
                        'ApproximateNumberOfMessages',
                        'ApproximateNumberOfMessagesNotVisible'
                    ])['Attributes']
                waiting += int(attrs["ApproximateNumberOfMessages"])
                running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
            sqs_invis_counts.append(running)
            sqs_vis_counts.append(waiting)
            busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
            if (busy_workers is None):
                busy_workers = 0
            else:
                busy_workers = int(busy_workers)
            up_workers = program.get_up()

            if (up_workers is None):
                up_workers = 0
            else:
                up_workers = int(up_workers)
            up_workers_counts.append(up_workers)
            busy_workers_counts.append(busy_workers)

            logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            if ((curr_time % INFO_FREQ) == 0):
                logger.info("Waiting: {0}, Currently Processing: {1}".format(
                    waiting, running))
                logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                    up_workers, busy_workers, curr_time))

            current_gflops = program.get_flops()
            if (current_gflops is None):
                current_gflops = 0
            else:
                current_gflops = int(current_gflops) / 1e9

            flops.append(current_gflops)
            current_gbytes_read = program.get_read()
            if (current_gbytes_read is None):
                current_gbytes_read = 0
            else:
                current_gbytes_read = int(current_gbytes_read) / 1e9

            reads.append(current_gbytes_read)
            current_gbytes_write = program.get_write()
            if (current_gbytes_write is None):
                current_gbytes_write = 0
            else:
                current_gbytes_write = int(current_gbytes_write) / 1e9
            writes.append(current_gbytes_write)

            gflops_rate = flops[-1] / (times[-1] - times[0])
            greads_rate = reads[-1] / (times[-1] - times[0])
            gwrites_rate = writes[-1] / (times[-1] - times[0])
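            # each shard is a b x b float64 block (b * b * 8 bytes), so dividing
            # total bytes moved by the block size estimates S3 objects read/written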
            b = XXT_sharded.shard_sizes[0]
            current_objects_read = (current_gbytes_read * 1e9) / (b * b * 8)
            current_objects_write = (current_gbytes_write * 1e9) / (b * b * 8)
            read_objects.append(current_objects_read)
            write_objects.append(current_objects_write)
            read_rate = read_objects[-1] / (times[-1] - times[0])
            write_rate = write_objects[-1] / (times[-1] - times[0])

            avg_workers = np.mean(up_workers_counts)
            smooth_len = 10
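            # the "5_min_window" quantities are smoothed over the last smooth_len
            # samples, i.e. roughly smooth_len * log_granularity seconds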
            if (len(flops) > smooth_len + 5):
                gflops_rate_5_min_window = (flops[-1] - flops[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                gread_rate_5_min_window = (reads[-1] - reads[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                gwrite_rate_5_min_window = (
                    writes[-1] - writes[-smooth_len]) / (times[-1] -
                                                         times[-smooth_len])
                read_rate_5_min_window = (read_objects[-1] -
                                          read_objects[-smooth_len]) / (
                                              times[-1] - times[-smooth_len])
                write_rate_5_min_window = (write_objects[-1] -
                                           write_objects[-smooth_len]) / (
                                               times[-1] - times[-smooth_len])
                workers_5_min_window = np.mean(up_workers_counts[-smooth_len:])
            else:
                gflops_rate_5_min_window = "N/A"
                gread_rate_5_min_window = "N/A"
                gwrite_rate_5_min_window = "N/A"
                workers_5_min_window = "N/A"
                read_rate_5_min_window = "N/A"
                write_rate_5_min_window = "N/A"

            # these counters may not have been created yet, so default to zero
            read_timeouts = int(REDIS_CLIENT.get("s3.timeouts.read") or 0)
            write_timeouts = int(REDIS_CLIENT.get("s3.timeouts.write") or 0)
            redis_timeouts = int(REDIS_CLIENT.get("redis.timeouts") or 0)
            all_read_timeouts.append(read_timeouts)
            all_write_timeouts.append(write_timeouts)
            all_redis_timeouts.append(redis_timeouts)
            # guard against division by zero before any blocks have been read/written
            read_timeouts_fraction = read_timeouts / max(current_objects_read, 1)
            write_timeouts_fraction = write_timeouts / max(current_objects_write, 1)
            print("=======================================")
            print("Max PC is {0}".format(max_pc))
            print("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            print("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            print(
                "{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}"
                .format(curr_time, current_gflops, current_gbytes_read,
                        current_gbytes_write))
            print(
                "{0}: Average GFLOPS rate {1}, Average GBytes Read rate {2}, Average GBytes Write  rate {3}, Average Worker Count {4}"
                .format(curr_time, gflops_rate, greads_rate, gwrites_rate,
                        avg_workers))
            print("{0}: Average read txns/s {1}, Average write txns/s {2}".
                  format(curr_time, read_rate, write_rate))
            print(
                "{0}: smoothed GFLOPS rate {1}, smoothed GBytes Read rate {2}, smoothed GBytes Write  rate {3}, smoothed Worker Count {4}"
                .format(curr_time, gflops_rate_5_min_window,
                        gread_rate_5_min_window, gwrite_rate_5_min_window,
                        workers_5_min_window))
            print("{0}: smoothed read txns/s {1}, smoothed write txns/s {2}".
                  format(curr_time, read_rate_5_min_window,
                         write_rate_5_min_window))
            print(
                "{0}: Read timeouts: {1}, Write timeouts: {2}, Redis timeouts: {3}  "
                .format(curr_time, read_timeouts, write_timeouts,
                        redis_timeouts))
            print(
                "{0}: Read timeouts fraction: {1}, Write timeouts fraction: {2}"
                .format(curr_time, read_timeouts_fraction,
                        write_timeouts_fraction))
            print("=======================================")

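            # autoscaling: "dynamic" tops the pool up to cover the SQS backlog
            # (never past max_cores); "constant_timeout" relaunches a full wave of
            # max_cores workers shortly before the previous wave's timeout expires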
            time_since_launch = time.time() - last_run_time
            if (autoscale_policy == "dynamic"):
                if (time_since_launch > launch_granularity and
                        up_workers < np.ceil(waiting * 0.5 / pipeline_width)
                        and up_workers < max_cores):
                    cores_to_launch = int(
                        min(
                            np.ceil(waiting / pipeline_width) - up_workers,
                            max_cores - up_workers))
                    logger.info(
                        "launching {0} new tasks....".format(cores_to_launch))
                    new_future_futures = invoker.submit(
                        lambda: pwex.map(lambda x: job_runner.lambdapack_run(
                            program,
                            pipeline_width=pipeline_width,
                            cache_size=cache_size,
                            timeout=timeout),
                                         range(cores_to_launch),
                                         extra_env=extra_env))
                    last_run_time = time.time()
                    # check if we OOM-erred
                    # [x.result() for x in all_futures]
                    all_futures.append(new_future_futures)
            elif (autoscale_policy == "constant_timeout"):
                if (time_since_launch > (0.85 * timeout)):
                    cores_to_launch = max_cores
                    logger.info(
                        "launching {0} new tasks....".format(cores_to_launch))
                    new_future_futures = invoker.submit(
                        lambda: pwex.map(lambda x: job_runner.lambdapack_run(
                            program,
                            pipeline_width=pipeline_width,
                            cache_size=cache_size,
                            timeout=timeout),
                                         range(cores_to_launch),
                                         extra_env=extra_env))
                    last_run_time = time.time()
                    # check if we OOM-erred
                    # [x.result() for x in all_futures]
                    all_futures.append(new_future_futures)
            else:
                raise Exception("unknown autoscale policy")
            exp["time_steps"] += 1
        if (verify):
            L_sharded_local = L_sharded.numpy()
            print("max diff", np.max(np.abs(L_sharded_local - L)))
    except KeyboardInterrupt:
        exp["failed"] = True
        program.stop()
        pass
    except Exception as e:
        traceback.print_exc()
        exp["failed"] = True
        program.stop()
        raise
    print(program.program_status())
    exp["all_futures"] = all_futures
    exp_bytes = dill.dumps(exp)
    client = boto3.client('s3')
    client.put_object(Key="lambdapack/{0}/runtime.pickle".format(program.hash),
                      Body=exp_bytes,
                      Bucket=program.bucket)
    print("=======================")
    print("=======================")
    print("Execution Summary:")
    print("Executed Program ID: {0}".format(program.hash))
    print("Program Success: {0}".format((not exp["failed"])))
    print("Problem Size: {0}".format(exp["problem_size"]))
    print("Shard Size: {0}".format(exp["shard_size"]))
    print("Total Execution time: {0}".format(times[-1] - times[0]))
    print("Average Flop Rate (GFlop/s): {0}".format(exp["flops"][-1] /
                                                    (times[-1] - times[0])))
    with open("/tmp/last_run", "w+") as f:
        f.write(program.hash)
import time
import numpy as np
import pywren
from numpywren.matrix import BigMatrix
from numpywren.matrix_init import shard_matrix
from numpywren.binops import gemm

pwex = pywren.lambda_executor()

Ns = [5000, 10000, 15000, 20000, 25000, 30000]
shard_size = (5000, 5000)

np.random.seed(42)

# Only run this if the matrix blocks are not already in the bucket.
# This takes a very long time (30000x30000 float64 is ~7.2 GB of data).
# Big_X = BigMatrix("multiply_test2", shape=(max(Ns), max(Ns)), shard_sizes=shard_size)
# for i in range(Big_X.num_blocks(0)):
#     for j in range(Big_X.num_blocks(1)):
#         X = np.random.randn(5000, 5000)
#         Big_X.put_block(X, i, j)

# start = time.time()
# gemm(pwex, Big_X, Big_X, Big_X.bucket, 1)
# end = time.time()
# print(end - start)

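# time one distributed GEMM per problem size; every size reuses blocks of the
# already-sharded "multiply_test2" matrix, only the logical shape N changes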
for N in Ns:
    X_sharded = BigMatrix("multiply_test2",
                          shape=(N, N),
                          shard_sizes=shard_size)
    start = time.time()
    gemm(pwex, X_sharded, X_sharded, X_sharded.bucket, 1)
    end = time.time()
    print(end - start)