Code Example #1
# NOTE: standard-library and third-party imports assumed by these handlers;
# the project-specific modules (S3Storage, S3Communicator, libsvm_dataset,
# linear_models, MLModel, Optimization, Synchronization, Prefix) are assumed
# to be provided by the surrounding project.
import math
import time

import numpy as np
import torch

def handler(event, context):
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    assert dataset_type == "sparse_libsvm"
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Sparse_Linear_Models
    assert optim.lower() in Optimization.All
    assert sync_mode.lower() in Synchronization.All

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']

    shuffle_dataset = True
    random_seed = 100

    print('bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

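    # S3 serves as the communication channel between workers: intermediate
    # per-worker updates go to tmp_bucket and merged results to merged_bucket.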
    storage = S3Storage()
    communicator = S3Communicator(storage, tmp_bucket, merged_bucket,
                                  n_workers, worker_index)

    # Read file from s3
    read_start = time.time()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # split train set and test set
    train_set = [dataset[i] for i in train_indices]
    n_train_batch = math.floor(len(train_set) / batch_size)
    val_set = [dataset[i] for i in val_indices]
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = linear_models.get_sparse_model(model_name, train_set, val_set,
                                           n_features, n_epochs, learning_rate,
                                           batch_size)

    train_start = time.time()
    # Training the Model
    for epoch in range(n_epochs):
        epoch_start = time.time()
        epoch_cal_time = 0
        epoch_comm_time = 0
        epoch_loss = 0.

        for batch_idx in range(n_train_batch):
            batch_start = time.time()
            batch_loss, batch_acc = model.one_batch()
            epoch_loss += batch_loss.average

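            # grad_avg: after every mini-batch, flatten the locally updated
            # weights and bias into one vector, merge it across workers via
            # S3 (reduce / reduce_scatter average it, async reads the latest
            # merged model directly), and load the result back into the model.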
            if optim == "grad_avg":
                if sync_mode == "reduce" or sync_mode == "reduce_scatter":
                    w_b = np.concatenate((model.weight.numpy().flatten(),
                                          np.array([model.bias],
                                                   dtype=np.float32)))
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time

                    batch_comm_start = time.time()
                    postfix = "{}_{}".format(epoch, batch_idx)

                    if sync_mode == "reduce":
                        w_b_merge = communicator.reduce_batch(w_b, postfix)
                    elif sync_mode == "reduce_scatter":
                        w_b_merge = communicator.reduce_scatter_batch(
                            w_b, postfix)

                    w_merge = w_b_merge[:n_features] / float(n_workers)
                    b_merge = w_b_merge[-1] / float(n_workers)
                    model.weight = torch.from_numpy(w_merge).reshape(
                        n_features, 1)
                    model.bias = float(b_merge)

                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(
                        sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time
                elif sync_mode == "async":
                    w_b = np.concatenate((model.weight.numpy().flatten(),
                                          np.array([model.bias],
                                                   dtype=np.float32)))
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time

                    batch_comm_start = time.time()
                    # init model
                    if worker_index == 0 and epoch == 0 and batch_idx == 0:
                        storage.save(w_b.tobytes(), Prefix.w_b_prefix,
                                     merged_bucket)

                    w_b_merge = communicator.async_reduce(
                        w_b, Prefix.w_b_prefix)
                    # async mode does not average the merged result
                    w_merge = w_b_merge[:n_features]
                    b_merge = w_b_merge[-1]
                    model.weight = torch.from_numpy(w_merge).reshape(
                        n_features, 1)
                    model.bias = float(b_merge)

                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(
                        sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time

            if batch_idx % 10 == 0:
                print(
                    'Epoch: [%d/%d], Batch: [%d/%d], Time: %.4f s, Loss: %.4f, Accuracy: %.4f, batch cost %.4f s'
                    % (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                       time.time() - train_start, batch_loss.average,
                       batch_acc.accuracy, time.time() - batch_start))

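        # model_avg: train locally for the whole epoch and exchange the model
        # only once per epoch, averaging across workers in the reduce modes.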
        if optim == "model_avg":
            w_b = np.concatenate((model.weight.numpy().flatten(),
                                  np.array([model.bias], dtype=np.float32)))
            epoch_cal_time += time.time() - epoch_start

            epoch_sync_start = time.time()
            postfix = str(epoch)

            if sync_mode == "reduce":
                w_b_merge = communicator.reduce_epoch(w_b, postfix)
            elif sync_mode == "reduce_scatter":
                w_b_merge = communicator.reduce_scatter_epoch(w_b, postfix)
            elif sync_mode == "async":
                if worker_index == 0 and epoch == 0:
                    storage.save(w_b.tobytes(), Prefix.w_b_prefix,
                                 merged_bucket)
                w_b_merge = communicator.async_reduce(w_b, Prefix.w_b_prefix)

            w_merge = w_b_merge[:n_features]
            b_merge = w_b_merge[-1]
            # async mode does not average the merged result
            if sync_mode == "reduce" or sync_mode == "reduce_scatter":
                w_merge = w_merge / float(n_workers)
                b_merge = b_merge / float(n_workers)
            model.weight = torch.from_numpy(w_merge).reshape(n_features, 1)
            model.bias = float(b_merge)
            print("one {} round cost {} s".format(
                sync_mode,
                time.time() - epoch_sync_start))
            epoch_comm_time += time.time() - epoch_sync_start

        if worker_index == 0:
            delete_start = time.time()
            # model_avg communicates once per epoch and grad_avg once per
            # batch, so worker 0 deletes the matching expired S3 objects
            if optim == "model_avg" and sync_mode != "async":
                communicator.delete_expired_epoch(epoch)
            elif optim == "grad_avg" and sync_mode != "async":
                communicator.delete_expired_batch(epoch, batch_idx)
            epoch_comm_time += time.time() - delete_start

        # Test the Model
        test_start = time.time()
        test_loss, test_acc = model.evaluate()
        test_time = time.time() - test_start

        print(
            "Epoch: [{}/{}] finishes, Batch: [{}/{}], Time: {:.4f}, Loss: {:.4f}, epoch cost {:.4f} s, "
            "calculation cost = {:.4f} s, synchronization cost {:.4f} s, test cost {:.4f} s, "
            "accuracy of the model on the {} test samples: {}, loss = {}".
            format(epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                   time.time() - train_start, epoch_loss,
                   time.time() - epoch_start, epoch_cal_time, epoch_comm_time,
                   test_time, len(val_set), test_acc.accuracy,
                   test_loss.average))

    if worker_index == 0:
        storage.clear(tmp_bucket)
        storage.clear(merged_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
Code Example #2
# NOTE: same imports and project modules as Code Example #1, plus the ADMM
# helpers initialize_z_and_u and update_z from the surrounding project.

def handler(event, context):
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Sparse_Linear_Models
    assert optim.lower() == Optimization.ADMM
    assert sync_mode.lower() in [Synchronization.Reduce, Synchronization.Reduce_Scatter]

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']
    n_admm_epochs = event['n_admm_epochs']
    lam = event['lambda']
    rho = event['rho']

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

    storage = S3Storage()
    communicator = S3Communicator(storage, tmp_bucket, merged_bucket, n_workers, worker_index)

    # Read file from s3
    read_start = time.time()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    shuffle_dataset = True
    random_seed = 100
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # split train set and test set
    train_set = [dataset[i] for i in train_indices]
    n_train_batch = math.floor(len(train_set) / batch_size)
    val_set = [dataset[i] for i in val_indices]
    print("preprocess data cost {} s, dataset size = {}"
          .format(time.time() - preprocess_start, dataset_size))

    model = linear_models.get_sparse_model(model_name, train_set, val_set, n_features,
                                           n_epochs, learning_rate, batch_size)

    z, u = initialize_z_and_u(model.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

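    # ADMM training: each outer ADMM round runs n_epochs of local SGD with an
    # extra proximal penalty pulling w toward (z - u), then the workers
    # average (u, w, b) through S3 and update the consensus variable z and
    # the dual variable u.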
    # Training the Model
    train_start = time.time()
    for admm_epoch in range(n_admm_epochs):
        print(">>> ADMM Epoch[{}]".format(admm_epoch + 1))
        admm_epoch_start = time.time()
        admm_epoch_cal_time = 0
        admm_epoch_comm_time = 0
        admm_epoch_test_time = 0
        for epoch in range(n_epochs):
            epoch_start = time.time()
            epoch_loss = 0.

            for batch_idx in range(n_train_batch):
                batch_start = time.time()
                batch_loss, batch_acc = model.one_batch()

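                # the gradient of the augmented-Lagrangian penalty
                # (rho / 2) * ||w - z + u||^2 with respect to w is
                # rho * (w - z + u); apply it as an extra SGD step on w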
                u_z = torch.from_numpy(u) - torch.from_numpy(z)
                new_grad = torch.add(model.weight, u_z).mul(rho)
                new_grad.mul_(-1.0 * learning_rate)

                model.weight.add_(new_grad)
                batch_loss = (batch_loss.average
                              + rho / 2.0 * torch.norm(model.weight + u_z, p=2).item())
                epoch_loss += batch_loss

                if batch_idx % 10 == 0:
                    print("ADMM Epoch: [{}/{}], Epoch: [{}/{}], Batch: [{}/{}], "
                          "time: {:.4f} s, batch cost {:.4f} s, loss: {}, accuracy: {}"
                          .format(admm_epoch + 1, n_admm_epochs, epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                                  time.time() - train_start, time.time() - batch_start,
                                  batch_loss, batch_acc.accuracy))

            epoch_cal_time = time.time() - epoch_start
            admm_epoch_cal_time += epoch_cal_time

            # Test the Model
            test_start = time.time()
            test_loss, test_acc = model.evaluate()
            epoch_test_time = time.time() - test_start
            admm_epoch_test_time += epoch_test_time

            print("ADMM Epoch: [{}/{}] Epoch: [{}/{}] finishes, Batch: [{}/{}], "
                  "Time: {:.4f}, Loss: {:.4f}, epoch cost {:.4f} s, "
                  "calculation cost = {:.4f} s, test cost {:.4f} s, "
                  "accuracy of the model on the {} test samples: {}, loss = {}"
                  .format(admm_epoch + 1, n_admm_epochs, epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                          time.time() - train_start, epoch_loss, time.time() - epoch_start,
                          epoch_cal_time, epoch_test_time,
                          len(val_set), test_acc.accuracy, test_loss.average))

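        # per-ADMM-round synchronization: concatenate u, w and b into one flat
        # vector, reduce it across workers via S3, then unpack the per-worker
        # means using the recorded shapes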
        sync_start = time.time()
        w = model.weight.numpy()
        w_shape = w.shape
        b = np.array([model.bias], dtype=np.float32)
        b_shape = b.shape
        u_shape = u.shape

        w_b = np.concatenate((w.flatten(), b.flatten()))
        u_w_b = np.concatenate((u.flatten(), w_b.flatten()))

        postfix = "{}".format(admm_epoch)

        # admm does not support async
        if sync_mode == "reduce":
            u_w_b_merge = communicator.reduce_epoch(u_w_b, postfix)
        elif sync_mode == "reduce_scatter":
            u_w_b_merge = communicator.reduce_scatter_epoch(u_w_b, postfix)

        u_mean = u_w_b_merge[:u_shape[0] * u_shape[1]].reshape(u_shape) / float(n_workers)
        w_mean = u_w_b_merge[u_shape[0] * u_shape[1]: u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]]\
                     .reshape(w_shape) / float(n_workers)
        b_mean = u_w_b_merge[u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]:]\
                     .reshape(b_shape[0]) / float(n_workers)

        model.weight = torch.from_numpy(w_mean)
        model.bias = torch.from_numpy(b_mean)
        admm_epoch_comm_time += time.time() - sync_start
        print("one {} round cost {} s".format(sync_mode, admm_epoch_comm_time))

        if worker_index == 0:
            delete_start = time.time()
            communicator.delete_expired_epoch(admm_epoch)
            admm_epoch_comm_time += time.time() - delete_start

        # z, u, r, s = update_z_u(w, z, u, rho, num_workers, lam)
        # stop = check_stop(ep_abs, ep_rel, r, s, dataset_size, num_features, w, z, u, rho)
        # print("stop = {}".format(stop))

        # z = num_workers * rho / (2 * lam + num_workers * rho) * (w + u_mean)
        z = update_z(w_mean, u_mean, rho, n_workers, lam)
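        # dual update: u <- u + w - z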
        u = u + model.weight.data.numpy() - z

        print("ADMM Epoch[{}] finishes, cost {} s, cal cost {} s, comm cost {} s, test cost {} s"
              .format(admm_epoch, time.time() - admm_epoch_start,
                      admm_epoch_cal_time, admm_epoch_comm_time, admm_epoch_test_time))

    # Test the Model
    test_loss, test_acc = model.evaluate()

    print("Train finish, cost {} s, accuracy of the model on the {} test samples = {}, loss = {}"
          .format(time.time() - train_start, len(val_set),
                  test_acc.accuracy, test_loss.average))

    if worker_index == 0:
        storage.clear(tmp_bucket)
        storage.clear(merged_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
Code Example #3
# NOTE: same imports and project modules as Code Example #1, plus the Thrift
# runtime (thrift.transport.TSocket, thrift.transport.TTransport,
# thrift.protocol.TBinaryProtocol) and the project's parameter-server client
# stubs (ParameterServer, ps_client).

def handler(event, context):
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    assert dataset_type == "sparse_libsvm"
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']

    # ps setting
    host = event['host']
    port = event['port']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Sparse_Linear_Models
    assert optim.lower() == Optimization.Grad_Avg
    assert sync_mode.lower() == Synchronization.Reduce

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']

    print('bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('host = {}'.format(host))
    print('port = {}'.format(port))

    # Set thrift connection
    # Make socket
    transport = TSocket.TSocket(host, port)
    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)
    # Wrap in a protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    # Create a client to use the protocol encoder
    t_client = ParameterServer.Client(protocol)
    # Connect!
    transport.open()
    # test thrift connection
    ps_client.ping(t_client)
    print("create and ping thrift server >>> HOST = {}, PORT = {}".format(
        host, port))

    # Read file from s3
    read_start = time.time()
    storage = S3Storage()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))

    shuffle_dataset = True
    random_seed = 100
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # split train set and test set
    train_set = [dataset[i] for i in train_indices]
    n_train_batch = math.floor(len(train_set) / batch_size)
    val_set = [dataset[i] for i in val_indices]
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = linear_models.get_sparse_model(model_name, train_set, val_set,
                                           n_features, n_epochs, learning_rate,
                                           batch_size)

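    # the parameter server stores the model as one flat vector of length
    # n_features + 1 (weights followed by the bias), registered under the
    # key "w.b"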
    # register model
    model_name = "w.b"
    weight_length = n_features
    bias_length = 1
    model_length = weight_length + bias_length
    ps_client.register_model(t_client, worker_index, model_name, model_length,
                             n_workers)
    ps_client.exist_model(t_client, model_name)
    print("register and check model >>> name = {}, length = {}".format(
        model_name, model_length))

    # Training the Model
    train_start = time.time()
    iter_counter = 0
    for epoch in range(n_epochs):
        epoch_start = time.time()
        epoch_cal_time = 0
        epoch_comm_time = 0
        epoch_loss = 0.

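        # one synchronous round per mini-batch: wait until the current model
        # can be pulled, pull it, train one batch locally, push the model
        # delta scaled by 1 / n_workers, then wait on the next round's pull
        # so that all workers stay in lockstep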
        for batch_idx in range(n_train_batch):
            batch_start = time.time()
            batch_comm_time = 0

            # pull latest model
            ps_client.can_pull(t_client, model_name, iter_counter,
                               worker_index)
            latest_model = ps_client.pull_model(t_client, model_name,
                                                iter_counter, worker_index)
            model.weight = torch.from_numpy(
                np.asarray(latest_model[:weight_length]).astype(
                    np.float32).reshape(n_features, 1))
            model.bias = float(latest_model[-1])
            batch_comm_time += time.time() - batch_start

            batch_loss, batch_acc = model.one_batch()
            epoch_loss += batch_loss.average

            w_b = np.concatenate((model.weight.double().numpy().flatten(),
                                  np.array([model.bias]).astype(np.double)))
            w_b_update = np.subtract(w_b, latest_model)
            batch_cal_time = time.time() - batch_start

            # push gradient to PS
            batch_comm_start = time.time()
            ps_client.can_push(t_client, model_name, iter_counter,
                               worker_index)
            ps_client.push_grad(t_client, model_name, w_b_update,
                                1.0 / n_workers, iter_counter, worker_index)
            ps_client.can_pull(t_client, model_name, iter_counter + 1,
                               worker_index)  # sync all workers
            batch_comm_time += time.time() - batch_comm_start

            epoch_cal_time += batch_cal_time
            epoch_comm_time += batch_comm_time

            if batch_idx % 10 == 0:
                print(
                    'Epoch: [%d/%d], Batch: [%d/%d], Time: %.4f, Loss: %.4f, Accuracy: %.4f, '
                    'batch cost %.4f s: cal cost %.4f s and communication cost %.4f s'
                    % (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                       time.time() - train_start, batch_loss.average,
                       batch_acc.accuracy, time.time() - batch_start,
                       batch_cal_time, batch_comm_time))

            iter_counter += 1

        # Test the Model
        test_start = time.time()
        test_loss, test_acc = model.evaluate()
        test_time = time.time() - test_start

        print(
            "Epoch: [{}/{}] finishes, Batch: [{}/{}], Time: {:.4f}, Loss: {:.4f}, epoch cost {:.4f} s, "
            "calculation cost = {:.4f} s, synchronization cost {:.4f} s, test cost {:.4f} s, "
            "accuracy of the model on the {} test samples: {}, loss = {}".
            format(epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                   time.time() - train_start, epoch_loss,
                   time.time() - epoch_start, epoch_cal_time, epoch_comm_time,
                   test_time, len(val_set), test_acc.accuracy,
                   test_loss.average))

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))