def run(args):
    """ Distributed Synchronous SGD Example """
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    feature_file_name = "{}/features_{}_{}.npy".format(args.root, args.rank,
                                                       args.world_size)
    label_file_name = "{}/labels_{}_{}.npy".format(args.root, args.rank,
                                                   args.world_size)

    print("read feature file {}".format(feature_file_name))
    print("read label file {}".format(label_file_name))
    train_loader, test_loader = partition_vgg16_cifar100_fc(
        args.batch_size, feature_file_name, label_file_name, validation_ratio,
        args.shuffle)
    num_batches = ceil(len(train_loader.dataset) / float(args.batch_size))

    model = LogisticRegression(args.features, args.classes).float()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
    criterion = torch.nn.CrossEntropyLoss()

    trainer = Trainer(model, optimizer, train_loader, test_loader, device)

    trainer.fit(args.epochs, is_dist=dist_is_initialized())
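
A minimal sketch of how run() might be invoked from the command line; every flag mirrors an attribute the function reads (root, rank, world_size, batch_size, shuffle, features, classes, learning_rate, epochs, no_cuda), and the default values are placeholders rather than the project's settings.

import argparse

def parse_args():
    # Flags correspond one-to-one to the attributes accessed inside run();
    # defaults are illustrative only.
    parser = argparse.ArgumentParser(description="Distributed Synchronous SGD Example")
    parser.add_argument('--root', type=str, default='/tmp/data')
    parser.add_argument('--rank', type=int, default=0)
    parser.add_argument('--world-size', dest='world_size', type=int, default=1)
    parser.add_argument('--batch-size', dest='batch_size', type=int, default=128)
    parser.add_argument('--shuffle', action='store_true')
    parser.add_argument('--features', type=int, default=512)
    parser.add_argument('--classes', type=int, default=100)
    parser.add_argument('--learning-rate', dest='learning_rate', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--no-cuda', dest='no_cuda', action='store_true')
    return parser.parse_args()

if __name__ == '__main__':
    run(parse_args())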
Example #2
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket']
    key = event['name']
    num_features = event['num_features']
    num_classes = event['num_classes']
    elasti_location = event['elasticache']
    endpoint = memcached_init(elasti_location)
    print('bucket = {}'.format(bucket))
    print('key = {}'.format(key))

    key_splits = key.split("_")
    worker_index = int(key_splits[0])
    num_worker = event['num_files']
    batch_size = 100000
    batch_size = int(np.ceil(batch_size / num_worker))

    torch.manual_seed(random_seed)

    # read file(dataset) from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))
    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, num_features)
    preprocess_start = time.time()
    print("parse libsvm data cost {} s".format(preprocess_start - parse_start))

    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    print("dataset size = {}".format(dataset_size))
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_loss = []
    test_loss = []
    test_acc = []
    total_time = 0
    # Training the Model
    epoch_start = time.time()
    for epoch in range(num_epochs):
        tmp_train = 0
        for batch_index, (items, labels) in enumerate(train_loader):
            #batch_start = time.time()
            print("------worker {} epoch {} batch {}------".format(
                worker_index, epoch, batch_index))
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            w = model.linear.weight.data.numpy()
            b = model.linear.bias.data.numpy()
            file_postfix = "{}_{}".format(batch_index, epoch)
            # Asynchronous update: on the first batch of the first epoch each worker
            # seeds the shared store with its own weights; afterwards it reads the
            # currently stored weights first and then overwrites them with its own.
            if batch_index == 0 and epoch == 0:
                hset_object(endpoint, model_bucket, w_prefix, w.tobytes())
                hset_object(endpoint, model_bucket, b_prefix, b.tobytes())
                time.sleep(0.0001)
                # read back whichever worker's weights are currently stored (asynchronous)
                w_bytes = hget_object(endpoint, model_bucket, w_prefix)
                b_bytes = hget_object(endpoint, model_bucket, b_prefix)
                # np.frombuffer replaces the deprecated np.fromstring; copy() makes
                # the array writable before it is handed to torch.from_numpy
                w_new = np.frombuffer(w_bytes, dtype=w.dtype).reshape(w.shape).copy()
                b_new = np.frombuffer(b_bytes, dtype=b.dtype).reshape(b.shape).copy()
            else:
                w_bytes = hget_object(endpoint, model_bucket, w_prefix)
                b_bytes = hget_object(endpoint, model_bucket, b_prefix)
                w_new = np.frombuffer(w_bytes, dtype=w.dtype).reshape(w.shape).copy()
                b_new = np.frombuffer(b_bytes, dtype=b.dtype).reshape(b.shape).copy()
                hset_object(endpoint, model_bucket, w_prefix, w.tobytes())
                hset_object(endpoint, model_bucket, b_prefix, b.tobytes())
            model.linear.weight.data = torch.from_numpy(w_new)
            model.linear.bias.data = torch.from_numpy(b_new)

            # report the training loss for every mini batch
            print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f' %
                  (epoch + 1, num_epochs, batch_index + 1,
                   len(train_indices) / batch_size, loss.data))
            tmp_train += loss.item()
        total_time += time.time() - epoch_start
        train_loss.append(tmp_train)

        tmp_test, tmp_acc = test(model, validation_loader, criterion)
        test_loss.append(tmp_test)
        test_acc.append(tmp_acc)
        epoch_start = time.time()

    print("total time = {}".format(total_time))
    end_time = time.time()
    print("elapsed time = {} s".format(end_time - start_time))
    loss_record = [test_loss, test_acc, train_loss, total_time]
    put_object("async-model-loss", "async-loss{}".format(worker_index),
               pickle.dumps(loss_record))
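
The test() helper called in the loop above is not part of this listing; a minimal sketch that is consistent with how it is used (it returns a validation loss and an accuracy percentage), mirroring the inline evaluation loops in the other handlers. The exact signature and return convention are assumptions.

def test(model, data_loader, criterion):
    # Assumed helper: average loss over validation batches plus accuracy in percent.
    correct, total, loss_sum, batches = 0, 0, 0.0, 0
    with torch.no_grad():
        for items, labels in data_loader:
            outputs = model(items.view(items.size(0), -1))
            loss_sum += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            batches += 1
    return loss_sum / max(batches, 1), 100.0 * correct / max(total, 1)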
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    num_buckets = event['num_buckets']
    key = event['file']
    tmp_bucket_prefix = event['tmp_bucket_prefix']
    merged_bucket_prefix = event['merged_bucket_prefix']

    print('bucket = {}'.format(bucket))
    print('number of workers = {}'.format(num_workers))
    print('number of buckets = {}'.format(num_buckets))
    print('worker index = {}'.format(worker_index))
    print("file = {}".format(key))

    # read file from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, num_features)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    train_start = time.time()
    for epoch in range(num_epochs):
        epoch_start = time.time()
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(
                worker_index, epoch, batch_index))
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()

            print("forward and backward cost {} s".format(time.time() -
                                                          batch_start))

            w_grad = model.linear.weight.grad.data.numpy()
            w_grad_shape = w_grad.shape
            b_grad = model.linear.bias.grad.data.numpy()
            b_grad_shape = b_grad.shape

            w_b_grad = np.concatenate((w_grad.flatten(), b_grad.flatten()))
            cal_time = time.time() - batch_start

            sync_start = time.time()
            postfix = "{}_{}".format(epoch, batch_index)
            w_b_grad_merge = \
                reduce_scatter_batch_multi_bucket(w_b_grad, tmp_bucket_prefix, merged_bucket_prefix,
                                                  num_buckets, num_workers, worker_index, postfix)
            w_grad_merge = \
                w_b_grad_merge[:w_grad_shape[0] * w_grad_shape[1]].reshape(w_grad_shape) / float(num_workers)
            b_grad_merge = \
                w_b_grad_merge[w_grad_shape[0] * w_grad_shape[1]:].reshape(b_grad_shape[0]) / float(num_workers)

            model.linear.weight.grad = Variable(torch.from_numpy(w_grad_merge))
            model.linear.bias.grad = Variable(torch.from_numpy(b_grad_merge))
            sync_time = time.time() - sync_start

            optimizer.step()

            print(
                'Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                'batch cost %.4f s: cal cost %.4f s and communication cost %.4f s'
                % (epoch + 1, num_epochs, batch_index + 1,
                   len(train_indices) / batch_size, time.time() - train_start,
                   loss.data, time.time() - epoch_start,
                   time.time() - batch_start, cal_time, sync_time))

        if worker_index == 0:
            for i in range(num_buckets):
                delete_expired_merged("{}_{}".format(merged_bucket_prefix, i),
                                      epoch)

        # Test the Model
        correct = 0
        total = 0
        test_loss = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)
            outputs = model(items)
            test_loss += criterion(outputs, labels).data
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()

        print(
            'Time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
            % (time.time() - train_start, len(val_indices),
               100 * correct / total, test_loss))

    if worker_index == 0:
        for i in range(num_buckets):
            clear_bucket("{}_{}".format(merged_bucket_prefix, i))
            clear_bucket("{}_{}".format(tmp_bucket_prefix, i))

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    start_time = time.time()

    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_classes = event['num_classes']
    num_features = event['num_features']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('tmp bucket = {}'.format(tmp_bucket))
    print('merged bucket = {}'.format(merged_bucket))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('num classes = {}'.format(num_classes))
    print('num features = {}'.format(num_features))
    print('num epochs = {}'.format(num_epochs))
    print('learning rate = {}'.format(learning_rate))
    print("batch size = {}".format(batch_size))

    s3 = boto3.client('s3')

    feature_file_name = "features_{}_{}.npy".format(worker_index, num_workers)
    label_file_name = "labels_{}_{}.npy".format(worker_index, num_workers)

    # read file from s3
    s3.download_file(bucket, feature_file_name, local_dir + str(feature_file_name))
    features_matrix = np.load(local_dir + str(feature_file_name))
    print("read features matrix cost {} s".format(time.time() - start_time))
    print("feature matrix shape = {}, dtype = {}".format(features_matrix.shape, features_matrix.dtype))
    print("feature matrix sample = {}".format(features_matrix[0]))
    row_features = features_matrix.shape[0]
    col_features = features_matrix.shape[1]

    s3.download_file(bucket, label_file_name, local_dir + str(label_file_name))
    labels_matrix = np.load(local_dir + str(label_file_name))
    print("read label matrix cost {} s".format(time.time() - start_time))
    print("label matrix shape = {}, dtype = {}".format(labels_matrix.shape, labels_matrix.dtype))
    print("label matrix sample = {}".format(labels_matrix[0:10]))
    row_labels = labels_matrix.shape[0]

    if row_features != row_labels:
        raise AssertionError("row of feature matrix is {}, but row of label matrix is {}."
                             .format(row_features, row_labels))

    parse_start = time.time()
    dataset = DenseDatasetWithNP(col_features, features_matrix, labels_matrix)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)

    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s, dataset size = {}"
          .format(time.time() - preprocess_start, dataset_size))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    train_start = time.time()
    for epoch in range(num_epochs):
        epoch_start = time.time()
        epoch_loss = 0
        for batch_index, (items, labels) in enumerate(train_loader):
            #   print("------worker {} epoch {} batch {}------".format(worker_index, epoch, batch_index))
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()

            # print("forward and backward cost {} s".format(time.time() - batch_start))
            w_grad = model.linear.weight.grad.data.numpy()
            w_grad_shape = w_grad.shape
            b_grad = model.linear.bias.grad.data.numpy()
            b_grad_shape = b_grad.shape

            w_b_grad = np.concatenate((w_grad.flatten(), b_grad.flatten()))
            cal_time = time.time() - batch_start

            sync_start = time.time()
            postfix = "{}_{}".format(epoch, batch_index)
            w_b_grad_merge = reduce_batch(w_b_grad, tmp_bucket, merged_bucket,
                                          num_workers, worker_index, postfix)
            w_grad_merge = \
                w_b_grad_merge[:w_grad_shape[0] * w_grad_shape[1]].reshape(w_grad_shape) / float(num_workers)
            b_grad_merge = \
                w_b_grad_merge[w_grad_shape[0] * w_grad_shape[1]:].reshape(b_grad_shape[0]) / float(num_workers)

            model.linear.weight.grad = Variable(torch.from_numpy(w_grad_merge))
            model.linear.bias.grad = Variable(torch.from_numpy(b_grad_merge))
            sync_time = time.time() - sync_start

            optimizer.step()

            # Test the Model
            test_start = time.time()
            correct = 0
            total = 0
            test_loss = 0
            for items, labels in validation_loader:
                items = Variable(items.view(-1, num_features))
                labels = Variable(labels)
                outputs = model(items)
                test_loss += criterion(outputs, labels).data
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()
            test_time = time.time() - test_start

            print('Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                  'batch cost %.4f s: cal cost %.4f s communication cost %.4f s test cost %.4f s, '
                  'accuracy of the model on the %d test samples: %d %%, loss = %f'
                  % (epoch + 1, num_epochs, batch_index + 1, len(train_indices) / batch_size,
                     time.time() - train_start, loss.data, time.time() - epoch_start,
                     time.time() - batch_start, cal_time, sync_time, test_time,
                     len(val_indices), 100 * correct / total, test_loss / total))

        if worker_index == 0:
            delete_expired_merged_batch(merged_bucket, epoch, batch_index)

        # Test the Model
        test_start = time.time()
        correct = 0
        total = 0
        test_loss = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)
            outputs = model(items)
            test_loss += criterion(outputs, labels).data
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()

        print('Epoch: %d, time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
              % (epoch, time.time() - train_start, len(val_indices), 100 * correct / total, test_loss / total))

    if worker_index == 0:
        clear_bucket(merged_bucket)
        clear_bucket(tmp_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
Example #5
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket']
    key = event['name']
    num_features = event['num_features']
    num_classes = event['num_classes']
    redis_location = event['elasticache']
    endpoint = redis_init(redis_location)
    print('bucket = {}'.format(bucket))
    print('key = {}'.format(key))

    key_splits = key.split("_")
    num_worker = event['num_files']
    worker_index = event['worker_index']

    batch_size = 100000
    batch_size = int(np.ceil(batch_size / num_worker))

    torch.manual_seed(random_seed)

    # read file(dataset) from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))
    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, num_features)
    preprocess_start = time.time()
    print("parse libsvm data cost {} s".format(preprocess_start - parse_start))

    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    print("dataset size = {}".format(dataset_size))
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_loss = []
    test_loss = []
    test_acc = []
    epoch_time = 0
    epoch_start = time.time()
    # Training the Model
    for epoch in range(num_epochs):
        tmp_train = 0
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(
                worker_index, epoch, batch_index))
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)
            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()

            w_grad = model.linear.weight.grad.data.numpy()
            b_grad = model.linear.bias.grad.data.numpy()

            # synchronous update: every worker first writes its own gradients for this batch
            sync_start = time.time()
            hset_object(endpoint, grad_bucket,
                        w_grad_prefix + str(worker_index), w_grad.tobytes())
            hset_object(endpoint, grad_bucket,
                        b_grad_prefix + str(worker_index), b_grad.tobytes())
            tmp_write_local_epoch_time = time.time() - sync_start
            print("write local gradient cost = {}".format(
                tmp_write_local_epoch_time))

            # merge gradients across workers
            file_postfix = "{}_{}".format(epoch, batch_index)
            if worker_index == 0:

                w_grad_merge, b_grad_merge = \
                    merge_w_b_grads(endpoint,
                                    grad_bucket, num_worker, w_grad.dtype,
                                    w_grad.shape, b_grad.shape,
                                    w_grad_prefix, b_grad_prefix)
                put_merged_w_b_grads(endpoint, model_bucket, w_grad_merge,
                                     b_grad_merge, file_postfix, w_grad_prefix,
                                     b_grad_prefix)
                hset_object(endpoint, model_bucket, "epoch", epoch)
                hset_object(endpoint, model_bucket, "index", batch_index)
            else:
                w_grad_merge, b_grad_merge = get_merged_w_b_grads(
                    endpoint, model_bucket, file_postfix, w_grad.dtype,
                    w_grad.shape, b_grad.shape, w_grad_prefix, b_grad_prefix)

            model.linear.weight.grad = Variable(torch.from_numpy(w_grad_merge))
            model.linear.bias.grad = Variable(torch.from_numpy(b_grad_merge))
            tmp_sync_time = time.time() - sync_start
            print("synchronization cost {} s".format(tmp_sync_time))

            optimizer.step()

            tmp_train = tmp_train + loss.item()
            train_loss.append(tmp_train / (batch_index + 1))

        epoch_time += time.time() - epoch_start
        # Test the Model
        correct = 0
        total = 0
        tmp_test = 0
        count = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, num_features))
            outputs = model(items)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
            loss = criterion(outputs, labels)
            tmp_test = tmp_test + loss.item()
            count += 1
        print('Accuracy of the model on the %d test samples: %d %%' %
              (len(val_indices), 100 * correct / total))
        test_loss.append(tmp_test / count)
        test_acc.append(100 * correct / total)
        epoch_start = time.time()

    loss_record = [test_loss, test_acc, train_loss, epoch_time]
    put_object("grad-average-loss", "grad-loss{}".format(worker_index),
               bytes(loss_record))
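
A sketch of reading the persisted loss record back for later analysis, assuming the same get_object helper used above for the training data; the bucket and key mirror the put_object call, and worker index 0 is just an example.

import pickle

record = get_object("grad-average-loss", "grad-loss0").read()
test_loss, test_acc, train_loss, epoch_time = pickle.loads(record)
print("final test accuracy = {} %".format(test_acc[-1]))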
Example #6
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    lam = event['lambda']
    rho = event['rho']

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('tmp bucket = {}'.format(tmp_bucket))
    print('merge bucket = {}'.format(merged_bucket))
    print("lambda = {}".format(lam))
    print("rho = {}".format(rho))

    # read file from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))
    # file_path = "../../dataset/agaricus_127d_train.libsvm"
    # file = open(file_path).readlines()

    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, num_features)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)

    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = LogisticRegression(num_features, num_classes).double()
    print("size of w = {}".format(model.linear.weight.data.size()))

    z, u = initialize_z_and_u(model.linear.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    train_start = time.time()
    stop = False
    for admm_epoch in range(num_admm_epochs):
        print("ADMM Epoch >>> {}".format(admm_epoch))
        for epoch in range(num_epochs):
            epoch_start = time.time()
            epoch_loss = 0
            for batch_index, (items, labels) in enumerate(train_loader):
                #   print("------worker {} epoch {} batch {}------".format(worker_index, epoch, batch_index))
                batch_start = time.time()
                items = Variable(items.view(-1, num_features))
                labels = Variable(labels)

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                outputs = model(items.double())
                classify_loss = criterion(outputs, labels)
                epoch_loss += classify_loss.data
                u_z = torch.from_numpy(u).double() - torch.from_numpy(
                    z).double()
                loss = classify_loss
                for name, param in model.named_parameters():
                    if name.split('.')[-1] == "weight":
                        loss += rho / 2.0 * torch.norm(param + u_z, p=2)
                #loss = classify_loss + rho / 2.0 * torch.norm(torch.sum(model.linear.weight, u_z))
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                optimizer.step()

            # Test the Model
            test_start = time.time()
            correct = 0
            total = 0
            test_loss = 0
            for items, labels in validation_loader:
                items = Variable(items.view(-1, num_features))
                labels = Variable(labels)
                outputs = model(items.double())
                test_loss += criterion(outputs, labels).data
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()
            test_time = time.time() - test_start

            print(
                'Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                'batch cost %.4f s: test cost %.4f s, '
                'accuracy of the model on the %d test samples: %d %%, loss = %f'
                % (epoch + 1, num_epochs, batch_index + 1,
                   len(train_indices) / batch_size, time.time() - train_start,
                   epoch_loss.data, time.time() - epoch_start,
                   time.time() - batch_start, test_time, len(val_indices),
                   100 * correct / total, test_loss / total))

        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        u_shape = u.shape

        w_and_b = np.concatenate((w.flatten(), b.flatten()))
        u_w_b = np.concatenate((u.flatten(), w_and_b.flatten()))
        cal_time = time.time() - epoch_start
        print("Epoch {} calculation cost = {} s".format(epoch, cal_time))

        sync_start = time.time()
        postfix = "{}_{}".format(admm_epoch, epoch)
        u_w_b_merge = reduce_scatter_batch(u_w_b, tmp_bucket, merged_bucket,
                                           num_workers, worker_index, postfix)
        u_mean = u_w_b_merge[:u_shape[0] *
                             u_shape[1]].reshape(u_shape) / float(num_workers)
        w_mean = u_w_b_merge[u_shape[0] * u_shape[1]:u_shape[0] * u_shape[1] +
                             w_shape[0] *
                             w_shape[1]].reshape(w_shape) / float(num_workers)
        b_mean = u_w_b_merge[u_shape[0] * u_shape[1] +
                             w_shape[0] * w_shape[1]:].reshape(
                                 b_shape[0]) / float(num_workers)
        #model.linear.weight.data = torch.from_numpy(w)
        model.linear.bias.data = torch.from_numpy(b_mean)
        sync_time = time.time() - sync_start
        print("Epoch {} synchronization cost {} s".format(epoch, sync_time))

        if worker_index == 0:
            delete_expired_merged(merged_bucket, epoch)

        #z, u, r, s = update_z_u(w, z, u, rho, num_workers, lam)
        #stop = check_stop(ep_abs, ep_rel, r, s, dataset_size, num_features, w, z, u, rho)
        #print("stop = {}".format(stop))

        #z = num_workers * rho / (2 * lam + num_workers * rho) * (w + u_mean)
        z = update_z(w_mean, u_mean, rho, num_workers, lam)
        #print(z)
        u = u + model.linear.weight.data.numpy() - z
        #print(u)

    # Test the Model
    correct = 0
    total = 0
    test_loss = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, num_features))
        labels = Variable(labels)
        outputs = model(items.double())
        test_loss += criterion(outputs, labels).data
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print(
        'Epoch: %d, time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
        % (epoch, time.time() - train_start, len(val_indices),
           100 * correct / total, test_loss / total))

    if worker_index == 0:
        clear_bucket(merged_bucket)
        clear_bucket(tmp_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
Example #7
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    merged_bucket = event['merged_bucket']
    num_classes = event['num_classes']
    num_features = event['num_features']
    pos_tag = event['pos_tag']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']
    elasti_location = event['elasticache']
    endpoint = memcached_init(elasti_location)

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('merged bucket = {}'.format(merged_bucket))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('num epochs = {}'.format(num_epochs))
    print('learning rate = {}'.format(learning_rate))
    print("batch size = {}".format(batch_size))

    # read file from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    dataset = DenseLibsvmDataset(file, num_features, pos_tag)

    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))

    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    train_start = time.time()
    for epoch in range(num_epochs):
        epoch_start = time.time()
        epoch_loss = 0
        cal_time = 0
        sync_time = 0
        for batch_index, (items, labels) in enumerate(train_loader):
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            epoch_loss += loss.data
            loss.backward()

            w_grad = model.linear.weight.grad.data.numpy()
            w_grad_shape = w_grad.shape
            b_grad = model.linear.bias.grad.data.numpy()
            b_grad_shape = b_grad.shape

            w_b_grad = np.concatenate((w_grad.flatten(), b_grad.flatten()))
            cal_time += time.time() - batch_start

            sync_start = time.time()
            postfix = "{}_{}".format(epoch, batch_index)
            w_b_grad_merge = reduce_batch(endpoint, w_b_grad, merged_bucket,
                                          num_workers, worker_index, postfix)
            w_grad_merge = \
                w_b_grad_merge[:w_grad_shape[0] * w_grad_shape[1]].reshape(w_grad_shape) / float(num_workers)
            b_grad_merge = \
                w_b_grad_merge[w_grad_shape[0] * w_grad_shape[1]:].reshape(b_grad_shape[0]) / float(num_workers)

            model.linear.weight.grad = Variable(torch.from_numpy(w_grad_merge))
            model.linear.bias.grad = Variable(torch.from_numpy(b_grad_merge))
            sync_time += time.time() - sync_start

            optimizer.step()

            # print('Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
            #       'batch cost %.4f s: cal cost %.4f s communication cost %.4f s, '
            #       % (epoch + 1, num_epochs, batch_index, len(train_indices) / batch_size,
            #          time.time() - train_start, loss.data, time.time() - epoch_start,
            #          time.time() - batch_start, cal_time, sync_time))

        # Test the Model
        test_start = time.time()
        correct = 0
        total = 0
        test_loss = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)
            outputs = model(items)
            test_loss += criterion(outputs, labels).data
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
        test_time = time.time() - test_start

        print(
            'Epoch %d has %d batches, time = %.4f, epoch cost %.4f s: '
            'computation cost %.4f s communication cost %.4f s, '
            'train loss = %.4f, test cost %.4f s, accuracy of the model on the %d test samples: %d %%, loss = %f'
            % (epoch, batch_index, time.time() - train_start, time.time() -
               epoch_start, cal_time, sync_time, epoch_loss, test_time,
               len(val_indices), 100 * correct / total, test_loss / total))

    if worker_index == 0:
        clear_bucket(endpoint)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
Example #8
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    host = event['host']
    port = event['port']

    print('bucket = {}'.format(bucket))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print("file = {}".format(key))
    print("host = {}".format(host))
    print("port = {}".format(port))

    # Set thrift connection
    # Make socket
    transport = TSocket.TSocket(host, port)
    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)
    # Wrap in a protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    # Create a client to use the protocol encoder
    t_client = ParameterServer.Client(protocol)
    # Connect!
    transport.open()

    # test thrift connection
    ps_client.ping(t_client)
    print("create and ping thrift server >>> HOST = {}, PORT = {}".format(
        host, port))

    # read file from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))

    # parse dataset
    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, NUM_FEATURES)
    print("parse data cost {} s".format(time.time() - parse_start))

    # preprocess dataset
    preprocess_start = time.time()
    dataset_size = len(dataset)
    # Creating data indices for training and validation splits:
    indices = list(range(dataset_size))
    split = int(np.floor(VALIDATION_RATIO * dataset_size))
    if SHUFFLE_DATASET:
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]
    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=BATCH_SIZE,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=BATCH_SIZE,
                                                    sampler=valid_sampler)
    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(NUM_FEATURES, NUM_CLASSES)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

    # register model
    model_name = "w.b"
    weight_shape = model.linear.weight.data.numpy().shape
    weight_length = weight_shape[0] * weight_shape[1]
    bias_shape = model.linear.bias.data.numpy().shape
    bias_length = bias_shape[0]
    model_length = weight_length + bias_length
    ps_client.register_model(t_client, worker_index, model_name, model_length,
                             num_workers)
    ps_client.exist_model(t_client, model_name)
    print("register and check model >>> name = {}, length = {}".format(
        model_name, model_length))

    # Training the Model
    train_start = time.time()
    iter_counter = 0
    for epoch in range(NUM_EPOCHS):
        epoch_start = time.time()
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(
                worker_index, epoch, batch_index))
            batch_start = time.time()

            # pull latest model
            ps_client.can_pull(t_client, model_name, iter_counter,
                               worker_index)
            latest_model = ps_client.pull_model(t_client, model_name,
                                                iter_counter, worker_index)
            model.linear.weight = Parameter(
                torch.from_numpy(
                    np.asarray(latest_model[:weight_length],
                               dtype=np.double).reshape(weight_shape)))
            model.linear.bias = Parameter(
                torch.from_numpy(
                    np.asarray(latest_model[weight_length:],
                               dtype=np.double).reshape(bias_shape[0])))

            items = Variable(items.view(-1, NUM_FEATURES))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items.double())
            loss = criterion(outputs, labels)
            loss.backward()

            # flatten and concat gradients of weight and bias
            w_b_grad = np.concatenate(
                (model.linear.weight.grad.data.numpy().flatten(),
                 model.linear.bias.grad.data.numpy().flatten()))
            cal_time = time.time() - batch_start

            # push gradient to PS
            sync_start = time.time()
            ps_client.can_push(t_client, model_name, iter_counter,
                               worker_index)
            ps_client.push_grad(t_client, model_name, w_b_grad, LEARNING_RATE,
                                iter_counter, worker_index)
            ps_client.can_pull(t_client, model_name, iter_counter + 1,
                               worker_index)  # sync all workers
            sync_time = time.time() - sync_start

            print(
                'Epoch: [%d/%d], Step: [%d/%d] >>> Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                'batch cost %.4f s: cal cost %.4f s and communication cost %.4f s'
                % (epoch + 1, NUM_EPOCHS, batch_index + 1,
                   len(train_indices) / BATCH_SIZE, time.time() - train_start,
                   loss.data, time.time() - epoch_start,
                   time.time() - batch_start, cal_time, sync_time))
            iter_counter += 1

        # Test the Model
        correct = 0
        total = 0
        test_loss = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, NUM_FEATURES))
            labels = Variable(labels)
            outputs = model(items.double())
            test_loss += criterion(outputs, labels).data
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()

        print(
            'Time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
            % (time.time() - train_start, len(val_indices),
               100 * correct / total, test_loss))

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']

    print('bucket = {}'.format(bucket))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print("file = {}".format(key))
    key_splits = key.split("_")
    worker_index = int(key_splits[0])
    num_worker = int(key_splits[1])
    sync_meta = SyncMeta(worker_index, num_worker)
    print("synchronization meta {}".format(sync_meta.__str__()))

    # read file from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, num_features)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s, dataset size = {}"
          .format(time.time() - preprocess_start, dataset_size))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_start = time.time()
    # Training the Model
    for epoch in range(num_epochs):
        epoch_start = time.time()
        epoch_loss = 0
        for batch_index, (items, labels) in enumerate(train_loader):
            # print("------worker {} epoch {} batch {}------".format(worker_index, epoch, batch_index))
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            epoch_loss += loss.data
            loss.backward()
            # print("forward and backward cost {} s".format(time.time()-batch_start))
            optimizer.step()

            # print('Epoch: [%d/%d], Step: [%d/%d], Time: %.4f s, Loss: %.4f, batch cost %.4f s'
            #        % (epoch + 1, num_epochs, batch_index + 1, len(train_indices) / batch_size,
            #           time.time() - train_start, loss.data, time.time() - batch_start))

        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        # print("weight before sync shape = {}, values = {}".format(w.shape, w))
        # print("bias before sync shape = {}, values = {}".format(b.shape, b))
        w_and_b = np.concatenate((w.flatten(), b.flatten()))
        cal_time = time.time() - epoch_start
        # print("Epoch {} calculation cost = {} s".format(epoch, cal_time))

        sync_start = time.time()
        postfix = str(epoch)
        w_and_b_merge = reduce_scatter_epoch(w_and_b, tmp_bucket, merged_bucket, num_worker, worker_index, postfix)
        w_merge = w_and_b_merge[:w_shape[0] * w_shape[1]].reshape(w_shape) / float(num_worker)
        b_merge = w_and_b_merge[w_shape[0] * w_shape[1]:].reshape(b_shape[0]) / float(num_worker)
        model.linear.weight.data = torch.from_numpy(w_merge)
        model.linear.bias.data = torch.from_numpy(b_merge)
        # print("weight after sync = {}".format(model.linear.weight.data.numpy()[0][:5]))
        # print("bias after sync = {}".format(model.linear.bias.data.numpy()))
        sync_time = time.time() - sync_start
        # print("Epoch {} synchronization cost {} s".format(epoch, sync_time))

        if worker_index == 0:
            delete_expired_merged(merged_bucket, epoch)

        # Test the Model
        test_start = time.time()
        correct = 0
        total = 0
        test_loss = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)
            outputs = model(items)
            test_loss += criterion(outputs, labels).data
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
        test_time = time.time() - test_start

        print('Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
              'batch cost %.4f s: calculation cost = %.4f s, synchronization cost %.4f s, test cost %.4f s, '
              'accuracy of the model on the %d test samples: %d %%, loss = %f'
              % (epoch + 1, num_epochs, batch_index + 1, len(train_indices) / batch_size,
                 time.time() - train_start, epoch_loss.data, time.time() - epoch_start,
                 time.time() - batch_start, cal_time, sync_time, test_time,
                 len(val_indices), 100 * correct / total, test_loss / total))

    if worker_index == 0:
        clear_bucket(tmp_bucket)
        clear_bucket(merged_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
Example #10
def handler(event, context):
    startTs = time.time()
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')

    print('bucket = {}'.format(bucket))
    print('key = {}'.format(key))

    key_splits = key.split("_")
    worker_index = int(key_splits[0])
    num_worker = int(key_splits[1])
    sync_meta = SyncMeta(worker_index, num_worker)
    print("synchronization meta {}".format(sync_meta.__str__()))

    # read file from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - startTs))

    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, num_features)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    for epoch in range(num_epochs):
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(worker_index, epoch, batch_index))
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()

            print("forward and backward cost {} s".format(time.time()-batch_start))

            w_grad = model.linear.weight.grad.data.numpy()
            b_grad = model.linear.bias.grad.data.numpy()
            #print("dtype of grad = {}".format(w_grad.dtype))
            print("w_grad before merge = {}".format(w_grad[0][0:5]))
            print("b_grad before merge = {}".format(b_grad))

            sync_start = time.time()
            put_object(grad_bucket, w_grad_prefix + str(worker_index), w_grad.tobytes())
            put_object(grad_bucket, b_grad_prefix + str(worker_index), b_grad.tobytes())

            file_postfix = "{}_{}".format(epoch, batch_index)
            if worker_index == 0:
                w_grad_merge, b_grad_merge = \
                    merge_w_b_grads(grad_bucket, num_worker, w_grad.dtype,
                                    w_grad.shape, b_grad.shape,
                                    w_grad_prefix, b_grad_prefix)
                put_merged_w_b_grad(model_bucket, w_grad_merge, b_grad_merge,
                                    file_postfix, w_grad_prefix, b_grad_prefix)
                delete_expired_w_b(model_bucket, epoch, batch_index, w_grad_prefix, b_grad_prefix)
                model.linear.weight.grad = Variable(torch.from_numpy(w_grad_merge))
                model.linear.bias.grad = Variable(torch.from_numpy(b_grad_merge))
            else:
                w_grad_merge, b_grad_merge = get_merged_w_b_grad(model_bucket, file_postfix,
                                                                 w_grad.dtype, w_grad.shape, b_grad.shape,
                                                                 w_grad_prefix, b_grad_prefix)
                model.linear.weight.grad = Variable(torch.from_numpy(w_grad_merge))
                model.linear.bias.grad = Variable(torch.from_numpy(b_grad_merge))

            print("w_grad after merge = {}".format(model.linear.weight.grad.data.numpy()[0][:5]))
            print("b_grad after merge = {}".format(model.linear.bias.grad.data.numpy()))

            print("synchronization cost {} s".format(time.time() - sync_start))

            optimizer.step()

            print("batch cost {} s".format(time.time() - batch_start))

            if (batch_index + 1) % 10 == 0:
                print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f'
                      % (epoch + 1, num_epochs, batch_index + 1, len(train_indices) / batch_size, loss.data))

    if worker_index == 0:
        clear_bucket(model_bucket)
        clear_bucket(grad_bucket)

    # Test the Model
    correct = 0
    total = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, num_features))
        # items = Variable(items)
        outputs = model(items)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy of the model on the %d test samples: %d %%' % (len(val_indices), 100 * correct / total))

    endTs = time.time()
    print("elapsed time = {} s".format(endTs - startTs))
Example #11
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket']
    key = event['name']
    num_features = event['num_features']
    num_classes = event['num_classes']
    print('bucket = {}'.format(bucket))
    print('key = {}'.format(key))

    key_splits = key.split("_")
    worker_index = int(key_splits[0])
    num_worker = int(key_splits[1])

    # read file(dataset) from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))
    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, num_features)
    preprocess_start = time.time()
    print("libsvm parse cost {} s".format(preprocess_start - parse_start))

    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    print("dataset size = {}".format(dataset_size))
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    for epoch in range(num_epochs):
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(
                worker_index, epoch, batch_index))
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()
            print("forward and backward cost {} s".format(time.time() -
                                                          batch_start))

            w_grad = model.linear.weight.grad.data.numpy()
            b_grad = model.linear.bias.grad.data.numpy()
            print("w_grad before merge = {}".format(w_grad[0][0:5]))
            print("b_grad before merge = {}".format(b_grad))

            # synchronization starts: every worker writes its gradients for this batch and epoch
            sync_start = time.time()
            hset_object(endpoint, grad_bucket,
                        w_grad_prefix + str(worker_index), w_grad.tobytes())
            hset_object(endpoint, grad_bucket,
                        b_grad_prefix + str(worker_index), b_grad.tobytes())

            # merge the gradients uploaded by all workers (done by worker 0)
            file_postfix = "{}_{}".format(epoch, batch_index)
            if worker_index == 0:
                merge_start = time.time()
                w_grad_merge, b_grad_merge = \
                    merge_w_b_grads(endpoint,
                                    grad_bucket, num_worker, w_grad.dtype,
                                    w_grad.shape, b_grad.shape,
                                    w_grad_prefix, b_grad_prefix)
                print("model average time = {}".format(time.time() -
                                                       merge_start))
                # the merged file could be overwritten before every worker has read it;
                # wait until all workers finish accessing it.
                put_merged_w_b_grads(endpoint, model_bucket, w_grad_merge,
                                     b_grad_merge, w_grad_prefix,
                                     b_grad_prefix)
                hset_object(endpoint, model_bucket, "epoch", epoch)
                hset_object(endpoint, model_bucket, "index", batch_index)
                #delete_expired_w_b(endpoint,
                #                   model_bucket, epoch, batch_index, w_grad_prefix, b_grad_prefix)
                model.linear.weight.grad = Variable(
                    torch.from_numpy(w_grad_merge))
                model.linear.bias.grad = Variable(
                    torch.from_numpy(b_grad_merge))
            else:
                # wait until worker 0 has published the merged gradients for this batch
                while True:
                    merged_epoch = hget_object(endpoint, model_bucket, "epoch")
                    merged_index = hget_object(endpoint, model_bucket, "index")
                    if merged_epoch is not None and merged_index is not None \
                            and int(merged_epoch) == epoch \
                            and int(merged_index) == batch_index:
                        break
                    time.sleep(0.01)
                w_grad_merge, b_grad_merge = get_merged_w_b_grads(
                    endpoint, model_bucket, w_grad.dtype, w_grad.shape,
                    b_grad.shape, w_grad_prefix, b_grad_prefix)
                hcounter(endpoint, model_bucket, "counter")
                # increment the shared counter to record that this worker has
                # read the merged gradients
                print("number of access at this time = {}".format(
                    int(hget_object(endpoint, model_bucket, "counter"))))
                model.linear.weight.grad = Variable(
                    torch.from_numpy(w_grad_merge))
                model.linear.bias.grad = Variable(
                    torch.from_numpy(b_grad_merge))

            print("w_grad after merge = {}".format(
                model.linear.weight.grad.data.numpy()[0][:5]))
            print("b_grad after merge = {}".format(
                model.linear.bias.grad.data.numpy()))

            print("synchronization cost {} s".format(time.time() - sync_start))

            optimizer.step()

            print("batch cost {} s".format(time.time() - batch_start))

            if (batch_index + 1) % 10 == 0:
                print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f' %
                      (epoch + 1, num_epochs, batch_index + 1,
                       len(train_indices) / batch_size, loss.data))
    """
    if worker_index == 0:
        while sync_counter(endpoint, bucket, num_workers):
            time.sleep(0.001)
        clear_bucket(endpoint, model_bucket)
        clear_bucket(endpoint, grad_bucket)
    """
    # Test the Model
    correct = 0
    total = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, num_features))
        outputs = model(items)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy of the model on the %d test samples: %d %%' %
          (len(val_indices), 100 * correct / total))

    end_time = time.time()
    print("elapsed time = {} s".format(end_time - start_time))
Example #12
def handler():

    num_features = 30
    num_classes = 2
    file = "/home/ubuntu/code/data/s3.pkl"
    #file = "./ec2/s3.pkl"
    batch_size = 100000
    # read the pickled dataset (downloaded from S3 as s3.pkl) from local disk
    parse_start = time.time()
    with open(file, "rb") as f:
        dataset = pickle.load(f)
    print("processing time = {} s".format(time.time() - parse_start))

    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    print("dataset size = {}".format(dataset_size))
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    #print("preprocess data cost {} s".format(time.time() - preprocess_start))




    # Loss and Optimizer (Softmax is computed internally by CrossEntropyLoss)
    # are created per learning rate inside the loop below.
    # Build the validation batches once so they can be reused for every run.
    testset = [[inputs, targets] for inputs, targets in validation_loader]
    length_test = len(testset)
    test_loss = []
    train_loss = []
    acc = []
    for lr in learning_rate:
        model = LogisticRegression(num_features, num_classes)
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)

        # Training the Model
        total_time = 0
        for epoch in range(num_epochs):
            temp_test = []
            temp_train = []
            temp_acc = []
            for batch_index, (items, labels) in enumerate(train_loader):
                #print("------worker {} epoch {} batch {}------".format(worker_index, epoch, batch_index))
                batch_start = time.time()
                items = Variable(items.view(-1, num_features))
                labels = Variable(labels)

                # Forward + Backward + Optimize
                optimizer.zero_grad()  # clear the accumulated gradients
                outputs = model(items)
                loss = criterion(outputs, labels)
                loss.backward()  # compute the gradients
                optimizer.step()  # the model is only updated after this step

                #print("weights are of the scale = {}".format(model.linear.weight.data.numpy()))
                #print("gradients are of the scale = {}".format(model.linear.weight.grad.numpy()))
                #print("batch cost {} s".format(time.time() - batch_start))
                total_time += time.time() - batch_start
                if (batch_index + 1) % 1 == 0:
                    print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f, learning rate: %.4f'
                          % (epoch + 1, num_epochs, batch_index + 1, len(train_indices) / batch_size, loss.data, lr))
                    # train_loss.append(loss.data)
                    # evaluate on one validation batch and record its loss/accuracy
                    a, b = test(model, testset[batch_index % length_test], criterion)
                    test_loss.append(a)
                    acc.append(b)
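# The test() helper called above is not defined in this snippet. A minimal
# sketch of the interface it appears to have -- evaluating the model on a
# single (inputs, targets) batch and returning (loss, accuracy). This is an
# assumption about its behaviour, not the original implementation.
import torch
from torch.autograd import Variable


def test(model, batch, criterion):
    inputs, targets = batch
    inputs = Variable(inputs.view(inputs.size(0), -1))
    targets = Variable(targets)
    with torch.no_grad():
        outputs = model(inputs)
        loss = criterion(outputs, targets)
    _, predicted = torch.max(outputs.data, 1)
    accuracy = (predicted == targets).sum().item() / targets.size(0)
    return loss.item(), accuracy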
Example #13
def handler(event, context):
    startTs = time.time()
    num_features = event['num_features']
    learning_rate = event["learning_rate"]
    batch_size = event["batch_size"]
    num_epochs = event["num_epochs"]
    validation_ratio = event["validation_ratio"]

    # Reading data from S3
    bucket_name = event['bucket_name']
    key = urllib.parse.unquote_plus(event['key'], encoding='utf-8')
    print(f"Reading training data from bucket = {bucket_name}, key = {key}")
    key_splits = key.split("_")
    worker_index = int(key_splits[0])
    num_worker = int(key_splits[1])

    # read file from s3
    file = get_object(bucket_name, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - startTs))

    parse_start = time.time()
    dataset = SparseDatasetWithLines(file, num_features)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    np.random.seed(42)
    np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    train_set = [dataset[i] for i in train_indices]
    val_set = [dataset[i] for i in val_indices]

    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    # Set thrift connection
    # Make socket
    transport = TSocket.TSocket(constants.HOST, constants.PORT)
    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)
    # Wrap in a protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    # Create a client to use the protocol encoder
    t_client = ParameterServer.Client(protocol)
    # Connect!
    transport.open()

    # test thrift connection
    ps_client.ping(t_client)
    print("create and ping thrift server >>> HOST = {}, PORT = {}".format(
        constants.HOST, constants.PORT))

    # register model
    model_name = "w.b"
    model_length = num_features + 1
    ps_client.register_model(t_client, worker_index, model_name, model_length,
                             num_worker)
    ps_client.exist_model(t_client, model_name)
    print("register and check model >>> name = {}, length = {}".format(
        model_name, model_length))

    # Training the Model
    train_start = time.time()
    iter_counter = 0
    lr = LogisticRegression(train_set, val_set, num_features, num_epochs,
                            learning_rate, batch_size)
    # Training the Model
    for epoch in range(num_epochs):
        epoch_start = time.time()
        num_batches = math.floor(len(train_set) / batch_size)
        print(f"worker {worker_index} epoch {epoch}")

        for batch_idx in range(num_batches):
            batch_start = time.time()
            # pull latest model
            ps_client.can_pull(t_client, model_name, iter_counter,
                               worker_index)
            latest_model = ps_client.pull_model(t_client, model_name,
                                                iter_counter, worker_index)
            lr.grad = torch.from_numpy(np.asarray(latest_model[:-1])).reshape(
                num_features, 1)
            lr.bias = float(latest_model[-1])

            compute_start = time.time()
            batch_ins, batch_label = lr.next_batch(batch_idx)
            batch_grad = torch.zeros(lr.n_input, 1, requires_grad=False)
            batch_bias = 0.0  # plain float; np.float is deprecated/removed in recent NumPy
            train_loss = Loss()
            train_acc = Accuracy()

            for i in range(len(batch_ins)):
                z = lr.forward(batch_ins[i])
                h = lr.sigmoid(z)
                loss = lr.loss(h, batch_label[i])
                # print("z= {}, h= {}, loss = {}".format(z, h, loss))
                train_loss.update(loss, 1)
                train_acc.update(h, batch_label[i])
                g = lr.backward(batch_ins[i], h.item(), batch_label[i])
                batch_grad.add_(g)
                batch_bias += np.sum(h.item() - batch_label[i])
            batch_grad = batch_grad.div(len(batch_ins))
            batch_bias = batch_bias / len(batch_ins)
            batch_grad.mul_(-1.0 * learning_rate)
            lr.grad.add_(batch_grad)
            lr.bias = lr.bias - batch_bias * learning_rate

            np_grad = lr.grad.numpy().flatten()
            w_b_grad = np.append(np_grad, lr.bias)
            compute_end = time.time()

            sync_start = time.time()
            ps_client.can_push(t_client, model_name, iter_counter,
                               worker_index)
            ps_client.push_grad(t_client, model_name, w_b_grad, learning_rate,
                                iter_counter, worker_index)
            ps_client.can_pull(t_client, model_name, iter_counter + 1,
                               worker_index)  # sync all workers
            sync_time = time.time() - sync_start

            print(
                'Epoch: [%d/%d], Step: [%d/%d] >>> Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                'batch cost %.4f s: cal cost %.4f s and communication cost %.4f s'
                % (epoch + 1, num_epochs, batch_idx + 1, len(train_indices) /
                   batch_size, time.time() - train_start, train_loss,
                   time.time() - epoch_start, time.time() - batch_start,
                   compute_end - batch_start, sync_time))
            iter_counter += 1

        val_loss, val_acc = lr.evaluate()
        print(f"Validation loss: {val_loss}, validation accuracy: {val_acc}")
        print(f"Epoch takes {time.time() - epoch_start}s")
Example #14
def handler(event, context):

    startTs = time.time()
    bucket = event['data_bucket']
    worker_index = event['rank']
    num_worker = event['num_workers']
    key = 'training_{}.pt'.format(worker_index)
    print('data_bucket = {}\n worker_index:{}\n num_worker:{}\n key:{}'.format(
        bucket, worker_index, num_worker, key))
    sync_meta = SyncMeta(worker_index, num_worker)
    print("synchronization meta {}".format(sync_meta.__str__()))

    # read file from s3
    readS3_start = time.time()
    if not os.path.exists(os.path.join(local_dir, processed_folder)):
        os.makedirs(os.path.join(local_dir, processed_folder))

    s3.Bucket(bucket).download_file(
        key, os.path.join(local_dir, processed_folder, training_file))
    s3.Bucket(bucket).download_file(
        test_file, os.path.join(local_dir, processed_folder, test_file))
    print("read data cost {} s".format(time.time() - readS3_start))

    # load dataset
    train_dataset = dsets.MNIST(root=local_dir,
                                train=True,
                                transform=transforms.ToTensor(),
                                download=False)
    test_dataset = dsets.MNIST(root=local_dir,
                               train=False,
                               transform=transforms.ToTensor(),
                               download=False)
    # print('[{}]length of training:{}, length of test:{}'.format(worker_index, len(train_dataset), len(test_dataset)))
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size,
                                              shuffle=False)

    model = LogisticRegression(num_features, num_classes)
    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # # Training the Model
    step = 0
    for epoch in range(num_epochs):
        for batch_index, (items, labels) in enumerate(train_loader):

            print("------worker {} epoch {} batch {}------".format(
                worker_index, epoch, batch_index))
            batch_start = time.time()
            images = Variable(items.view(-1, 28 * 28))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            print("forward and backward cost {} s".format(time.time() -
                                                          batch_start))

            if sync_mode == 'model_avg':

                # apply local gradient to local model
                optimizer.step()
                # average model
                if (step + 1) % sync_step == 0:

                    # in model_avg mode these hold the current weights, not
                    # gradients (the names are kept from the gradient_avg path)
                    w_grad = model.linear.weight.data.numpy()
                    b_grad = model.linear.bias.data.numpy()
                    #print("dtype of grad = {}".format(w_grad.dtype))
                    # print("w_grad before merge = {}".format(w_grad[0][0:5]))
                    # print("b_grad before merge = {}".format(b_grad))

                    sync_start = time.time()
                    put_object(grad_bucket, w_grad_prefix + str(worker_index),
                               w_grad.tobytes())
                    put_object(grad_bucket, b_grad_prefix + str(worker_index),
                               b_grad.tobytes())

                    file_postfix = "{}_{}".format(epoch, batch_index)
                    if worker_index == 0:
                        w_grad_merge, b_grad_merge = \
                            merge_w_b_grads(grad_bucket, num_worker, w_grad.dtype,
                                            w_grad.shape, b_grad.shape,
                                            w_grad_prefix, b_grad_prefix)
                        put_merged_w_b_grad(model_bucket, w_grad_merge,
                                            b_grad_merge, file_postfix,
                                            w_grad_prefix, b_grad_prefix)
                        delete_expired_w_b(model_bucket, epoch, batch_index,
                                           w_grad_prefix, b_grad_prefix)

                        # update the model with averaged model
                        model.linear.weight = torch.nn.Parameter(
                            torch.from_numpy(w_grad_merge).mul_(1 /
                                                                num_worker))
                        model.linear.bias = torch.nn.Parameter(
                            torch.from_numpy(b_grad_merge).mul_(1 /
                                                                num_worker))
                    else:
                        w_grad_merge, b_grad_merge = get_merged_w_b_grad(
                            model_bucket, file_postfix, w_grad.dtype,
                            w_grad.shape, b_grad.shape, w_grad_prefix,
                            b_grad_prefix)
                        # update the model with averaged model
                        model.linear.weight = torch.nn.Parameter(
                            torch.from_numpy(w_grad_merge).mul_(1 /
                                                                num_worker))
                        model.linear.bias = torch.nn.Parameter(
                            torch.from_numpy(b_grad_merge).mul_(1 /
                                                                num_worker))

                    print("synchronization cost {} s".format(time.time() -
                                                             sync_start))
                    print("batch cost {} s".format(time.time() - batch_start))

                    # print("w_grad after merge = {}".format(model.linear.weight.data.numpy()[0][:5]))
                    # print("b_grad after merge = {}".format(model.linear.bias.data.numpy()))

            if sync_mode == 'gradient_avg':
                w_grad = model.linear.weight.grad.data.numpy()
                b_grad = model.linear.bias.grad.data.numpy()
                #print("dtype of grad = {}".format(w_grad.dtype))
                # print("w_grad before merge = {}".format(w_grad[0][0:5]))
                # print("b_grad before merge = {}".format(b_grad))

                sync_start = time.time()
                put_object(grad_bucket, w_grad_prefix + str(worker_index),
                           w_grad.tobytes())
                put_object(grad_bucket, b_grad_prefix + str(worker_index),
                           b_grad.tobytes())

                file_postfix = "{}_{}".format(epoch, batch_index)
                if worker_index == 0:
                    w_grad_merge, b_grad_merge = \
                        merge_w_b_grads(grad_bucket, num_worker, w_grad.dtype,
                                        w_grad.shape, b_grad.shape,
                                        w_grad_prefix, b_grad_prefix)
                    put_merged_w_b_grad(model_bucket, w_grad_merge,
                                        b_grad_merge, file_postfix,
                                        w_grad_prefix, b_grad_prefix)
                    delete_expired_w_b(model_bucket, epoch, batch_index,
                                       w_grad_prefix, b_grad_prefix)
                    model.linear.weight.grad = Variable(
                        torch.from_numpy(w_grad_merge))
                    model.linear.bias.grad = Variable(
                        torch.from_numpy(b_grad_merge))
                else:
                    w_grad_merge, b_grad_merge = get_merged_w_b_grad(
                        model_bucket, file_postfix, w_grad.dtype, w_grad.shape,
                        b_grad.shape, w_grad_prefix, b_grad_prefix)
                    model.linear.weight.grad = Variable(
                        torch.from_numpy(w_grad_merge))
                    model.linear.bias.grad = Variable(
                        torch.from_numpy(b_grad_merge))

                print("synchronization cost {} s".format(time.time() -
                                                         sync_start))
                # print("w_grad after merge = {}".format(model.linear.weight.grad.data.numpy()[0][:5]))
                # print("b_grad after merge = {}".format(model.linear.bias.grad.data.numpy()))

                optimizer.step()

                print("batch cost {} s".format(time.time() - batch_start))

            if (batch_index + 1) % 10 == 0:
                print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f' %
                      (epoch + 1, num_epochs, batch_index + 1,
                       len(train_dataset) / batch_size, loss.data))

            step += 1

        # Test the Model
        correct = 0
        total = 0
        for items, labels in test_loader:
            # items = Variable(items.view(-1, num_features))
            # items = Variable(items)

            images = Variable(items.view(-1, 28 * 28))
            labels = Variable(labels)

            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()

        print('Accuracy of the model on the %d test samples: %d %%' %
              (len(test_dataset), 100 * correct / total))

    if worker_index == 0:
        clear_bucket(model_bucket)
        clear_bucket(grad_bucket)

    # Test the Model
    correct = 0
    total = 0
    for items, labels in test_loader:
        # items = Variable(items.view(-1, num_features))
        # items = Variable(items)

        images = Variable(items.view(-1, 28 * 28))
        labels = Variable(labels)

        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy of the model on the %d test samples: %d %%' %
          (len(test_dataset), 100 * correct / total))

    endTs = time.time()
    print("elapsed time = {} s".format(endTs - startTs))