def handler(event, context):
    """Split one feature/label matrix pair into ``n_files`` shards on S3.

    Downloads ``feature_file_name`` and ``label_file_name`` from the input
    bucket into ``local_dir``, loads both with ``np.load``, verifies that they
    have the same number of rows, and uploads ``n_files`` equally sized row
    slices with ``put_object`` as raw ``tobytes()`` buffers named
    ``features_<i>_<n_files>`` and ``labels_<i>_<n_files>``.

    ``feature_file_name``, ``label_file_name``, ``local_dir`` and ``n_files``
    are not assigned in this function.
    NOTE(review): they are presumably module-level names -- confirm.

    Args:
        event: Invocation payload. Keys read: ``bucket_name``, ``rank``,
            ``num_workers``, ``file``, ``tmp_bucket``, ``merged_bucket``,
            ``num_epochs``, ``learning_rate``, ``batch_size``.
        context: Invocation context; not used.

    Raises:
        AssertionError: If the feature and label matrices differ in row count.
    """
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('tmp bucket = {}'.format(tmp_bucket))
    print('merged bucket = {}'.format(merged_bucket))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('num epochs = {}'.format(num_epochs))
    print('learning rate = {}'.format(learning_rate))
    print("batch size = {}".format(batch_size))

    s3 = boto3.client('s3')

    # read file from s3
    s3.download_file(bucket, feature_file_name, local_dir + str(feature_file_name))
    features_matrix = np.load(local_dir + str(feature_file_name))
    print("read features matrix cost {} s".format(time.time() - start_time))
    print("feature matrix shape = {}, dtype = {}".format(
        features_matrix.shape, features_matrix.dtype))
    print("feature matrix sample = {}".format(features_matrix[0]))
    row_features = features_matrix.shape[0]
    col_features = features_matrix.shape[1]

    s3.download_file(bucket, label_file_name, local_dir + str(label_file_name))
    labels_matrix = np.load(local_dir + str(label_file_name))
    print("read label matrix cost {} s".format(time.time() - start_time))
    print("label matrix shape = {}, dtype = {}".format(labels_matrix.shape, labels_matrix.dtype))
    print("label matrix sample = {}".format(labels_matrix[0:10]))
    row_labels = labels_matrix.shape[0]

    if row_features != row_labels:
        raise AssertionError(
            "row of feature matrix is {}, but row of label matrix is {}.".format(
                row_features, row_labels))

    features_matrix = features_matrix.flatten()

    # Integer division: true division yields a float on Python 3, and float
    # slice bounds raise TypeError. Rows past n_files * samples_per_file (the
    # remainder) are not written to any shard.
    samples_per_file = int(row_features) // int(n_files)
    for i in range(n_files):
        start_row = i * samples_per_file
        end_row = (i + 1) * samples_per_file
        features_file_name = "features_{}_{}".format(i, n_files)
        labels_file_name = "labels_{}_{}".format(i, n_files)
        # The feature matrix was flattened above, so row offsets are scaled by
        # the column count to address whole rows.
        put_object(
            bucket, features_file_name,
            features_matrix[start_row * col_features:end_row * col_features].tobytes())
        put_object(bucket, labels_file_name,
                   labels_matrix[start_row:end_row].tobytes())

    # Only rank 0 empties the shared synchronisation buckets.
    if worker_index == 0:
        clear_bucket(merged_bucket)
        clear_bucket(tmp_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    """Train logistic regression with per-batch gradient averaging.

    The worker downloads its own ``features_<rank>_<num_workers>.npy`` and
    ``labels_<rank>_<num_workers>.npy`` files, builds a train/validation
    split, and for every mini-batch exchanges the flattened weight+bias
    gradient through ``reduce_batch`` before applying the averaged gradient.
    The validation set is scored after every batch and again after every epoch.

    Args:
        event: Invocation payload. Keys read: ``bucket_name``, ``rank``,
            ``num_workers``, ``file``, ``tmp_bucket``, ``merged_bucket``,
            ``num_classes``, ``num_features``, ``num_epochs``,
            ``learning_rate``, ``batch_size``.
        context: Invocation context; not used.

    Raises:
        AssertionError: If feature and label matrices differ in row count.
    """
    start_time = time.time()

    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_classes = event['num_classes']
    num_features = event['num_features']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']

    # Echo the run configuration.
    for template, value in (
            ('bucket = {}', bucket),
            ("file = {}", key),
            ('tmp bucket = {}', tmp_bucket),
            ('merged bucket = {}', merged_bucket),
            ('number of workers = {}', num_workers),
            ('worker index = {}', worker_index),
            ('num classes = {}', num_classes),
            ('num features = {}', num_features),
            ('num epochs = {}', num_epochs),
            ('learning rate = {}', learning_rate),
            ("batch size = {}", batch_size),
    ):
        print(template.format(value))

    s3 = boto3.client('s3')
    feature_file_name = "features_{}_{}.npy".format(worker_index, num_workers)
    label_file_name = "labels_{}_{}.npy".format(worker_index, num_workers)

    # Fetch and load this worker's feature shard.
    feature_path = local_dir + str(feature_file_name)
    s3.download_file(bucket, feature_file_name, feature_path)
    features_matrix = np.load(feature_path)
    print("read features matrix cost {} s".format(time.time() - start_time))
    print("feature matrix shape = {}, dtype = {}".format(features_matrix.shape, features_matrix.dtype))
    print("feature matrix sample = {}".format(features_matrix[0]))
    row_features = features_matrix.shape[0]
    col_features = features_matrix.shape[1]

    # Fetch and load the matching label shard.
    label_path = local_dir + str(label_file_name)
    s3.download_file(bucket, label_file_name, label_path)
    labels_matrix = np.load(label_path)
    print("read label matrix cost {} s".format(time.time() - start_time))
    print("label matrix shape = {}, dtype = {}".format(labels_matrix.shape, labels_matrix.dtype))
    print("label matrix sample = {}".format(labels_matrix[0:10]))
    row_labels = labels_matrix.shape[0]

    if row_features != row_labels:
        raise AssertionError("row of feature matrix is {}, but row of label matrix is {}."
                             .format(row_features, row_labels))

    parse_start = time.time()
    dataset = DenseDatasetWithNP(col_features, features_matrix, labels_matrix)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Index-based train/validation split.
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    val_indices = indices[:split]
    train_indices = indices[split:]

    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=SubsetRandomSampler(train_indices))
    validation_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=SubsetRandomSampler(val_indices))

    print("preprocess data cost {} s, dataset size = {}"
          .format(time.time() - preprocess_start, dataset_size))

    model = LogisticRegression(num_features, num_classes)

    # CrossEntropyLoss applies the softmax itself.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    def score_validation_set():
        """Return (correct, total, summed loss) over the validation loader."""
        hits = 0
        seen = 0
        loss_sum = 0
        for val_items, val_labels in validation_loader:
            val_items = Variable(val_items.view(-1, num_features))
            val_labels = Variable(val_labels)
            scores = model(val_items)
            loss_sum += criterion(scores, val_labels).data
            _, guess = torch.max(scores.data, 1)
            seen += val_labels.size(0)
            hits += (guess == val_labels).sum()
        return hits, seen, loss_sum

    train_start = time.time()
    for epoch in range(num_epochs):
        epoch_start = time.time()
        for batch_index, (items, labels) in enumerate(train_loader):
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Local forward/backward pass.
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()

            # Pack weight and bias gradients into a single flat vector.
            w_grad = model.linear.weight.grad.data.numpy()
            b_grad = model.linear.bias.grad.data.numpy()
            w_grad_shape = w_grad.shape
            b_grad_shape = b_grad.shape
            n_weights = w_grad_shape[0] * w_grad_shape[1]
            w_b_grad = np.concatenate((w_grad.flatten(), b_grad.flatten()))
            cal_time = time.time() - batch_start

            # Exchange gradients, then install the per-worker mean.
            sync_start = time.time()
            postfix = "{}_{}".format(epoch, batch_index)
            w_b_grad_merge = reduce_batch(w_b_grad, tmp_bucket, merged_bucket,
                                          num_workers, worker_index, postfix)
            w_grad_merge = \
                w_b_grad_merge[:n_weights].reshape(w_grad_shape) / float(num_workers)
            b_grad_merge = \
                w_b_grad_merge[n_weights:].reshape(b_grad_shape[0]) / float(num_workers)
            model.linear.weight.grad = Variable(torch.from_numpy(w_grad_merge))
            model.linear.bias.grad = Variable(torch.from_numpy(b_grad_merge))
            sync_time = time.time() - sync_start

            optimizer.step()

            # Per-batch validation pass.
            test_start = time.time()
            correct, total, test_loss = score_validation_set()
            test_time = time.time() - test_start

            print('Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                  'batch cost %.4f s: cal cost %.4f s communication cost %.4f s test cost %.4f s, '
                  'accuracy of the model on the %d test samples: %d %%, loss = %f'
                  % (epoch + 1, num_epochs, batch_index + 1, len(train_indices) / batch_size,
                     time.time() - train_start, loss.data, time.time() - epoch_start,
                     time.time() - batch_start, cal_time, sync_time, test_time,
                     len(val_indices), 100 * correct / total, test_loss / total))

        # Rank 0 prunes merged objects that are older than the current step.
        if worker_index == 0:
            delete_expired_merged_batch(merged_bucket, epoch, batch_index)

        # End-of-epoch validation pass.
        test_start = time.time()
        correct, total, test_loss = score_validation_set()

        print('Epoch: %d, time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
              % (epoch, time.time() - train_start, len(val_indices), 100 * correct / total, test_loss / total))

    if worker_index == 0:
        clear_bucket(merged_bucket)
        clear_bucket(tmp_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    """Train a linear SVM with per-epoch model averaging.

    Reads one or more libsvm-formatted objects (comma-separated keys in
    ``event['file']``), trains locally with hinge loss plus an L2 penalty, and
    after every epoch exchanges the flattened weights+bias through
    ``reduce_epoch`` and replaces the local parameters with the per-worker
    mean. The validation set is scored after every epoch and once more at the
    end.

    Args:
        event: Invocation payload. Keys read: ``bucket_name``, ``rank``,
            ``num_workers``, ``file``, ``tmp_bucket``, ``merged_bucket``,
            ``num_classes``, ``num_features``, ``pos_tag``, ``num_epochs``,
            ``learning_rate``, ``batch_size``.
        context: Invocation context; not used.
    """
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file'].split(",")
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_classes = event['num_classes']
    num_features = event['num_features']
    pos_tag = event['pos_tag']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('tmp bucket = {}'.format(tmp_bucket))
    print('merge bucket = {}'.format(merged_bucket))
    print('num epochs = {}'.format(num_epochs))
    print('num classes = {}'.format(num_classes))
    print('num features = {}'.format(num_features))
    print('positive tag = {}'.format(pos_tag))
    print('learning rate = {}'.format(learning_rate))
    print("batch_size = {}".format(batch_size))

    # read file from s3; the first key creates the dataset, the rest extend it
    lines = get_object(bucket, key[0]).read().decode('utf-8').split("\n")
    dataset = DenseLibsvmDataset(lines, num_features, pos_tag)
    for more_key in key[1:]:
        lines = get_object(bucket, more_key).read().decode('utf-8').split("\n")
        dataset.add_more(lines)
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s, dataset size = {}"
          .format(time.time() - preprocess_start, dataset_size))

    model = SVM(num_features, num_classes)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    def evaluate():
        """Return (correct, total, summed hinge loss) over the validation set."""
        correct = 0
        total = 0
        test_loss = 0.0
        for val_items, val_labels in validation_loader:
            val_items = Variable(val_items.view(-1, num_features))
            val_labels = Variable(val_labels)
            outputs = model(val_items)
            # Accumulate across batches (the loss used to be overwritten, so
            # only the last batch was reported). .item() keeps the running sum
            # detached from the autograd graph.
            test_loss += torch.mean(
                torch.clamp(1 - outputs.t() * val_labels.float(), min=0)).item()  # hinge loss
            _, predicted = torch.max(outputs.data, 1)
            total += val_labels.size(0)
            correct += (predicted == val_labels).sum()
        return correct, total, test_loss

    # Training the Model
    train_start = time.time()
    for epoch in range(num_epochs):
        epoch_start = time.time()
        epoch_loss = 0.0
        for batch_index, (items, labels) in enumerate(train_loader):
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels).float()

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = torch.mean(torch.clamp(1 - outputs.t() * labels, min=0))  # hinge loss
            loss += 0.01 * torch.mean(model.linear.weight ** 2) / 2.0  # l2 penalty
            # Track a plain float so each batch's graph can be freed.
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()

        # Pack weights and bias into one flat vector for the exchange.
        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        w_and_b = np.concatenate((w.flatten(), b.flatten()))
        cal_time = time.time() - epoch_start

        sync_start = time.time()
        postfix = "{}".format(epoch)
        w_b_merge = reduce_epoch(w_and_b, tmp_bucket, merged_bucket,
                                 num_workers, worker_index, postfix)
        n_weights = w_shape[0] * w_shape[1]
        # The merged vector is divided by the worker count to get the mean.
        w_mean = w_b_merge[:n_weights].reshape(w_shape) / float(num_workers)
        b_mean = w_b_merge[n_weights:].reshape(b_shape[0]) / float(num_workers)
        model.linear.weight.data = torch.from_numpy(w_mean)
        model.linear.bias.data = torch.from_numpy(b_mean)
        sync_time = time.time() - sync_start

        # Test the Model
        test_start = time.time()
        correct, total, test_loss = evaluate()
        test_time = time.time() - test_start

        print('Epoch: [%d/%d] has %d batches, Time: %.4f, Loss: %.4f, '
              'epoch cost %.4f: computation cost %.4f s communication cost %.4f s test cost %.4f s, '
              'accuracy of the model on the %d test samples: %d %%, loss = %f'
              % (epoch + 1, num_epochs, batch_index, time.time() - train_start,
                 epoch_loss, time.time() - epoch_start, cal_time, sync_time, test_time,
                 len(val_indices), 100 * correct / total, test_loss / total))

        # Rank 0 removes merged objects from earlier epochs.
        if worker_index == 0:
            delete_expired_merged_epoch(merged_bucket, epoch)

    # Test the Model
    correct, total, _ = evaluate()
    print('Accuracy of the model on the %d test samples: %d %%' % (len(val_indices), 100 * correct / total))

    if worker_index == 0:
        clear_bucket(merged_bucket)
        clear_bucket(tmp_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    """Train logistic regression on libsvm data with per-batch gradient averaging.

    Reads a single libsvm-formatted object, splits it into train/validation
    subsets, and for every mini-batch exchanges the flattened weight+bias
    gradient through ``reduce_batch`` before stepping with the averaged
    gradient. The validation set is scored once per epoch.

    Args:
        event: Invocation payload. Keys read: ``bucket_name``, ``rank``,
            ``num_workers``, ``file``, ``tmp_bucket``, ``merged_bucket``,
            ``num_classes``, ``num_features``, ``pos_tag``, ``num_epochs``,
            ``learning_rate``, ``batch_size``.
        context: Invocation context; not used.
    """
    start_time = time.time()

    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_classes = event['num_classes']
    num_features = event['num_features']
    pos_tag = event['pos_tag']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']

    # Echo the run configuration.
    for template, value in (
            ('bucket = {}', bucket),
            ("file = {}", key),
            ('tmp bucket = {}', tmp_bucket),
            ('merged bucket = {}', merged_bucket),
            ('number of workers = {}', num_workers),
            ('worker index = {}', worker_index),
            ('num epochs = {}', num_epochs),
            ('learning rate = {}', learning_rate),
            ("batch size = {}", batch_size),
    ):
        print(template.format(value))

    # Pull the raw text from S3 and split it into lines.
    raw_lines = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    dataset = DenseLibsvmDataset(raw_lines, num_features, pos_tag)
    total_count = len(dataset)
    pos_count = sum(1 for row in range(total_count) if dataset[row][1] == 1)
    print("{} positive observations out of {}".format(pos_count, total_count))
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Index-based train/validation split.
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    val_indices = indices[:split]
    train_indices = indices[split:]

    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=SubsetRandomSampler(train_indices))
    validation_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=SubsetRandomSampler(val_indices))

    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # CrossEntropyLoss applies the softmax itself.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_start = time.time()
    for epoch in range(num_epochs):
        epoch_start = time.time()
        epoch_loss = 0
        cal_time = 0
        sync_time = 0
        for batch_index, (items, labels) in enumerate(train_loader):
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Local forward/backward pass.
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            epoch_loss += loss.data
            loss.backward()

            # Pack weight and bias gradients into a single flat vector.
            w_grad = model.linear.weight.grad.data.numpy()
            b_grad = model.linear.bias.grad.data.numpy()
            w_grad_shape = w_grad.shape
            b_grad_shape = b_grad.shape
            n_weights = w_grad_shape[0] * w_grad_shape[1]
            w_b_grad = np.concatenate((w_grad.flatten(), b_grad.flatten()))
            cal_time += time.time() - batch_start

            # Exchange gradients, then install the per-worker mean.
            sync_start = time.time()
            postfix = "{}_{}".format(epoch, batch_index)
            w_b_grad_merge = reduce_batch(w_b_grad, tmp_bucket, merged_bucket,
                                          num_workers, worker_index, postfix)
            w_grad_merge = \
                w_b_grad_merge[:n_weights].reshape(w_grad_shape) / float(num_workers)
            b_grad_merge = \
                w_b_grad_merge[n_weights:].reshape(b_grad_shape[0]) / float(num_workers)
            model.linear.weight.grad = Variable(torch.from_numpy(w_grad_merge))
            model.linear.bias.grad = Variable(torch.from_numpy(b_grad_merge))
            sync_time += time.time() - sync_start

            optimizer.step()

        # Rank 0 prunes merged objects that are older than the current step.
        if worker_index == 0:
            delete_expired_merged_batch(merged_bucket, epoch, batch_index)

        # Score the validation set once per epoch.
        test_start = time.time()
        correct = 0
        total = 0
        test_loss = 0
        for val_items, val_labels in validation_loader:
            val_items = Variable(val_items.view(-1, num_features))
            val_labels = Variable(val_labels)
            scores = model(val_items)
            test_loss += criterion(scores, val_labels).data
            _, predicted = torch.max(scores.data, 1)
            total += val_labels.size(0)
            correct += (predicted == val_labels).sum()
        test_time = time.time() - test_start

        print(
            'Epoch %d has %d batches, time = %.4f, epoch cost %.4f s: '
            'computation cost %.4f s communication cost %.4f s, '
            'train loss = %.4f, test cost %.4f s, accuracy of the model on the %d test samples: %d %%, loss = %f'
            % (epoch, batch_index, time.time() - train_start,
               time.time() - epoch_start, cal_time, sync_time, epoch_loss,
               test_time, len(val_indices), 100 * correct / total,
               test_loss / total))

    if worker_index == 0:
        clear_bucket(merged_bucket)
        clear_bucket(tmp_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    """Train logistic regression with ADMM-style consensus across workers.

    Each ADMM round runs ``num_epochs`` local SGD epochs on the cross-entropy
    loss plus a penalty on ``w + u - z``, then exchanges the flattened
    ``u``/weights/bias through ``reduce_epoch``. The bias is replaced by the
    per-worker mean; the weights stay local and are coupled through
    ``update_z`` and the ``u`` update. The validation set is scored after
    every local epoch, after every ADMM round, and once at the end.

    Args:
        event: Invocation payload. Keys read: ``bucket_name``, ``rank``,
            ``num_workers``, ``file`` (comma-separated keys), ``tmp_bucket``,
            ``merged_bucket``, ``num_classes``, ``num_features``, ``pos_tag``,
            ``num_epochs``, ``num_admm_epochs``, ``learning_rate``,
            ``batch_size``, ``lambda``, ``rho``.
        context: Invocation context; not used.
    """
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file'].split(",")
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_classes = event['num_classes']
    num_features = event['num_features']
    pos_tag = event['pos_tag']
    num_epochs = event['num_epochs']
    num_admm_epochs = event['num_admm_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']
    lam = event['lambda']
    rho = event['rho']

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('tmp bucket = {}'.format(tmp_bucket))
    print('merge bucket = {}'.format(merged_bucket))
    print('num epochs = {}'.format(num_epochs))
    print('num admm epochs = {}'.format(num_admm_epochs))
    print('num classes = {}'.format(num_classes))
    print('num features = {}'.format(num_features))
    print('positive tag = {}'.format(pos_tag))
    print('learning rate = {}'.format(learning_rate))
    print("batch_size = {}".format(batch_size))
    print("lambda = {}".format(lam))
    print("rho = {}".format(rho))

    # read file from s3; the first key creates the dataset, the rest extend it
    lines = get_object(bucket, key[0]).read().decode('utf-8').split("\n")
    dataset = DenseLibsvmDataset(lines, num_features, pos_tag)
    for more_key in key[1:]:
        lines = get_object(bucket, more_key).read().decode('utf-8').split("\n")
        dataset.add_more(lines)
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = LogisticRegression(num_features, num_classes).double()
    print("size of w = {}".format(model.linear.weight.data.size()))

    z, u = initialize_z_and_u(model.linear.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

    # CrossEntropyLoss applies the softmax itself.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    def evaluate():
        """Return (correct, total, summed loss) over the validation set."""
        correct = 0
        total = 0
        test_loss = 0
        for val_items, val_labels in validation_loader:
            val_items = Variable(val_items.view(-1, num_features))
            val_labels = Variable(val_labels)
            outputs = model(val_items.double())
            test_loss += criterion(outputs, val_labels).data
            _, predicted = torch.max(outputs.data, 1)
            total += val_labels.size(0)
            correct += (predicted == val_labels).sum()
        return correct, total, test_loss

    # Training the Model
    train_start = time.time()
    for admm_epoch in range(num_admm_epochs):
        admm_epoch_start = time.time()
        # u and z only change at the end of an ADMM round, so the offset used
        # by the penalty term is built once per round rather than per batch.
        u_z = torch.from_numpy(u).double() - torch.from_numpy(z).double()

        for epoch in range(num_epochs):
            epoch_start = time.time()
            epoch_loss = 0.0
            for batch_index, (items, labels) in enumerate(train_loader):
                items = Variable(items.view(-1, num_features))
                labels = Variable(labels)

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                outputs = model(items.double())
                classify_loss = criterion(outputs, labels)
                epoch_loss += classify_loss.item()

                # Build the total loss out-of-place so classify_loss itself is
                # not mutated by the penalty term.
                # NOTE(review): the penalty uses the plain 2-norm, not its
                # square -- confirm this matches the intended ADMM objective.
                loss = classify_loss
                for name, param in model.named_parameters():
                    if name.split('.')[-1] == "weight":
                        loss = loss + rho / 2.0 * torch.norm(param + u_z, p=2)

                # The graph is used for a single backward pass, so it is not
                # retained.
                loss.backward()
                optimizer.step()

            # Test the Model
            test_start = time.time()
            correct, total, test_loss = evaluate()
            test_time = time.time() - test_start

            print(
                'ADMM Epoch: [%d/%d], Epoch: [%d/%d], Batch [%d], '
                'Time: %.4f, Loss: %.4f, epoch cost %.4f, test cost %.4f s, '
                'accuracy of the model on the %d test samples: %d %%, loss = %f'
                % (admm_epoch, num_admm_epochs, epoch, num_epochs, batch_index,
                   time.time() - train_start, epoch_loss,
                   time.time() - epoch_start, test_time, len(val_indices),
                   100 * correct / total, test_loss / total))

        # Pack u, weights and bias into one flat vector for the exchange.
        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        u_shape = u.shape

        w_and_b = np.concatenate((w.flatten(), b.flatten()))
        u_w_b = np.concatenate((u.flatten(), w_and_b.flatten()))
        cal_time = time.time() - admm_epoch_start

        sync_start = time.time()
        postfix = "{}".format(admm_epoch)
        u_w_b_merge = reduce_epoch(u_w_b, tmp_bucket, merged_bucket,
                                   num_workers, worker_index, postfix)

        n_u = u_shape[0] * u_shape[1]
        n_w = w_shape[0] * w_shape[1]
        # The merged vector is divided by the worker count to get the means.
        u_mean = u_w_b_merge[:n_u].reshape(u_shape) / float(num_workers)
        w_mean = u_w_b_merge[n_u:n_u + n_w].reshape(w_shape) / float(num_workers)
        b_mean = u_w_b_merge[n_u + n_w:].reshape(b_shape[0]) / float(num_workers)
        # Only the bias is overwritten with the mean; weights remain local.
        model.linear.bias.data = torch.from_numpy(b_mean)
        sync_time = time.time() - sync_start

        # Rank 0 removes merged objects from earlier ADMM rounds.
        if worker_index == 0:
            delete_expired_merged_epoch(merged_bucket, admm_epoch)

        z = update_z(w_mean, u_mean, rho, num_workers, lam)
        u = u + model.linear.weight.data.numpy() - z

        # Test the Model
        test_start = time.time()
        correct, total, test_loss = evaluate()
        test_time = time.time() - test_start

        print(
            'ADMM Epoch: [%d/%d], Time: %.4f, Loss: %.4f, '
            'ADMM epoch cost %.4f: computation cost %.4f s communication cost %.4f s test cost %.4f s, '
            'accuracy of the model on the %d test samples: %d %%, loss = %f'
            % (admm_epoch, num_admm_epochs, time.time() - train_start,
               epoch_loss, time.time() - admm_epoch_start, cal_time,
               sync_time, test_time, len(val_indices), 100 * correct / total,
               test_loss / total))

    # Test the Model
    correct, total, test_loss = evaluate()
    print(
        'Time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
        % (time.time() - train_start, len(val_indices), 100 * correct / total,
           test_loss / total))

    if worker_index == 0:
        clear_bucket(merged_bucket)
        clear_bucket(tmp_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    """Train a linear SVM with ADMM-style consensus across workers.

    Each ADMM round runs ``num_epochs`` local SGD epochs on the hinge loss
    plus a penalty on ``w + u - z``, then exchanges the flattened
    ``u``/weights/bias through ``reduce_epoch``. The bias is replaced by the
    per-worker mean; the weights stay local and are coupled through
    ``update_z`` and the ``u`` update. The validation set is scored after
    every local epoch and after every ADMM round.

    NOTE(review): ``num_features`` and ``num_classes`` are not read from
    ``event`` here, so they must come from module scope -- confirm.

    Args:
        event: Invocation payload. Keys read: ``bucket_name``, ``rank``,
            ``num_workers``, ``file``, ``tmp_bucket``, ``merged_bucket``,
            ``num_epochs``, ``num_admm_epochs``, ``learning_rate``,
            ``lambda``, ``rho``, ``batch_size``.
        context: Invocation context; not used.
    """
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_epochs = event['num_epochs']
    num_admm_epochs = event['num_admm_epochs']
    learning_rate = event['learning_rate']
    lam = event['lambda']
    rho = event['rho']
    batch_size = event['batch_size']

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('tmp bucket = {}'.format(tmp_bucket))
    print('merge bucket = {}'.format(merged_bucket))
    print('num epochs = {}'.format(num_epochs))
    print('num admm epochs = {}'.format(num_admm_epochs))
    print('learning rate = {}'.format(learning_rate))
    print("lambda = {}".format(lam))
    print("rho = {}".format(rho))
    print("batch_size = {}".format(batch_size))

    # read file from s3
    lines = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    dataset = DenseDatasetWithLines(lines, num_features)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = SVM(num_features, num_classes).float()
    print("size of w = {}".format(model.linear.weight.data.size()))

    z, u = initialize_z_and_u(model.linear.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    def evaluate():
        """Return (correct, total, summed hinge loss) over the validation set."""
        correct = 0
        total = 0
        test_loss = 0.0
        for val_items, val_labels in validation_loader:
            val_items = Variable(val_items.view(-1, num_features))
            val_labels = Variable(val_labels)
            outputs = model(val_items)
            # .item() keeps the running sum detached from the autograd graph.
            test_loss += torch.mean(
                torch.clamp(1 - outputs.t() * val_labels.float(), min=0)).item()
            _, predicted = torch.max(outputs.data, 1)
            total += val_labels.size(0)
            correct += (predicted == val_labels).sum()
        return correct, total, test_loss

    # Training the Model
    train_start = time.time()
    for admm_epoch in range(num_admm_epochs):
        print("ADMM Epoch >>> {}".format(admm_epoch))
        # u and z only change at the end of an ADMM round, so the offset used
        # by the penalty term is built once per round rather than per batch.
        u_z = torch.from_numpy(u).float() - torch.from_numpy(z).float()

        for epoch in range(num_epochs):
            epoch_start = time.time()
            epoch_loss = 0.0
            for batch_index, (items, labels) in enumerate(train_loader):
                items = Variable(items.view(-1, num_features))
                labels = Variable(labels)

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                outputs = model(items)
                classify_loss = torch.mean(
                    torch.clamp(1 - outputs.t() * labels.float(), min=0))  # hinge loss
                # Track a plain float so each batch's graph can be freed.
                epoch_loss += classify_loss.item()

                # Build the total loss out-of-place so classify_loss itself is
                # not mutated by the penalty term.
                # NOTE(review): the penalty uses the plain 2-norm, not its
                # square -- confirm this matches the intended ADMM objective.
                loss = classify_loss
                for name, param in model.named_parameters():
                    if name.split('.')[-1] == "weight":
                        loss = loss + rho / 2.0 * torch.norm(param + u_z, p=2)

                # The graph is used for a single backward pass, so it is not
                # retained.
                loss.backward()
                optimizer.step()

            train_time = time.time() - epoch_start

            # Test the Model
            test_start = time.time()
            correct, total, test_loss = evaluate()
            test_time = time.time() - test_start

            print(
                'Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                'train cost %.4f s, test cost %.4f s: '
                'accuracy of the model on the %d test samples: %d %%, test loss = %f'
                % (epoch + 1, num_epochs, batch_index + 1,
                   len(train_indices) / batch_size, time.time() - train_start,
                   epoch_loss, time.time() - epoch_start, train_time,
                   test_time, len(val_indices), 100 * correct / total,
                   test_loss / total))

        # Pack u, weights and bias into one flat vector for the exchange.
        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        u_shape = u.shape

        w_and_b = np.concatenate((w.flatten(), b.flatten()))
        u_w_b = np.concatenate((u.flatten(), w_and_b.flatten()))
        # Measured from the start of the last local epoch, as logged below.
        cal_time = time.time() - epoch_start
        print("Epoch {} calculation cost = {} s".format(epoch, cal_time))

        sync_start = time.time()
        postfix = "{}".format(admm_epoch)
        u_w_b_merge = reduce_epoch(u_w_b, tmp_bucket, merged_bucket,
                                   num_workers, worker_index, postfix)

        n_u = u_shape[0] * u_shape[1]
        n_w = w_shape[0] * w_shape[1]
        # The merged vector is divided by the worker count to get the means.
        u_mean = u_w_b_merge[:n_u].reshape(u_shape) / float(num_workers)
        w_mean = u_w_b_merge[n_u:n_u + n_w].reshape(w_shape) / float(num_workers)
        b_mean = u_w_b_merge[n_u + n_w:].reshape(b_shape[0]) / float(num_workers)
        # Only the bias is overwritten with the mean; weights remain local.
        model.linear.bias.data = torch.from_numpy(b_mean).float()
        sync_time = time.time() - sync_start
        print("Epoch {} synchronization cost {} s".format(epoch, sync_time))

        # Rank 0 removes merged objects from earlier ADMM rounds.
        if worker_index == 0:
            delete_expired_merged_epoch(merged_bucket, admm_epoch)

        z = update_z(w_mean, u_mean, rho, num_workers, lam)
        u = u + model.linear.weight.data.numpy() - z

        # Test the Model
        correct, total, test_loss = evaluate()
        print(
            'Epoch: %d, time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
            % (epoch, time.time() - train_start, len(val_indices),
               100 * correct / total, test_loss / total))

    if worker_index == 0:
        clear_bucket(merged_bucket)
        clear_bucket(tmp_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    """Train a logistic-regression worker and average its state with its peers.

    Reads the training file named by ``event['file']`` from
    ``event['bucket_name']``, splits it into training and validation subsets,
    then runs ``event['num_epochs']`` epochs of mini-batch updates.  At the end
    of every epoch the flattened ``lr.grad`` and ``lr.bias`` are passed through
    ``reduce_epoch`` and divided by the number of workers before the
    validation pass.

    Args:
        event: dict with the keys ``bucket_name``, ``rank``, ``num_workers``,
            ``file``, ``tmp_bucket``, ``merged_bucket``, ``num_features``,
            ``learning_rate``, ``batch_size``, ``num_epochs`` and
            ``validation_ratio``.
        context: Lambda context object; not used.

    Returns:
        None.  Any exception is reported on stdout/stderr and swallowed, so
        the invocation itself does not fail.
    """
    try:
        start_time = time.time()
        bucket_name = event['bucket_name']
        worker_index = event['rank']
        num_workers = event['num_workers']
        key = event['file']
        tmp_bucket = event['tmp_bucket']
        merged_bucket = event['merged_bucket']
        num_features = event['num_features']
        learning_rate = event["learning_rate"]
        batch_size = event["batch_size"]
        num_epochs = event["num_epochs"]
        validation_ratio = event["validation_ratio"]

        # read file from s3
        lines = get_object(bucket_name, key).read().decode('utf-8').split("\n")
        print("read data cost {} s".format(time.time() - start_time))

        parse_start = time.time()
        dataset = SparseDatasetWithLines(lines, num_features)
        print("parse data cost {} s".format(time.time() - parse_start))

        preprocess_start = time.time()
        dataset_size = len(dataset)
        indices = list(range(dataset_size))
        split = int(np.floor(validation_ratio * dataset_size))
        if shuffle_dataset:
            np.random.seed(random_seed)
            np.random.shuffle(indices)
        train_indices, val_indices = indices[split:], indices[:split]
        train_set = [dataset[i] for i in train_indices]
        val_set = [dataset[i] for i in val_indices]
        print("preprocess data cost {} s".format(time.time() - preprocess_start))

        lr = LogisticRegression(train_set, val_set, num_features, num_epochs,
                                learning_rate, batch_size)

        # Training the Model
        train_start = time.time()
        for epoch in range(num_epochs):
            epoch_start = time.time()
            num_batches = math.floor(len(train_set) / batch_size)
            train_loss = Loss()
            train_acc = Accuracy()

            for batch_idx in range(num_batches):
                batch_ins, batch_label = lr.next_batch(batch_idx)
                batch_grad = torch.zeros(lr.n_input, 1, requires_grad=False)
                # Plain float: the ``np.float`` alias no longer exists in NumPy.
                batch_bias = 0.0
                for sample, label in zip(batch_ins, batch_label):
                    z = lr.forward(sample)
                    h = lr.sigmoid(z)
                    loss = lr.loss(h, label)
                    train_loss.update(loss, 1)
                    train_acc.update(h, label)
                    g = lr.backward(sample, h.item(), label)
                    batch_grad.add_(g)
                    batch_bias += np.sum(h.item() - label)
                batch_grad = batch_grad.div(len(batch_ins))
                batch_bias = batch_bias / len(batch_ins)
                batch_grad.mul_(-1.0 * learning_rate)
                lr.grad.add_(batch_grad)
                lr.bias = lr.bias - batch_bias * learning_rate

            cal_time = time.time() - epoch_start

            sync_start = time.time()
            grad_dtype = lr.grad.dtype
            np_grad = lr.grad.numpy().flatten()
            # reshape(-1) turns a scalar bias into a 1-element vector;
            # np.concatenate rejects zero-dimensional inputs.
            np_bias = np.array(lr.bias, dtype=np_grad.dtype).reshape(-1)
            w_and_b = np.concatenate((np_grad, np_bias))
            postfix = "{}".format(epoch)
            w_b_merge = reduce_epoch(w_and_b, tmp_bucket, merged_bucket,
                                     num_workers, worker_index, postfix)
            mean_grad = w_b_merge[:-1].reshape(num_features, 1) / float(num_workers)
            mean_bias = float(w_b_merge[-1]) / float(num_workers)
            # Keep lr.grad a torch tensor: the next epoch calls
            # lr.grad.add_() and lr.grad.numpy() on it.
            lr.grad = torch.as_tensor(mean_grad, dtype=grad_dtype)
            lr.bias = mean_bias
            sync_time = time.time() - sync_start

            test_start = time.time()
            val_loss, val_acc = lr.evaluate()
            test_time = time.time() - test_start

            # num_batches equals the last batch_idx + 1 and, unlike batch_idx,
            # is defined even when the batch loop never ran.
            print(
                'Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %s, Accuracy: %s, epoch cost %.4f, '
                'cal cost %.4f s, sync cost %.4f s, test cost %.4f s, '
                'test accuracy: %s %%, test loss: %s'
                % (epoch + 1, num_epochs, num_batches, num_batches,
                   time.time() - train_start, train_loss, train_acc,
                   time.time() - epoch_start, cal_time, sync_time, test_time,
                   val_acc, val_loss))

        if worker_index == 0:
            clear_bucket(tmp_bucket)
            clear_bucket(merged_bucket)
        print("Elapsed time = {} s".format(time.time() - start_time))
    except Exception as e:
        # Best-effort handler: report the failure (with its traceback, so the
        # failing line is visible in the logs) but do not propagate it.
        import traceback
        print("Error {}".format(e))
        traceback.print_exc()
def handler(event, context):
    """Train a torch logistic-regression model and average it every epoch.

    Downloads the text file ``event['file']`` from ``event['bucket_name']``,
    builds train/validation loaders over it, and runs ``event['num_epochs']``
    epochs of SGD with a cross-entropy loss.  After each epoch the flattened
    weight and bias are passed through ``reduce_epoch`` and divided by the
    number of workers, and the result is written back into the model.  Worker
    0 removes expired merged objects after each epoch and clears both
    scratch buckets at the end.

    ``num_features``, ``num_classes``, ``validation_ratio``,
    ``shuffle_dataset`` and ``random_seed`` are not read from ``event``; they
    are taken from the enclosing module.

    Args:
        event: dict with the keys ``bucket_name``, ``rank``, ``num_workers``,
            ``file``, ``tmp_bucket``, ``merged_bucket``, ``num_epochs``,
            ``learning_rate`` and ``batch_size``.
        context: Lambda context object; not used.
    """
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('tmp bucket = {}'.format(tmp_bucket))
    print('merged bucket = {}'.format(merged_bucket))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('num epochs = {}'.format(num_epochs))
    print('learning rate = {}'.format(learning_rate))
    print("batch size = {}".format(batch_size))

    # read file from s3
    lines = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    dataset = DenseDatasetWithLines(lines, num_features)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    def evaluate(with_loss):
        """Return (correct, total, summed loss) over the validation loader."""
        correct = 0
        total = 0
        loss_sum = 0
        # Evaluation needs no gradients; skip building autograd graphs.
        with torch.no_grad():
            for items, labels in validation_loader:
                outputs = model(items.view(-1, num_features))
                if with_loss:
                    loss_sum += criterion(outputs, labels)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()
        return correct, total, loss_sum

    # Training the Model
    train_start = time.time()
    for epoch in range(num_epochs):
        epoch_start = time.time()
        epoch_loss = 0
        # Defaults so the log line below still works if the loader is empty.
        batch_index = -1
        batch_start = epoch_start
        for batch_index, (items, labels) in enumerate(train_loader):
            batch_start = time.time()
            items = items.view(-1, num_features)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            epoch_loss += loss.data
            loss.backward()
            optimizer.step()

        # Test the Model
        test_start = time.time()
        correct, total, test_loss = evaluate(with_loss=True)
        test_time = time.time() - test_start

        # An empty validation split would otherwise divide by zero.
        accuracy = 100 * correct / total if total else 0
        mean_test_loss = test_loss / total if total else 0.0
        print(
            'Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
            'batch cost %.4f s: test cost %.4f s, '
            'accuracy of the model on the %d test samples: %d %%, loss = %f'
            % (epoch + 1, num_epochs, batch_index + 1,
               len(train_indices) / batch_size, time.time() - train_start,
               float(epoch_loss), time.time() - epoch_start,
               time.time() - batch_start, test_time, len(val_indices),
               accuracy, mean_test_loss))

        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        w_and_b = np.concatenate((w.flatten(), b.flatten()))
        cal_time = time.time() - epoch_start
        print("Epoch {} calculation cost = {} s".format(epoch, cal_time))

        sync_start = time.time()
        postfix = "{}".format(epoch)
        merged = reduce_epoch(w_and_b, tmp_bucket, merged_bucket,
                              num_workers, worker_index, postfix)

        # The merged vector is laid out as [weights..., bias...].
        n_weights = w_shape[0] * w_shape[1]
        w_mean = merged[:n_weights].reshape(w_shape) / float(num_workers)
        b_mean = merged[n_weights:].reshape(b_shape[0]) / float(num_workers)
        model.linear.weight.data = torch.from_numpy(w_mean)
        model.linear.bias.data = torch.from_numpy(b_mean)
        sync_time = time.time() - sync_start
        print("Epoch {} synchronization cost {} s".format(epoch, sync_time))

        if worker_index == 0:
            delete_expired_merged_epoch(merged_bucket, epoch)

    # Test the Model
    correct, total, _ = evaluate(with_loss=False)
    accuracy = 100 * correct / total if total else 0
    print('Accuracy of the model on the %d test samples: %d %%' %
          (len(val_indices), accuracy))

    if worker_index == 0:
        clear_bucket(merged_bucket)
        clear_bucket(tmp_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    """Train a logistic-regression worker with ADMM-style consensus rounds.

    Downloads this worker's ``features_<rank>_<n>.npy`` and
    ``labels_<rank>_<n>.npy`` files from ``event['bucket_name']``, builds
    train/validation loaders, and runs ``event['num_admm_epochs']`` rounds.
    Each round performs ``event['num_epochs']`` epochs of SGD on the
    cross-entropy loss plus a ``rho / 2 * ||w + u - z||`` penalty on the
    weight matrix.  The round then passes ``[u, w, b]`` through
    ``reduce_epoch``, divides by the number of workers, and refreshes ``z``
    (via ``update_z``), ``u`` and the model bias.

    ``validation_ratio``, ``shuffle_dataset``, ``random_seed`` and
    ``local_dir`` are not read from ``event``; they are taken from the
    enclosing module.

    Args:
        event: dict with the keys ``bucket_name``, ``rank``, ``num_workers``,
            ``file``, ``tmp_bucket``, ``merged_bucket``, ``num_classes``,
            ``num_features``, ``num_epochs``, ``num_admm_epochs``,
            ``learning_rate``, ``lambda``, ``rho`` and ``batch_size``.
        context: Lambda context object; not used.

    Raises:
        AssertionError: if the feature and label matrices have different
            numbers of rows.
    """
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_classes = event['num_classes']
    num_features = event['num_features']
    num_epochs = event['num_epochs']
    num_admm_epochs = event['num_admm_epochs']
    learning_rate = event['learning_rate']
    lam = event['lambda']
    rho = event['rho']
    batch_size = event['batch_size']

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('tmp bucket = {}'.format(tmp_bucket))
    print('merged bucket = {}'.format(merged_bucket))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('num epochs = {}'.format(num_epochs))
    print('learning rate = {}'.format(learning_rate))
    print("batch size = {}".format(batch_size))

    s3 = boto3.client('s3')

    feature_file_name = "features_{}_{}.npy".format(worker_index, num_workers)
    label_file_name = "labels_{}_{}.npy".format(worker_index, num_workers)
    feature_path = local_dir + str(feature_file_name)
    label_path = local_dir + str(label_file_name)

    # read file from s3
    s3.download_file(bucket, feature_file_name, feature_path)
    features_matrix = np.load(feature_path)
    print("read features matrix cost {} s".format(time.time() - start_time))
    print("feature matrix shape = {}, dtype = {}".format(
        features_matrix.shape, features_matrix.dtype))
    print("feature matrix sample = {}".format(features_matrix[0]))
    row_features = features_matrix.shape[0]
    col_features = features_matrix.shape[1]

    s3.download_file(bucket, label_file_name, label_path)
    labels_matrix = np.load(label_path)
    print("read label matrix cost {} s".format(time.time() - start_time))
    print("label matrix shape = {}, dtype = {}".format(labels_matrix.shape,
                                                       labels_matrix.dtype))
    print("label matrix sample = {}".format(labels_matrix[0:10]))
    row_labels = labels_matrix.shape[0]

    if row_features != row_labels:
        raise AssertionError(
            "row of feature matrix is {}, but row of label matrix is {}.".
            format(row_features, row_labels))

    parse_start = time.time()
    dataset = DenseDatasetWithNP(col_features, features_matrix, labels_matrix)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = LogisticRegression(num_features, num_classes).double()
    print("size of w = {}".format(model.linear.weight.data.size()))

    z, u = initialize_z_and_u(model.linear.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    def evaluate():
        """Return (correct, total, summed loss) over the validation loader."""
        correct = 0
        total = 0
        loss_sum = 0
        # Evaluation needs no gradients; skip building autograd graphs.
        with torch.no_grad():
            for items, labels in validation_loader:
                outputs = model(items.view(-1, num_features).double())
                loss_sum += criterion(outputs, labels)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()
        return correct, total, loss_sum

    # Training the Model
    train_start = time.time()
    for admm_epoch in range(num_admm_epochs):
        print("ADMM Epoch >>> {}".format(admm_epoch))
        for epoch in range(num_epochs):
            epoch_start = time.time()
            epoch_loss = 0
            # Defaults so the log line below still works if the loader is empty.
            batch_index = -1
            batch_start = epoch_start
            # u and z are only reassigned in the consensus step below, so the
            # offset is built once per epoch instead of once per batch.
            u_z = torch.from_numpy(u).double() - torch.from_numpy(z).double()
            for batch_index, (items, labels) in enumerate(train_loader):
                batch_start = time.time()
                items = items.view(-1, num_features)

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                outputs = model(items.double())
                classify_loss = criterion(outputs, labels)
                epoch_loss += classify_loss.data
                loss = classify_loss
                # NOTE(review): the penalty uses the plain 2-norm, not its
                # square - confirm this matches the intended ADMM objective.
                for name, param in model.named_parameters():
                    if name.split('.')[-1] == "weight":
                        loss = loss + rho / 2.0 * torch.norm(param + u_z, p=2)
                # The graph is rebuilt on every batch, so it need not be
                # retained after the backward pass.
                loss.backward()
                optimizer.step()

            # Test the Model
            test_start = time.time()
            correct, total, test_loss = evaluate()
            test_time = time.time() - test_start

            # An empty validation split would otherwise divide by zero.
            accuracy = 100 * correct / total if total else 0
            mean_test_loss = test_loss / total if total else 0.0
            print(
                'Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                'batch cost %.4f s: test cost %.4f s, '
                'accuracy of the model on the %d test samples: %d %%, loss = %f'
                % (epoch + 1, num_epochs, batch_index + 1,
                   len(train_indices) / batch_size, time.time() - train_start,
                   float(epoch_loss), time.time() - epoch_start,
                   time.time() - batch_start, test_time, len(val_indices),
                   accuracy, mean_test_loss))

        # Consensus step: exchange [u, w, b] with the other workers.  The
        # reduce key is the ADMM round, so this runs once per round.
        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        u_shape = u.shape
        u_w_b = np.concatenate((u.flatten(), w.flatten(), b.flatten()))
        cal_time = time.time() - epoch_start
        print("Epoch {} calculation cost = {} s".format(epoch, cal_time))

        sync_start = time.time()
        postfix = "{}".format(admm_epoch)
        merged = reduce_epoch(u_w_b, tmp_bucket, merged_bucket,
                              num_workers, worker_index, postfix)

        # The merged vector is laid out as [u..., w..., b...].
        u_end = u_shape[0] * u_shape[1]
        w_end = u_end + w_shape[0] * w_shape[1]
        u_mean = merged[:u_end].reshape(u_shape) / float(num_workers)
        w_mean = merged[u_end:w_end].reshape(w_shape) / float(num_workers)
        b_mean = merged[w_end:].reshape(b_shape[0]) / float(num_workers)
        # Only the bias is overwritten with the average; the local weights
        # are kept and feed the u update below.
        model.linear.bias.data = torch.from_numpy(b_mean)
        sync_time = time.time() - sync_start
        print("Epoch {} synchronization cost {} s".format(epoch, sync_time))

        if worker_index == 0:
            delete_expired_merged_epoch(merged_bucket, admm_epoch)

        z = update_z(w_mean, u_mean, rho, num_workers, lam)
        u = u + model.linear.weight.data.numpy() - z

    # Test the Model
    correct, total, test_loss = evaluate()
    accuracy = 100 * correct / total if total else 0
    mean_test_loss = test_loss / total if total else 0.0
    print(
        'Epoch: %d, time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
        % (epoch, time.time() - train_start, len(val_indices), accuracy,
           mean_test_loss))

    if worker_index == 0:
        clear_bucket(merged_bucket)
        clear_bucket(tmp_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))