def handler(event, context):
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    assert dataset_type == "sparse_libsvm"
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Sparse_Linear_Models
    assert optim.lower() in Optimization.All
    assert sync_mode.lower() in Synchronization.All

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']

    shuffle_dataset = True
    random_seed = 100

    print('bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

    storage = S3Storage()
    communicator = S3Communicator(storage, tmp_bucket, merged_bucket, n_workers, worker_index)

    # Read file from S3
    read_start = time.time()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # split train set and test set
    train_set = [dataset[i] for i in train_indices]
    n_train_batch = math.floor(len(train_set) / batch_size)
    val_set = [dataset[i] for i in val_indices]
    print("preprocess data cost {} s, dataset size = {}"
          .format(time.time() - preprocess_start, dataset_size))

    model = linear_models.get_sparse_model(model_name, train_set, val_set, n_features,
                                           n_epochs, learning_rate, batch_size)

    train_start = time.time()
    # Training the Model
    for epoch in range(n_epochs):
        epoch_start = time.time()
        epoch_cal_time = 0
        epoch_comm_time = 0
        epoch_loss = 0.
        for batch_idx in range(n_train_batch):
            batch_start = time.time()
            batch_loss, batch_acc = model.one_batch()
            epoch_loss += batch_loss.average

            if optim == "grad_avg":
                if sync_mode == "reduce" or sync_mode == "reduce_scatter":
                    w_b = np.concatenate((model.weight.numpy().flatten(),
                                          np.array([model.bias], dtype=np.float32)))
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time

                    batch_comm_start = time.time()
                    postfix = "{}_{}".format(epoch, batch_idx)
                    if sync_mode == "reduce":
                        w_b_merge = communicator.reduce_batch(w_b, postfix)
                    elif sync_mode == "reduce_scatter":
                        w_b_merge = communicator.reduce_scatter_batch(w_b, postfix)

                    w_merge = w_b_merge[:n_features] / float(n_workers)
                    b_merge = w_b_merge[-1] / float(n_workers)
                    model.weight = torch.from_numpy(w_merge).reshape(n_features, 1)
                    model.bias = float(b_merge)

                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time
                elif sync_mode == "async":
                    w_b = np.concatenate((model.weight.numpy().flatten(),
                                          np.array([model.bias], dtype=np.float32)))
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time

                    batch_comm_start = time.time()
                    # initialize the global model
                    if worker_index == 0 and epoch == 0 and batch_idx == 0:
                        storage.save(w_b.tobytes(), Prefix.w_b_prefix, merged_bucket)
                    w_b_merge = communicator.async_reduce(w_b, Prefix.w_b_prefix)

                    # async does not need averaging
                    w_merge = w_b_merge[:n_features]
                    b_merge = w_b_merge[-1]
                    model.weight = torch.from_numpy(w_merge).reshape(n_features, 1)
                    model.bias = float(b_merge)

                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time

            if batch_idx % 10 == 0:
                print('Epoch: [%d/%d], Batch: [%d/%d], Time: %.4f s, Loss: %.4f, '
                      'Accuracy: %.4f, batch cost %.4f s'
                      % (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                         time.time() - train_start, batch_loss.average,
                         batch_acc.accuracy, time.time() - batch_start))

        if optim == "model_avg":
            w_b = np.concatenate((model.weight.numpy().flatten(),
                                  np.array([model.bias], dtype=np.float32)))
            epoch_cal_time += time.time() - epoch_start

            epoch_sync_start = time.time()
            postfix = str(epoch)
            if sync_mode == "reduce":
                w_b_merge = communicator.reduce_epoch(w_b, postfix)
            elif sync_mode == "reduce_scatter":
                w_b_merge = communicator.reduce_scatter_epoch(w_b, postfix)
            elif sync_mode == "async":
                if worker_index == 0 and epoch == 0:
                    storage.save(w_b.tobytes(), Prefix.w_b_prefix, merged_bucket)
                w_b_merge = communicator.async_reduce(w_b, Prefix.w_b_prefix)

            w_merge = w_b_merge[:n_features]
            b_merge = w_b_merge[-1]
            # async does not need averaging
            if sync_mode == "reduce" or sync_mode == "reduce_scatter":
                w_merge = w_merge / float(n_workers)
                b_merge = b_merge / float(n_workers)
            model.weight = torch.from_numpy(w_merge).reshape(n_features, 1)
            model.bias = float(b_merge)

            print("one {} round cost {} s".format(sync_mode, time.time() - epoch_sync_start))
            epoch_comm_time += time.time() - epoch_sync_start

        if worker_index == 0:
            delete_start = time.time()
            # model_avg deletes merged files by epoch, grad_avg by batch
            if optim == "model_avg" and sync_mode != "async":
                communicator.delete_expired_epoch(epoch)
            elif optim == "grad_avg" and sync_mode != "async":
                communicator.delete_expired_batch(epoch, batch_idx)
            epoch_comm_time += time.time() - delete_start

        # Test the Model
        test_start = time.time()
        test_loss, test_acc = model.evaluate()
        test_time = time.time() - test_start

        print("Epoch: [{}/{}] finishes, Batch: [{}/{}], Time: {:.4f}, Loss: {:.4f}, "
              "epoch cost {:.4f} s, calculation cost = {:.4f} s, "
              "synchronization cost {:.4f} s, test cost {:.4f} s, "
              "accuracy of the model on the {} test samples: {}, loss = {}"
              .format(epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                      time.time() - train_start, epoch_loss, time.time() - epoch_start,
                      epoch_cal_time, epoch_comm_time, test_time,
                      len(val_set), test_acc.accuracy, test_loss.average))

    if worker_index == 0:
        storage.clear(tmp_bucket)
        storage.clear(merged_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
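
# --- Example invocation (sketch) ---------------------------------------------
# A minimal sample event for the S3-based handler above, assuming it is deployed
# as an AWS Lambda entry point. The keys mirror the fields the handler reads;
# every value (bucket names, shard name, sizes, model name) is hypothetical.
sample_event_s3 = {
    "file": "part_0_of_10",            # hypothetical data shard in data_bucket
    "data_bucket": "my-libsvm-data",   # hypothetical S3 bucket with the dataset
    "dataset_type": "sparse_libsvm",
    "n_features": 123,
    "n_classes": 2,
    "n_workers": 10,
    "worker_index": 0,
    "tmp_bucket": "my-tmp-bucket",     # hypothetical bucket for intermediate files
    "merged_bucket": "my-merged-bucket",
    "model": "sparse_lr",              # must be in MLModel.Sparse_Linear_Models
    "optim": "grad_avg",               # or "model_avg"
    "sync_mode": "reduce",             # "reduce", "reduce_scatter", or "async"
    "lr": 0.01,
    "batch_size": 100,
    "n_epochs": 10,
    "valid_ratio": 0.2,
}
# handler(sample_event_s3, None)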
def handler(event, context):
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Sparse_Linear_Models
    assert optim.lower() == Optimization.ADMM
    assert sync_mode.lower() in [Synchronization.Reduce, Synchronization.Reduce_Scatter]

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']
    n_admm_epochs = event['n_admm_epochs']
    lam = event['lambda']
    rho = event['rho']

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

    storage = S3Storage()
    communicator = S3Communicator(storage, tmp_bucket, merged_bucket, n_workers, worker_index)

    # Read file from S3
    read_start = time.time()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    shuffle_dataset = True
    random_seed = 100
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # split train set and test set
    train_set = [dataset[i] for i in train_indices]
    n_train_batch = math.floor(len(train_set) / batch_size)
    val_set = [dataset[i] for i in val_indices]
    print("preprocess data cost {} s, dataset size = {}"
          .format(time.time() - preprocess_start, dataset_size))

    model = linear_models.get_sparse_model(model_name, train_set, val_set, n_features,
                                           n_epochs, learning_rate, batch_size)

    z, u = initialize_z_and_u(model.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

    # Training the Model
    train_start = time.time()
    for admm_epoch in range(n_admm_epochs):
        print(">>> ADMM Epoch[{}]".format(admm_epoch + 1))
        admm_epoch_start = time.time()
        admm_epoch_cal_time = 0
        admm_epoch_comm_time = 0
        admm_epoch_test_time = 0

        for epoch in range(n_epochs):
            epoch_start = time.time()
            epoch_loss = 0.
            for batch_idx in range(n_train_batch):
                batch_start = time.time()
                batch_loss, batch_acc = model.one_batch()

                u_z = torch.from_numpy(u) - torch.from_numpy(z)
                new_grad = torch.add(model.weight, u_z).mul(rho)
                new_grad.mul_(-1.0 * learning_rate)
                model.weight.add_(new_grad)

                batch_loss = batch_loss.average \
                    + rho / 2.0 * torch.norm(model.weight + u_z, p=2).item()
                epoch_loss += batch_loss

                if batch_idx % 10 == 0:
                    print("ADMM Epoch: [{}/{}], Epoch: [{}/{}], Batch: [{}/{}], "
                          "time: {:.4f} s, batch cost {:.4f} s, loss: {}, accuracy: {}"
                          .format(admm_epoch + 1, n_admm_epochs, epoch + 1, n_epochs,
                                  batch_idx + 1, n_train_batch,
                                  time.time() - train_start, time.time() - batch_start,
                                  batch_loss, batch_acc))

            epoch_cal_time = time.time() - epoch_start
            admm_epoch_cal_time += epoch_cal_time

            # Test the Model
            test_start = time.time()
            test_loss, test_acc = model.evaluate()
            epoch_test_time = time.time() - test_start
            admm_epoch_test_time += epoch_test_time

            print("ADMM Epoch: [{}/{}] Epoch: [{}/{}] finishes, Batch: [{}/{}], "
                  "Time: {:.4f}, Loss: {:.4f}, epoch cost {:.4f} s, "
                  "calculation cost = {:.4f} s, test cost {:.4f} s, "
                  "accuracy of the model on the {} test samples: {}, loss = {}"
                  .format(admm_epoch + 1, n_admm_epochs, epoch + 1, n_epochs,
                          batch_idx + 1, n_train_batch, time.time() - train_start,
                          epoch_loss, time.time() - epoch_start, epoch_cal_time,
                          epoch_test_time, len(val_set), test_acc, test_loss))

        sync_start = time.time()
        w = model.weight.numpy()
        w_shape = w.shape
        b = np.array([model.bias], dtype=np.float32)
        b_shape = b.shape
        u_shape = u.shape

        w_b = np.concatenate((w.flatten(), b.flatten()))
        u_w_b = np.concatenate((u.flatten(), w_b.flatten()))

        postfix = "{}".format(admm_epoch)
        # ADMM does not support async
        if sync_mode == "reduce":
            u_w_b_merge = communicator.reduce_epoch(u_w_b, postfix)
        elif sync_mode == "reduce_scatter":
            u_w_b_merge = communicator.reduce_scatter_epoch(u_w_b, postfix)

        u_mean = u_w_b_merge[:u_shape[0] * u_shape[1]] \
            .reshape(u_shape) / float(n_workers)
        w_mean = u_w_b_merge[u_shape[0] * u_shape[1]:
                             u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]] \
            .reshape(w_shape) / float(n_workers)
        b_mean = u_w_b_merge[u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]:] \
            .reshape(b_shape[0]) / float(n_workers)
        model.weight = torch.from_numpy(w_mean)
        model.bias = torch.from_numpy(b_mean)
        admm_epoch_comm_time += time.time() - sync_start
        print("one {} round cost {} s".format(sync_mode, admm_epoch_comm_time))

        if worker_index == 0:
            delete_start = time.time()
            communicator.delete_expired_epoch(admm_epoch)
            admm_epoch_comm_time += time.time() - delete_start

        # z, u, r, s = update_z_u(w, z, u, rho, num_workers, lam)
        # stop = check_stop(ep_abs, ep_rel, r, s, dataset_size, num_features, w, z, u, rho)
        # print("stop = {}".format(stop))
        # z = num_workers * rho / (2 * lam + num_workers * rho) * (w + u_mean)
        z = update_z(w_mean, u_mean, rho, n_workers, lam)
        u = u + model.weight.data.numpy() - z

        print("ADMM Epoch[{}] finishes, cost {} s, cal cost {} s, comm cost {} s, test cost {} s"
              .format(admm_epoch, time.time() - admm_epoch_start, admm_epoch_cal_time,
                      admm_epoch_comm_time, admm_epoch_test_time))

    # Test the Model
    test_loss, test_acc = model.evaluate()
    print("Train finish, cost {} s, accuracy of the model on the {} test samples = {}, loss = {}"
          .format(time.time() - train_start, len(val_set), test_acc, test_loss))

    if worker_index == 0:
        storage.clear(tmp_bucket)
        storage.clear(merged_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
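
# --- ADMM helper sketch -------------------------------------------------------
# initialize_z_and_u and update_z are called by the ADMM handler above but are
# defined elsewhere in the project. The sketch below is only an illustration of
# what they might look like: it assumes the consensus variable z and the scaled
# dual variable u start at zero, and that the z-update follows the closed form
# kept in the comment inside the handler,
#   z = n_workers * rho / (2 * lam + n_workers * rho) * (w_mean + u_mean).
# The "_sketch" names are hypothetical; the project's real helpers may differ.
import numpy as np  # assumed already imported at the top of this module


def initialize_z_and_u_sketch(shape):
    # zero-initialize both ADMM variables with the same shape as the weights
    z = np.zeros(shape, dtype=np.float32)
    u = np.zeros(shape, dtype=np.float32)
    return z, u


def update_z_sketch(w_mean, u_mean, rho, n_workers, lam):
    # closed-form z-update for the L2-regularized consensus problem
    return (n_workers * rho) / (2.0 * lam + n_workers * rho) * (w_mean + u_mean)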
def handler(event, context):
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    assert dataset_type == "sparse_libsvm"
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']

    # ps setting
    host = event['host']
    port = event['port']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Sparse_Linear_Models
    assert optim.lower() == Optimization.Grad_Avg
    assert sync_mode.lower() == Synchronization.Reduce

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']

    print('bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('host = {}'.format(host))
    print('port = {}'.format(port))

    # Set up the Thrift connection
    # Make a socket
    transport = TSocket.TSocket(host, port)
    # Buffering is critical; raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)
    # Wrap in a protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    # Create a client to use the protocol encoder
    t_client = ParameterServer.Client(protocol)
    # Connect!
    transport.open()
    # test the thrift connection
    ps_client.ping(t_client)
    print("create and ping thrift server >>> HOST = {}, PORT = {}".format(host, port))

    # Read file from S3
    read_start = time.time()
    storage = S3Storage()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    shuffle_dataset = True
    random_seed = 100
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # split train set and test set
    train_set = [dataset[i] for i in train_indices]
    n_train_batch = math.floor(len(train_set) / batch_size)
    val_set = [dataset[i] for i in val_indices]
    print("preprocess data cost {} s, dataset size = {}"
          .format(time.time() - preprocess_start, dataset_size))

    model = linear_models.get_sparse_model(model_name, train_set, val_set, n_features,
                                           n_epochs, learning_rate, batch_size)

    # register model on the parameter server
    model_name = "w.b"
    weight_length = n_features
    bias_length = 1
    model_length = weight_length + bias_length
    ps_client.register_model(t_client, worker_index, model_name, model_length, n_workers)
    ps_client.exist_model(t_client, model_name)
    print("register and check model >>> name = {}, length = {}".format(model_name, model_length))

    # Training the Model
    train_start = time.time()
    iter_counter = 0
    for epoch in range(n_epochs):
        epoch_start = time.time()
        epoch_cal_time = 0
        epoch_comm_time = 0
        epoch_loss = 0.
        for batch_idx in range(n_train_batch):
            batch_start = time.time()
            batch_comm_time = 0

            # pull the latest model
            ps_client.can_pull(t_client, model_name, iter_counter, worker_index)
            latest_model = ps_client.pull_model(t_client, model_name, iter_counter, worker_index)
            model.weight = torch.from_numpy(
                np.asarray(latest_model[:weight_length]).astype(np.float32).reshape(n_features, 1))
            model.bias = float(latest_model[-1])
            batch_comm_time += time.time() - batch_start

            batch_loss, batch_acc = model.one_batch()
            epoch_loss += batch_loss.average

            w_b = np.concatenate((model.weight.double().numpy().flatten(),
                                  np.array([model.bias]).astype(np.double)))
            w_b_update = np.subtract(w_b, latest_model)
            batch_cal_time = time.time() - batch_start

            # push gradient to the parameter server
            batch_comm_start = time.time()
            ps_client.can_push(t_client, model_name, iter_counter, worker_index)
            ps_client.push_grad(t_client, model_name, w_b_update,
                                1.0 / n_workers, iter_counter, worker_index)
            ps_client.can_pull(t_client, model_name, iter_counter + 1, worker_index)  # sync all workers
            batch_comm_time += time.time() - batch_comm_start

            epoch_cal_time += batch_cal_time
            epoch_comm_time += batch_comm_time

            if batch_idx % 10 == 0:
                print('Epoch: [%d/%d], Batch: [%d/%d], Time: %.4f, Loss: %.4f, Accuracy: %.4f, '
                      'batch cost %.4f s: cal cost %.4f s and communication cost %.4f s'
                      % (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                         time.time() - train_start, batch_loss.average, batch_acc.accuracy,
                         time.time() - batch_start, batch_cal_time, batch_comm_time))

            iter_counter += 1

        # Test the Model
        test_start = time.time()
        test_loss, test_acc = model.evaluate()
        test_time = time.time() - test_start

        print("Epoch: [{}/{}] finishes, Batch: [{}/{}], Time: {:.4f}, Loss: {:.4f}, "
              "epoch cost {:.4f} s, calculation cost = {:.4f} s, "
              "synchronization cost {:.4f} s, test cost {:.4f} s, "
              "accuracy of the model on the {} test samples: {}, loss = {}"
              .format(epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                      time.time() - train_start, epoch_loss, time.time() - epoch_start,
                      epoch_cal_time, epoch_comm_time, test_time,
                      len(val_set), test_acc.accuracy, test_loss.average))

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
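
# --- Example invocation (sketch) ---------------------------------------------
# A minimal sample event for the parameter-server handler above, assuming it is
# deployed as an AWS Lambda entry point and that a Thrift parameter server is
# already running at the given host/port. The keys mirror the fields the handler
# reads; every value (bucket, shard, address, model name) is hypothetical.
sample_event_ps = {
    "file": "part_0_of_10",           # hypothetical data shard in data_bucket
    "data_bucket": "my-libsvm-data",  # hypothetical S3 bucket with the dataset
    "dataset_type": "sparse_libsvm",
    "n_features": 123,
    "n_classes": 2,
    "n_workers": 10,
    "worker_index": 0,
    "host": "10.0.0.1",               # hypothetical parameter-server address
    "port": 27000,                    # hypothetical parameter-server port
    "model": "sparse_lr",             # must be in MLModel.Sparse_Linear_Models
    "optim": "grad_avg",              # the only optimizer this handler accepts
    "sync_mode": "reduce",            # the only sync mode this handler accepts
    "lr": 0.01,
    "batch_size": 100,
    "n_epochs": 10,
    "valid_ratio": 0.2,
}
# handler(sample_event_ps, None)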