def partition_yfcc100m(file_list, n_features, pos_tag, batch_size, validation_ratio):
    parse_start = time.time()
    f = open(file_list[0]).readlines()
    dataset = DenseLibsvmDataset(f, n_features, pos_tag)
    if len(file_list) > 1:
        for file_name in file_list[1:]:
            f = open(file_name).readlines()
            dataset.add_more(f)
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    random_seed = 42
    np.random.seed(random_seed)
    np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    return train_loader, test_loader
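# Usage sketch for partition_yfcc100m (illustrative only): the file paths and
# hyperparameter values below are assumptions, not values taken from this repo.
def _example_partition_yfcc100m():
    file_list = ["/tmp/yfcc100m/part_0.libsvm", "/tmp/yfcc100m/part_1.libsvm"]  # hypothetical paths
    train_loader, test_loader = partition_yfcc100m(
        file_list, n_features=4096, pos_tag=1, batch_size=100, validation_ratio=0.1)
    # Inspect one training batch to confirm the loaders were built correctly.
    items, labels = next(iter(train_loader))
    print(items.size(), labels.size())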
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file'].split(",")
    merged_bucket = event['merged_bucket']
    num_classes = event['num_classes']
    num_features = event['num_features']
    pos_tag = event['pos_tag']
    num_epochs = event['num_epochs']
    num_admm_epochs = event['num_admm_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']
    lam = event['lambda']
    rho = event['rho']
    elasti_location = event['elasticache']
    endpoint = memcached_init(elasti_location)

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('merge bucket = {}'.format(merged_bucket))
    print('num epochs = {}'.format(num_epochs))
    print('num admm epochs = {}'.format(num_admm_epochs))
    print('num classes = {}'.format(num_classes))
    print('num features = {}'.format(num_features))
    print('positive tag = {}'.format(pos_tag))
    print('learning rate = {}'.format(learning_rate))
    print("batch_size = {}".format(batch_size))
    print("lambda = {}".format(lam))
    print("rho = {}".format(rho))

    # read file from s3
    file = get_object(bucket, key[0]).read().decode('utf-8').split("\n")
    dataset = DenseLibsvmDataset(file, num_features, pos_tag)
    if len(key) > 1:
        for more_key in key[1:]:
            file = get_object(bucket, more_key).read().decode('utf-8').split("\n")
            dataset.add_more(file)
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    # validation_ratio, shuffle_dataset and random_seed are expected to be module-level constants.
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}"
          .format(time.time() - preprocess_start, dataset_size))

    model = LogisticRegression(num_features, num_classes).double()
    print("size of w = {}".format(model.linear.weight.data.size()))
    z, u = initialize_z_and_u(model.linear.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    train_start = time.time()
    stop = False
    for admm_epoch in range(num_admm_epochs):
        print("ADMM Epoch >>> {}".format(admm_epoch))
        admm_epoch_start = time.time()
        for epoch in range(num_epochs):
            epoch_start = time.time()
            epoch_loss = 0
            for batch_index, (items, labels) in enumerate(train_loader):
                # print("------worker {} epoch {} batch {}------".format(worker_index, epoch, batch_index))
                batch_start = time.time()
                items = Variable(items.view(-1, num_features))
                labels = Variable(labels)

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                outputs = model(items.double())
                classify_loss = criterion(outputs, labels)
                epoch_loss += classify_loss.data
                u_z = torch.from_numpy(u).double() - torch.from_numpy(z).double()
                loss = classify_loss
                for name, param in model.named_parameters():
                    if name.split('.')[-1] == "weight":
                        # add the augmented-Lagrangian penalty on the weight
                        loss += rho / 2.0 * torch.norm(param + u_z, p=2)
                # loss = classify_loss + rho / 2.0 * torch.norm(torch.sum(model.linear.weight, u_z))
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                optimizer.step()

            # Test the Model
            test_start = time.time()
            correct = 0
            total = 0
            test_loss = 0
            for items, labels in validation_loader:
                items = Variable(items.view(-1, num_features))
                labels = Variable(labels)
                outputs = model(items.double())
                test_loss += criterion(outputs, labels).data
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()
            test_time = time.time() - test_start

            print('ADMM Epoch: [%d/%d], Epoch: [%d/%d], Batch [%d], '
                  'Time: %.4f, Loss: %.4f, epoch cost %.4f, test cost %.4f s, '
                  'accuracy of the model on the %d test samples: %d %%, loss = %f'
                  % (admm_epoch, num_admm_epochs, epoch, num_epochs, batch_index,
                     time.time() - train_start, epoch_loss.data, time.time() - epoch_start,
                     test_time, len(val_indices), 100 * correct / total, test_loss / total))

        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        u_shape = u.shape

        w_and_b = np.concatenate((w.flatten(), b.flatten()))
        u_w_b = np.concatenate((u.flatten(), w_and_b.flatten()))
        cal_time = time.time() - admm_epoch_start

        sync_start = time.time()
        postfix = "{}".format(admm_epoch)
        u_w_b_merge = reduce_epoch(endpoint, u_w_b, merged_bucket, num_workers, worker_index, postfix)

        u_mean = u_w_b_merge[:u_shape[0] * u_shape[1]].reshape(u_shape) / float(num_workers)
        w_mean = u_w_b_merge[u_shape[0] * u_shape[1]:
                             u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]].reshape(w_shape) / float(num_workers)
        b_mean = u_w_b_merge[u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]:].reshape(b_shape[0]) / float(num_workers)
        # model.linear.weight.data = torch.from_numpy(w)
        model.linear.bias.data = torch.from_numpy(b_mean)
        sync_time = time.time() - sync_start

        # z, u, r, s = update_z_u(w, z, u, rho, num_workers, lam)
        # stop = check_stop(ep_abs, ep_rel, r, s, dataset_size, num_features, w, z, u, rho)
        # print("stop = {}".format(stop))
        # z = num_workers * rho / (2 * lam + num_workers * rho) * (w + u_mean)
        z = update_z(w_mean, u_mean, rho, num_workers, lam)
        # print(z)
        u = u + model.linear.weight.data.numpy() - z
        # print(u)

        # Test the Model
        test_start = time.time()
        correct = 0
        total = 0
        test_loss = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)
            outputs = model(items.double())
            test_loss += criterion(outputs, labels).data
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
        test_time = time.time() - test_start

        print('ADMM Epoch: [%d/%d], Time: %.4f, Loss: %.4f, '
              'ADMM epoch cost %.4f: computation cost %.4f s communication cost %.4f s test cost %.4f s, '
              'accuracy of the model on the %d test samples: %d %%, loss = %f'
              % (admm_epoch, num_admm_epochs, time.time() - train_start, epoch_loss.data,
                 time.time() - admm_epoch_start, cal_time, sync_time, test_time,
                 len(val_indices), 100 * correct / total, test_loss / total))

    # Test the Model
    correct = 0
    total = 0
    test_loss = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, num_features))
        labels = Variable(labels)
        outputs = model(items.double())
        test_loss += criterion(outputs, labels).data
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    print('Time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
          % (time.time() - train_start, len(val_indices), 100 * correct / total, test_loss / total))

    if worker_index == 0:
        clear_bucket(endpoint)
    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
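# A sample invocation event for the ADMM handler above. The keys mirror the
# fields the handler reads from `event`; every value is a hypothetical
# placeholder (bucket names, file keys, ElastiCache endpoint, hyperparameters).
EXAMPLE_ADMM_EVENT = {
    "bucket_name": "yfcc100m-train",            # hypothetical S3 bucket with libsvm files
    "rank": 0,                                  # index of this worker
    "num_workers": 10,
    "file": "part_0.libsvm,part_1.libsvm",      # comma-separated S3 keys for this worker
    "merged_bucket": "yfcc100m-merged",         # hypothetical bucket/namespace for reduced tensors
    "num_classes": 2,
    "num_features": 4096,
    "pos_tag": 1,
    "num_epochs": 2,                            # SGD epochs per ADMM round
    "num_admm_epochs": 10,                      # ADMM rounds
    "learning_rate": 0.01,
    "batch_size": 100,
    "lambda": 0.01,                             # regularization strength used in update_z
    "rho": 0.01,                                # ADMM penalty parameter
    "elasticache": "cluster.xxxxxx.cfg.use1.cache.amazonaws.com:11211",  # hypothetical endpoint
}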
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file'].split(",")
    num_classes = event['num_classes']
    num_features = event['num_features']
    pos_tag = event['pos_tag']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']
    host = event['host']
    port = event['port']

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('num epochs = {}'.format(num_epochs))
    print('num classes = {}'.format(num_classes))
    print('num features = {}'.format(num_features))
    print('positive tag = {}'.format(pos_tag))
    print('learning rate = {}'.format(learning_rate))
    print("batch_size = {}".format(batch_size))
    print("host = {}".format(host))
    print("port = {}".format(port))

    # Set thrift connection
    # Make socket
    transport = TSocket.TSocket(host, port)
    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)
    # Wrap in a protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    # Create a client to use the protocol encoder
    t_client = ParameterServer.Client(protocol)
    # Connect!
    transport.open()

    # test thrift connection
    ps_client.ping(t_client)
    print("create and ping thrift server >>> HOST = {}, PORT = {}".format(host, port))

    # read file from s3
    file = get_object(bucket, key[0]).read().decode('utf-8').split("\n")
    dataset = DenseLibsvmDataset(file, num_features, pos_tag)
    if len(key) > 1:
        for more_key in key[1:]:
            file = get_object(bucket, more_key).read().decode('utf-8').split("\n")
            dataset.add_more(file)
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    # The UPPERCASE names (VALIDATION_RATIO, SHUFFLE_DATASET, RANDOM_SEED, NUM_FEATURES,
    # NUM_CLASSES, LEARNING_RATE, NUM_EPOCHS, BATCH_SIZE) are expected to be module-level
    # constants; the event values above are only parsed and logged.
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(VALIDATION_RATIO * dataset_size))
    if SHUFFLE_DATASET:
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = SVM(NUM_FEATURES, NUM_CLASSES)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

    # register model
    model_name = "w.b"
    weight_shape = model.linear.weight.data.numpy().shape
    weight_length = weight_shape[0] * weight_shape[1]
    bias_shape = model.linear.bias.data.numpy().shape
    bias_length = bias_shape[0]
    model_length = weight_length + bias_length
    ps_client.register_model(t_client, worker_index, model_name, model_length, num_workers)
    ps_client.exist_model(t_client, model_name)
    print("register and check model >>> name = {}, length = {}".format(model_name, model_length))

    # Training the Model
    train_start = time.time()
    iter_counter = 0
    for epoch in range(NUM_EPOCHS):
        epoch_start = time.time()
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(worker_index, epoch, batch_index))
            batch_start = time.time()

            # pull latest model
            ps_client.can_pull(t_client, model_name, iter_counter, worker_index)
            latest_model = ps_client.pull_model(t_client, model_name, iter_counter, worker_index)
            model.linear.weight = Parameter(
                torch.from_numpy(
                    np.asarray(latest_model[:weight_length], dtype=np.double).reshape(weight_shape)))
            model.linear.bias = Parameter(
                torch.from_numpy(
                    np.asarray(latest_model[weight_length:], dtype=np.double).reshape(bias_shape[0])))

            items = Variable(items.view(-1, NUM_FEATURES))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items.double())
            loss = criterion(outputs, labels)
            loss.backward()

            # flatten and concat gradients of weight and bias
            w_b_grad = np.concatenate(
                (model.linear.weight.grad.data.numpy().flatten(),
                 model.linear.bias.grad.data.numpy().flatten()))
            cal_time = time.time() - batch_start

            # push gradient to PS
            sync_start = time.time()
            ps_client.can_push(t_client, model_name, iter_counter, worker_index)
            ps_client.push_grad(t_client, model_name, w_b_grad, LEARNING_RATE, iter_counter, worker_index)
            ps_client.can_pull(t_client, model_name, iter_counter + 1, worker_index)  # sync all workers
            sync_time = time.time() - sync_start

            print('Epoch: [%d/%d], Step: [%d/%d] >>> Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                  'batch cost %.4f s: cal cost %.4f s and communication cost %.4f s'
                  % (epoch + 1, NUM_EPOCHS, batch_index + 1, len(train_indices) / BATCH_SIZE,
                     time.time() - train_start, loss.data, time.time() - epoch_start,
                     time.time() - batch_start, cal_time, sync_time))
            iter_counter += 1

    # Test the Model
    correct = 0
    total = 0
    test_loss = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, NUM_FEATURES))
        labels = Variable(labels)
        outputs = model(items.double())  # cast to double to match the pulled parameters
        test_loss += criterion(outputs, labels).data
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    print('Time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
          % (time.time() - train_start, len(val_indices), 100 * correct / total, test_loss))

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
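# A sample invocation event for the parameter-server handler above. The keys match
# the fields read from `event`; the values (bucket, file keys, and the host/port of
# the Thrift parameter server) are hypothetical placeholders.
EXAMPLE_PS_EVENT = {
    "bucket_name": "yfcc100m-train",        # hypothetical S3 bucket with libsvm files
    "rank": 0,
    "num_workers": 10,
    "file": "part_0.libsvm,part_1.libsvm",  # comma-separated S3 keys for this worker
    "num_classes": 2,
    "num_features": 4096,
    "pos_tag": 1,
    "num_epochs": 10,
    "learning_rate": 0.01,
    "batch_size": 100,
    "host": "10.0.0.1",                     # hypothetical parameter-server host
    "port": 27000,                          # hypothetical Thrift port
}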
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file'].split(",")
    merged_bucket = event['merged_bucket']
    num_classes = event['num_classes']
    num_features = event['num_features']
    pos_tag = event['pos_tag']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']
    elasti_location = event['elasticache']
    endpoint = memcached_init(elasti_location)

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('merge bucket = {}'.format(merged_bucket))
    print('num epochs = {}'.format(num_epochs))
    print('num classes = {}'.format(num_classes))
    print('num features = {}'.format(num_features))
    print('positive tag = {}'.format(pos_tag))
    print('learning rate = {}'.format(learning_rate))
    print("batch_size = {}".format(batch_size))

    # read file from s3
    file = get_object(bucket, key[0]).read().decode('utf-8').split("\n")
    dataset = DenseLibsvmDataset(file, num_features, pos_tag)
    if len(key) > 1:
        for more_key in key[1:]:
            file = get_object(bucket, more_key).read().decode('utf-8').split("\n")
            dataset.add_more(file)
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    # validation_ratio, shuffle_dataset and random_seed are expected to be module-level constants.
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    train_start = time.time()
    for epoch in range(num_epochs):
        epoch_start = time.time()
        epoch_loss = 0
        for batch_index, (items, labels) in enumerate(train_loader):
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            epoch_loss += loss.data
            loss.backward()
            optimizer.step()

        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        w_and_b = np.concatenate((w.flatten(), b.flatten()))
        cal_time = time.time() - epoch_start

        sync_start = time.time()
        postfix = str(epoch)
        u_w_b_merge = reduce_epoch(endpoint, w_and_b, merged_bucket, num_workers, worker_index, postfix)
        w_mean = u_w_b_merge[:w_shape[0] * w_shape[1]].reshape(w_shape) / float(num_workers)
        b_mean = u_w_b_merge[w_shape[0] * w_shape[1]:].reshape(b_shape[0]) / float(num_workers)
        model.linear.weight.data = torch.from_numpy(w_mean)
        model.linear.bias.data = torch.from_numpy(b_mean)
        sync_time = time.time() - sync_start

        # Test the Model
        test_start = time.time()
        correct = 0
        total = 0
        test_loss = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)
            outputs = model(items)
            test_loss += criterion(outputs, labels).data
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
        test_time = time.time() - test_start

        print('Epoch: [%d/%d] has %d batches, Time: %.4f, Loss: %.4f, '
              'epoch cost %.4f: computation cost %.4f s communication cost %.4f s test cost %.4f s, '
              'accuracy of the model on the %d test samples: %d %%, loss = %f'
              % (epoch + 1, num_epochs, batch_index, time.time() - train_start, epoch_loss.data,
                 time.time() - epoch_start, cal_time, sync_time, test_time,
                 len(val_indices), 100 * correct / total, test_loss / total))

    # Test the Model
    correct = 0
    total = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, num_features))
        # items = Variable(items)
        outputs = model(items)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    print('Accuracy of the model on the %d test samples: %d %%'
          % (len(val_indices), 100 * correct / total))

    if worker_index == 0:
        clear_bucket(endpoint)
    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
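# A minimal, self-contained sketch of the flatten/average/unflatten step used by the
# handler above. It assumes reduce_epoch returns the element-wise sum of the workers'
# flattened [weight | bias] vectors (the handler divides the merged result by
# num_workers, which suggests a sum); here the reduction is simulated locally.
def _example_weight_bias_merge():
    import numpy as np
    num_workers = 2
    w_shape, b_shape = (2, 4), (2,)                        # hypothetical model sizes
    workers = [np.concatenate((np.random.rand(*w_shape).flatten(),
                               np.random.rand(*b_shape).flatten()))
               for _ in range(num_workers)]
    merged = np.sum(workers, axis=0)                       # stand-in for reduce_epoch's output
    w_mean = merged[:w_shape[0] * w_shape[1]].reshape(w_shape) / float(num_workers)
    b_mean = merged[w_shape[0] * w_shape[1]:].reshape(b_shape[0]) / float(num_workers)
    print(w_mean.shape, b_mean.shape)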
def run(args):
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    torch.manual_seed(1234)
    logging.info(f"{args.rank}-th worker starts.")

    read_start = time.time()
    f_id_start = args.rank * args.num_files
    f_id_end = f_id_start + args.num_files
    f_path_list = ["{}/{}".format(args.root, i) for i in range(f_id_start, f_id_end)]
    f = open(f_path_list[0]).readlines()
    dataset = DenseLibsvmDataset(f, args.features, args.pos_tag)
    if len(f_path_list) > 1:
        for file_name in f_path_list[1:]:
            f = open(file_name).readlines()
            dataset.add_more(f)
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))

    train_set = np.array(dataset.ins_list)
    dt = train_set.dtype
    centroid_shape = (args.num_clusters, train_set.shape[1])
    logging.info(f"Loading dataset costs {time.time() - read_start}s")
    logging.info(f"centroid shape: {centroid_shape}")

    # initialize centroids
    init_cent_start = time.time()
    if args.rank == 0:
        centroids = torch.tensor(train_set[0:args.num_clusters])
    else:
        centroids = torch.empty(args.num_clusters, args.features)
    if dist_is_initialized():
        dist.broadcast(centroids, 0)
    logging.info(f"Receiving initial centroids costs {time.time() - init_cent_start}s")

    training_start = time.time()
    avg_error = np.iinfo(np.int16).max
    for epoch in range(args.epochs):
        if avg_error >= args.threshold:
            start_compute = time.time()
            model = Kmeans(train_set, centroids, avg_error, centroid_type='tensor')
            model.find_nearest_cluster()
            end_compute = time.time()
            # logging.info(f"{args.rank}-th worker computing centroids takes {end_compute - start_compute}s")
            sync_start = time.time()
            if dist_is_initialized():
                centroids, avg_error = broadcast_average(
                    args, model.get_centroids("dense_tensor"), torch.tensor(model.error))
            logging.info(f"{args.rank}-th worker finished {epoch} epoch. "
                         f"Computing takes {end_compute - start_compute}s. "
                         f"Communicating takes {time.time() - sync_start}s. "
                         # f"Centroids: {model.get_centroids('dense_tensor')}. "
                         f"Loss: {model.error}")
        else:
            logging.info(
                f"{args.rank}-th worker finished training. Error = {avg_error}, centroids = {centroids}")
            logging.info(f"Whole process time : {time.time() - training_start}")
            return
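# A hedged sketch of the argument parser that run() appears to expect, based on the
# attributes it reads (rank, num_files, root, features, pos_tag, num_clusters, epochs,
# threshold, no_cuda). The flag names and default values below are illustrative
# assumptions, not taken from this repo.
def _example_args():
    import argparse
    parser = argparse.ArgumentParser(description="distributed k-means on YFCC100M features")
    parser.add_argument("--rank", type=int, default=0)                # index of this worker
    parser.add_argument("--num-files", type=int, default=2)           # data files per worker
    parser.add_argument("--root", type=str, default="/tmp/yfcc100m")  # hypothetical data directory
    parser.add_argument("--features", type=int, default=4096)
    parser.add_argument("--pos-tag", type=int, default=1)
    parser.add_argument("--num-clusters", type=int, default=10)
    parser.add_argument("--epochs", type=int, default=20)
    parser.add_argument("--threshold", type=float, default=0.02)      # stop once avg_error drops below this
    parser.add_argument("--no-cuda", action="store_true")
    return parser.parse_args()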