def compute_average_centroids(avg_cent_bucket, worker_cent_bucket, num_workers,
                              shape, epoch, dt):
    """Poll until every worker has uploaded its centroids, then publish the average.

    Repeatedly lists `worker_cent_bucket` until it contains `num_workers`
    objects.  Each object encodes a flattened centroid array of dtype `dt`
    followed by one trailing error scalar.  The per-worker centroids are
    averaged via `avg_centroids`, the worker bucket is cleared, and the
    result (flattened average centroids with the mean error appended, cast
    to `dt`) is stored in `avg_cent_bucket` under key ``avg-<epoch>``.

    Returns 1 on completion.
    """
    collected = 0
    centroid_arrays = []
    errors = []
    # Busy-wait: re-list the bucket from scratch until all workers reported.
    while collected < num_workers:
        collected = 0
        centroid_arrays = []
        errors = []
        bucket_objects = list_bucket_objects(worker_cent_bucket)
        if bucket_objects is None:
            print('No objects in {}'.format(worker_cent_bucket))
            continue
        for entry in bucket_objects:
            obj_key = urllib.parse.unquote_plus(entry["Key"], encoding='utf-8')
            payload = get_object(worker_cent_bucket, obj_key).read()
            decoded = np.frombuffer(payload, dtype=dt)
            # Layout: everything but the last value is centroid data,
            # the last value is the worker's error.
            centroid_arrays.append(decoded[0:-1].reshape(shape))
            errors.append(decoded[-1])
            collected += 1

    avg = avg_centroids(centroid_arrays)
    avg_error = np.mean(np.array(errors))

    clear_bucket(worker_cent_bucket)
    print(f"Average error for {epoch}-th epoch: {avg_error}")
    merged = np.append(avg.reshape(-1), avg_error).astype(dt)
    put_object(avg_cent_bucket, f"avg-{epoch}", merged.tobytes())
    return 1
def put_merged_w_b(bucket_name, w, b, file_postfix, w_prefix="w_", b_prefix="b_"):
    """Store merged weight and bias arrays in `bucket_name`.

    Keys are built as ``<prefix><file_postfix>``; arrays are stored as raw bytes.
    """
    weight_key = w_prefix + file_postfix
    print('put merged weight {} to bucket {}'.format(weight_key, bucket_name))
    put_object(bucket_name, weight_key, w.tobytes())

    bias_key = b_prefix + file_postfix
    print('put merged bias {} to bucket {}'.format(bias_key, bucket_name))
    put_object(bucket_name, bias_key, b.tobytes())
def put_merged_w_b_grad(bucket_name, w_grad, b_grad, file_postfix,
                        w_grad_prefix="w_grad_", b_grad_prefix="b_grad_"):
    """Store merged weight/bias gradient arrays in `bucket_name`.

    Keys are built as ``<prefix><file_postfix>``; arrays are stored as raw bytes.

    BUG FIX: the default `b_grad_prefix` was "b_grad" (no trailing underscore),
    inconsistent with `w_grad_prefix="w_grad_"` — it produced keys like
    "b_grad0_0" instead of "b_grad_0_0".  All visible callers pass both
    prefixes explicitly, so the default was dead/buggy; normalized here.
    """
    print('put merged weight grad {} to bucket {}'.format(
        w_grad_prefix + file_postfix, bucket_name))
    put_object(bucket_name, w_grad_prefix + file_postfix, w_grad.tobytes())
    print('put merged bias grad {} to bucket {}'.format(
        b_grad_prefix + file_postfix, bucket_name))
    put_object(bucket_name, b_grad_prefix + file_postfix, b_grad.tobytes())
def reduce_batch(vector, tmp_bucket, merged_bucket, num_workers, worker_index, postfix):
    """Sum a 1-d numpy vector across workers through two S3 buckets.

    Every worker uploads its vector to `tmp_bucket` under key
    ``<worker_index>_<epoch>_<batch>`` (postfix is "epoch_batch").  Worker 0
    polls until all `num_workers` vectors for the current epoch/batch have
    arrived, sums them, deletes the consumed objects, and publishes the sum
    to `merged_bucket` under ``merged_<postfix>``.  Every other worker just
    waits for that merged object and returns it.

    Returns the merged (summed) vector with the same shape/dtype as `vector`.

    NOTE(review): worker 0's loop re-lists the bucket until enough matching
    files were seen; consumed keys are deleted each pass so nothing is
    double-counted.
    """
    # vector is supposed to be a 1-d numpy array
    vec_shape = vector.shape
    vec_dtype = vector.dtype
    merged_vec = np.zeros(vec_shape, dtype=vec_dtype)

    postfix_splits = postfix.split("_")
    curr_epoch = int(postfix_splits[0])
    curr_batch = int(postfix_splits[1])

    # put object to s3, format of key: workerID_epoch_batch
    key = "{}_{}".format(worker_index, postfix)
    put_object(tmp_bucket, key, vector.tobytes())

    # the first worker reads and aggregates all chunks (including its own)
    if worker_index == 0:
        num_files = 0
        while num_files < num_workers:
            objects = list_bucket_objects(tmp_bucket)
            if objects is not None:
                delete_list = []
                for obj in objects:
                    file_key = urllib.parse.unquote_plus(obj["Key"],
                                                         encoding='utf-8')
                    key_splits = file_key.split("_")
                    key_epoch = key_splits[1]
                    key_batch = key_splits[2]
                    # only consume files belonging to the current epoch/batch
                    if key_epoch == str(curr_epoch) and key_batch == str(
                            curr_batch):
                        data = get_object(tmp_bucket, file_key).read()
                        bytes_data = np.frombuffer(data, dtype=vec_dtype)
                        tmp_vec = bytes_data.reshape(vec_shape)
                        merged_vec += tmp_vec
                        num_files += 1
                        delete_list.append(file_key)
                # batch-delete the files consumed in this listing pass
                delete_objects(tmp_bucket, delete_list)
        # write the merged data back to s3
        merged_file_name = 'merged_' + postfix
        put_object(merged_bucket, merged_file_name, merged_vec.tobytes())
        # drop merged results from earlier steps that are no longer needed
        delete_expired_merged_batch(merged_bucket, curr_epoch, curr_batch)
    else:
        merged_file_name = 'merged_' + postfix
        # block (polling every 0.1s) until worker 0 has published the merge
        merged_data = get_object_or_wait(merged_bucket, merged_file_name,
                                         0.1).read()
        merged_vec = np.frombuffer(merged_data,
                                   dtype=vec_dtype).reshape(vec_shape)
    return merged_vec
def handler(event, context):
    """Demo Lambda: upload several random weight arrays, merge them, load into torch.

    Triggered by an S3 event (bucket/key are logged but only `tmp_bucket`
    is written to).  Writes 5 random float32 (2, 3) arrays under keys
    ``weight_<i>``, merges them with `merge_np_bytes`, and converts the
    result to a torch tensor.
    """
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    key = urllib.parse.unquote_plus(s3_record['object']['key'], encoding='utf-8')
    print('bucket = {}'.format(bucket))
    print('key = {}'.format(key))

    num_files = 5
    for i in np.arange(num_files):
        w = np.random.rand(2, 3).astype(np.float32)
        print("the {}-th numpy array".format(i))
        print(w)
        put_object(tmp_bucket, "weight_" + str(i), w.tobytes())

    arr = merge_np_bytes(tmp_bucket, num_files, np.float32, [2, 3])
    t = torch.from_numpy(arr)
    print(t)
def handler(event, context):
    """Lambda worker: asynchronous logistic-regression training via memcached.

    Each worker trains on its own S3 shard (key encodes the worker index as
    the first ``_``-separated field) and exchanges whole model weights
    asynchronously through a memcached endpoint: the very first batch seeds
    the shared model; every later batch reads the latest shared weights,
    then overwrites them with its own.  Loss/accuracy history is pickled to
    the ``async-model-loss`` bucket.

    BUG FIX: `np.fromstring` (deprecated since NumPy 1.14, removed in 2.0)
    replaced with the equivalent `np.frombuffer`.  Also removed the unused
    local `file_postfix`.

    NOTE(review): relies on module-level config (random_seed,
    validation_ratio, shuffle_dataset, num_epochs, learning_rate,
    model_bucket, w_prefix, b_prefix) and project helpers
    (memcached_init, get_object, hset_object, hget_object, put_object,
    DenseDatasetWithLines, LogisticRegression, test).
    """
    start_time = time.time()
    bucket = event['bucket']
    key = event['name']
    num_features = event['num_features']
    num_classes = event['num_classes']
    elasti_location = event['elasticache']
    endpoint = memcached_init(elasti_location)
    print('bucket = {}'.format(bucket))
    print('key = {}'.format(key))
    key_splits = key.split("_")
    worker_index = int(key_splits[0])
    num_worker = event['num_files']
    # split a nominal global batch of 100000 evenly across workers
    batch_size = 100000
    batch_size = int(np.ceil(batch_size / num_worker))
    torch.manual_seed(random_seed)

    # read file(dataset) from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))
    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, num_features)
    preprocess_start = time.time()
    # NOTE(review): this prints a negative duration (parse_start is taken
    # *before* parsing finishes); kept as-is to preserve behavior.
    print("libsvm operation cost {}s".format(parse_start - preprocess_start))

    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    print("dataset size = {}".format(dataset_size))
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)
    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer.  Softmax is internally computed by CrossEntropyLoss.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_loss = []
    test_loss = []
    test_acc = []
    total_time = 0

    # Training the Model
    epoch_start = time.time()
    for epoch in range(num_epochs):
        tmp_train = 0
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(
                worker_index, epoch, batch_index))
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            w = model.linear.weight.data.numpy()
            b = model.linear.bias.data.numpy()
            # asynchronous exchange: seed the shared model on the very first
            # batch of the first epoch, then read-then-write on every other
            if batch_index == 0 and epoch == 0:
                hset_object(endpoint, model_bucket, w_prefix, w.tobytes())
                hset_object(endpoint, model_bucket, b_prefix, b.tobytes())
                time.sleep(0.0001)
                # randomly get one from others (asynchronized)
                w_new = np.frombuffer(hget_object(endpoint, model_bucket, w_prefix),
                                      dtype=w.dtype).reshape(w.shape)
                b_new = np.frombuffer(hget_object(endpoint, model_bucket, b_prefix),
                                      dtype=b.dtype).reshape(b.shape)
            else:
                w_new = np.frombuffer(hget_object(endpoint, model_bucket, w_prefix),
                                      dtype=w.dtype).reshape(w.shape)
                b_new = np.frombuffer(hget_object(endpoint, model_bucket, b_prefix),
                                      dtype=b.dtype).reshape(b.shape)
                hset_object(endpoint, model_bucket, w_prefix, w.tobytes())
                hset_object(endpoint, model_bucket, b_prefix, b.tobytes())
            model.linear.weight.data = torch.from_numpy(w_new)
            model.linear.bias.data = torch.from_numpy(b_new)

            # report train loss for every mini batch
            if (batch_index + 1) % 1 == 0:
                print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f'
                      % (epoch + 1, num_epochs, batch_index + 1,
                         len(train_indices) / batch_size, loss.data))
            tmp_train += loss.item()

        total_time += time.time() - epoch_start
        train_loss.append(tmp_train)
        tmp_test, tmp_acc = test(model, validation_loader, criterion)
        test_loss.append(tmp_test)
        test_acc.append(tmp_acc)
        epoch_start = time.time()

    print("total time = {}".format(total_time))
    end_time = time.time()
    print("elapsed time = {} s".format(end_time - start_time))
    loss_record = [test_loss, test_acc, train_loss, total_time]
    put_object("async-model-loss", "async-loss{}".format(worker_index),
               pickle.dumps(loss_record))
def handler(event, context):
    """Lambda worker: synchronous logistic-regression training via Redis.

    Each worker computes per-batch gradients, publishes them under
    ``<prefix><worker_index>`` in `grad_bucket`; worker 0 merges (averages)
    all gradients and publishes the merged result, which every worker then
    applies before `optimizer.step()`.  After each epoch the model is
    evaluated on the validation split.  Finally the loss record is pickled
    to the ``grad-average-loss`` bucket.

    BUG FIX: the loss record was serialized with ``bytes(loss_record)``,
    which raises TypeError for a list of lists/floats; use
    ``pickle.dumps`` like the sibling handlers.

    NOTE(review): relies on module-level config (random_seed,
    validation_ratio, shuffle_dataset, num_epochs, learning_rate,
    grad_bucket, model_bucket, w_grad_prefix, b_grad_prefix) and project
    helpers (redis_init, get_object, hset_object, merge_w_b_grads,
    put_merged_w_b_grads, get_merged_w_b_grads, put_object, ...).
    """
    import pickle  # local import: only needed to serialize the loss record

    start_time = time.time()
    bucket = event['bucket']
    key = event['name']
    num_features = event['num_features']
    num_classes = event['num_classes']
    redis_location = event['elasticache']
    endpoint = redis_init(redis_location)
    print('bucket = {}'.format(bucket))
    print('key = {}'.format(key))
    num_worker = event['num_files']
    worker_index = event['worker_index']
    # split a nominal global batch of 100000 evenly across workers
    batch_size = 100000
    batch_size = int(np.ceil(batch_size / num_worker))
    torch.manual_seed(random_seed)

    # read file(dataset) from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))
    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, num_features)
    preprocess_start = time.time()
    # NOTE(review): prints a negative duration; kept as-is (see sibling handler).
    print("libsvm operation cost {}s".format(parse_start - preprocess_start))

    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    print("dataset size = {}".format(dataset_size))
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)
    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer.  Softmax is internally computed by CrossEntropyLoss.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_loss = []
    test_loss = []
    test_acc = []
    epoch_time = 0
    epoch_start = time.time()

    # Training the Model
    for epoch in range(num_epochs):
        tmp_train = 0
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(
                worker_index, epoch, batch_index))
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()

            w_grad = model.linear.weight.grad.data.numpy()
            b_grad = model.linear.bias.grad.data.numpy()

            # synchronization starts: every worker publishes its gradients
            sync_start = time.time()
            hset_object(endpoint, grad_bucket,
                        w_grad_prefix + str(worker_index), w_grad.tobytes())
            hset_object(endpoint, grad_bucket,
                        b_grad_prefix + str(worker_index), b_grad.tobytes())
            tmp_write_local_epoch_time = time.time() - sync_start
            print("write local gradient cost = {}".format(
                tmp_write_local_epoch_time))

            # merge gradients among files; worker 0 does the merge
            file_postfix = "{}_{}".format(epoch, batch_index)
            if worker_index == 0:
                w_grad_merge, b_grad_merge = \
                    merge_w_b_grads(endpoint, grad_bucket, num_worker,
                                    w_grad.dtype, w_grad.shape, b_grad.shape,
                                    w_grad_prefix, b_grad_prefix)
                put_merged_w_b_grads(endpoint, model_bucket,
                                     w_grad_merge, b_grad_merge,
                                     file_postfix, w_grad_prefix, b_grad_prefix)
                hset_object(endpoint, model_bucket, "epoch", epoch)
                hset_object(endpoint, model_bucket, "index", batch_index)
            else:
                w_grad_merge, b_grad_merge = get_merged_w_b_grads(
                    endpoint, model_bucket, file_postfix,
                    w_grad.dtype, w_grad.shape, b_grad.shape,
                    w_grad_prefix, b_grad_prefix)
            model.linear.weight.grad = Variable(torch.from_numpy(w_grad_merge))
            model.linear.bias.grad = Variable(torch.from_numpy(b_grad_merge))

            tmp_sync_time = time.time() - sync_start
            print("synchronization cost {} s".format(tmp_sync_time))

            optimizer.step()
            tmp_train = tmp_train + loss.item()

        train_loss.append(tmp_train / (batch_index + 1))
        epoch_time += time.time() - epoch_start

        # Test the Model
        correct = 0
        total = 0
        tmp_test = 0
        count = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, num_features))
            outputs = model(items)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
            loss = criterion(outputs, labels)
            tmp_test = tmp_test + loss.item()
            count += 1
        print('Accuracy of the model on the %d test samples: %d %%'
              % (len(val_indices), 100 * correct / total))
        test_loss.append(tmp_test / count)
        test_acc.append(100 * correct / total)
        epoch_start = time.time()

    loss_record = [test_loss, test_acc, train_loss, epoch_time]
    put_object("grad-average-loss", "grad-loss{}".format(worker_index),
               pickle.dumps(loss_record))
def handler(event, context):
    """Lambda worker: distributed k-means with memcached-backed synchronization.

    Worker 0 publishes initial centroids (the first `num_clusters` points of
    its shard) under key ``initial``; the others wait for them.  Each epoch,
    every worker computes new local centroids, uploads them to
    `worker_cent_bucket`, and worker 0 averages them into
    ``avg-<epoch>`` via `compute_average_centroids`.  Training stops when
    the average error drops below `threshold` or `num_epochs` is reached;
    the accumulated sync time is then stored in the ``kmeans-time`` bucket.

    BUG FIX: ``ndarray.tostring()`` is deprecated (removed in NumPy 2.0);
    replaced with the byte-identical ``tobytes()``.

    NOTE(review): the original source was line-wrapped; the reconstructed
    placement of the per-epoch `sync_time` accounting follows the visible
    statement order — confirm against the original file.
    """
    avg_error = np.iinfo(np.int16).max  # sentinel: "not converged yet"

    num_features = event['num_features']
    num_clusters = event['num_clusters']
    worker_cent_bucket = event["worker_cent_bucket"]
    avg_cent_bucket = event["avg_cent_bucket"]
    num_epochs = event["num_epochs"]
    threshold = event["threshold"]
    dataset_type = event["dataset_type"]
    elastic_location = event["elasticache"]
    elastic_endpoint = memcached_init(elastic_location)
    print(elastic_endpoint)

    # Reading data from S3; key encodes "<worker_index>_..._<num_workers>"
    bucket_name = event['bucket_name']
    key = urllib.parse.unquote_plus(event['key'], encoding='utf-8')
    logger.info(
        f"Reading training data from bucket = {bucket_name}, key = {key}")
    key_splits = key.split("_")
    num_worker = int(key_splits[-1])
    worker_index = int(key_splits[0])

    event_start = time.time()
    file = get_object(bucket_name, key).read().decode('utf-8').split("\n")
    s3_end = time.time()
    logger.info(f"Getting object from s3 takes {s3_end - event_start}s")

    if dataset_type == "dense":
        # dataset is stored as numpy array
        dataset = DenseDatasetWithLines(file, num_features).ins_np
        dt = dataset.dtype
        centroid_shape = (num_clusters, dataset.shape[1])
    else:
        # dataset is sparse, stored as sparse tensor
        dataset = SparseDatasetWithLines(file, num_features)
        first_entry = dataset.ins_list[0].to_dense().numpy()
        dt = first_entry.dtype
        centroid_shape = (num_clusters, first_entry.shape[1])
    parse_end = time.time()
    logger.info(f"Parsing dataset takes {parse_end - s3_end}s")
    logger.info(
        f"worker index: {worker_index},Dataset: {dataset_type}, dtype: {dt}. "
        f"Centroids shape: {centroid_shape}. num_features: {num_features}")

    if worker_index == 0:
        # worker 0 seeds the shared initial centroids (first k points)
        if dataset_type == "dense":
            centroids = dataset[0:num_clusters].reshape(-1)
            hset_object(elastic_endpoint, avg_cent_bucket, "initial",
                        centroids.tobytes())
            centroids = centroids.reshape(centroid_shape)
        else:
            centroids = store_centroid_as_numpy(
                dataset.ins_list[0:num_clusters], num_clusters)
            hset_object(elastic_endpoint, avg_cent_bucket, "initial",
                        centroids.tobytes())
    else:
        # everyone else blocks until the initial centroids appear
        cent = hget_object_or_wait(elastic_endpoint, avg_cent_bucket,
                                   "initial", 0.00001)
        centroids = process_centroid(cent, num_clusters, dt)
        if centroid_shape != centroids.shape:
            logger.error("The shape of centroids does not match.")
    logger.info(
        f"Waiting for initial centroids takes {time.time() - parse_end} s")

    training_start = time.time()
    sync_time = 0
    for epoch in range(num_epochs):
        logger.info(f"{worker_index}-th worker in {epoch}-th epoch")
        epoch_start = time.time()
        if epoch != 0:
            # fetch last epoch's averaged centroids (with trailing error)
            last_epoch = epoch - 1
            cent_with_error = hget_object_or_wait(elastic_endpoint,
                                                  avg_cent_bucket,
                                                  f"avg-{last_epoch}",
                                                  0.00001)
            wait_end = time.time()
            if worker_index != 0:
                logger.info(f"Wait for centroid for {epoch}-th epoch. "
                            f"Takes {wait_end - epoch_start}")
                sync_time += wait_end - epoch_start
            avg_error, centroids = process_centroid(cent_with_error,
                                                    num_clusters, dt, True)
        if avg_error >= threshold:
            print("get new centro")
            res = get_new_centroids(dataset, dataset_type, centroids, epoch,
                                    num_features, num_clusters)
            sync_start = time.time()
            success = hset_object(elastic_endpoint, worker_cent_bucket,
                                  f"{worker_index}_{epoch}", res.tobytes())
            if worker_index == 0 and success:
                # worker 0 merges everyone's centroids for this epoch
                compute_average_centroids(elastic_endpoint, avg_cent_bucket,
                                          worker_cent_bucket, num_worker,
                                          centroid_shape, epoch, dt)
                logger.info(
                    f"Waiting for all workers takes {time.time() - sync_start} s")
            if epoch != 0:
                sync_time += time.time() - sync_start
        else:
            # converged: report and stop without publishing more centroids
            print("sync time = {}".format(sync_time))
            logger.info(
                f"{worker_index}-th worker finished training. "
                f"Error = {avg_error}, centroids = {centroids}")
            logger.info(f"Whole process time : {time.time() - training_start}")
            return

    print("sync time = {}".format(sync_time))
    put_object("kmeans-time", "time_{}".format(worker_index),
               np.asarray(sync_time).tobytes())
def handler(event, context):
    """Partition a features/labels dataset stored on S3 into `n_files` shards.

    Downloads the full feature and label matrices (``.npy`` files named by
    module-level `feature_file_name` / `label_file_name`), validates that
    their row counts match, then writes `n_files` equal shards back to the
    bucket as ``features_<i>_<n_files>`` / ``labels_<i>_<n_files>``.
    Worker 0 also clears the tmp/merged buckets used by training.

    Raises:
        AssertionError: if the feature and label matrices disagree on rows.

    BUG FIX: ``samples_per_file`` used true division (``/``), producing a
    float; float slice bounds raise TypeError in Python 3.  Use integer
    division.  Any remainder rows (row_features % n_files) are dropped, as
    in the original intent.

    NOTE(review): relies on module-level feature_file_name, label_file_name,
    local_dir, n_files and project helpers put_object/clear_bucket.
    """
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']
    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('tmp bucket = {}'.format(tmp_bucket))
    print('merged bucket = {}'.format(merged_bucket))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('num epochs = {}'.format(num_epochs))
    print('learning rate = {}'.format(learning_rate))
    print("batch size = {}".format(batch_size))

    s3 = boto3.client('s3')

    # read feature matrix from s3
    s3.download_file(bucket, feature_file_name,
                     local_dir + str(feature_file_name))
    features_matrix = np.load(local_dir + str(feature_file_name))
    print("read features matrix cost {} s".format(time.time() - start_time))
    print("feature matrix shape = {}, dtype = {}".format(
        features_matrix.shape, features_matrix.dtype))
    print("feature matrix sample = {}".format(features_matrix[0]))
    row_features = features_matrix.shape[0]
    col_features = features_matrix.shape[1]

    # read label matrix from s3
    s3.download_file(bucket, label_file_name, local_dir + str(label_file_name))
    labels_matrix = np.load(local_dir + str(label_file_name))
    print("read label matrix cost {} s".format(time.time() - start_time))
    print("label matrix shape = {}, dtype = {}".format(labels_matrix.shape,
                                                       labels_matrix.dtype))
    print("label matrix sample = {}".format(labels_matrix[0:10]))
    row_labels = labels_matrix.shape[0]

    if row_features != row_labels:
        raise AssertionError(
            "row of feature matrix is {}, but row of label matrix is {}.".
            format(row_features, row_labels))

    features_matrix = features_matrix.flatten()
    # integer division: slice bounds must be ints (fixes Py3 TypeError)
    samples_per_file = row_features // n_files
    for i in range(n_files):
        start_row = i * samples_per_file
        end_row = (i + 1) * samples_per_file
        features_file_name = "features_{}_{}".format(i, n_files)
        labels_file_name = "labels_{}_{}".format(i, n_files)
        put_object(
            bucket, features_file_name,
            features_matrix[start_row * col_features:
                            end_row * col_features].tobytes())
        put_object(bucket, labels_file_name,
                   labels_matrix[start_row:end_row].tobytes())

    if worker_index == 0:
        clear_bucket(merged_bucket)
        clear_bucket(tmp_bucket)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def put_merged(bucket_name, merged_value, prefix, file_postfix):
    """Pickle `merged_value` and store it in `bucket_name` under ``prefix + file_postfix``."""
    storage_key = prefix + file_postfix
    put_object(bucket_name, storage_key, pickle.dumps(merged_value))
def scatter_reduce(vector, tmp_bucket, merged_bucket, num_workers, myrank, postfix):
    """All-reduce a 1-d numpy vector across workers through two S3 buckets.

    Scatter-reduce scheme: the vector is split into `num_workers` chunks
    (the first ``size % num_workers`` chunks get one extra element).  Each
    worker ships every chunk it does not own to its owner via `tmp_bucket`
    (keys: ``chunkID_workerID_epoch_batch``), sums the `num_workers - 1`
    copies of its own chunk, publishes the sum to `merged_bucket`
    (keys: ``chunkID_epoch_batch``), then gathers everyone else's reduced
    chunk and concatenates them in rank order.

    Returns the fully reduced vector (same dtype as `vector`).
    """
    total = vector.size
    base_len = total // num_workers
    extra = total % num_workers
    epoch_tag = postfix.split("_")[0]
    batch_tag = postfix.split("_")[1]

    def chunk_bounds(rank):
        # ranks below `extra` own one additional element
        start = (base_len * rank) + min(extra, rank)
        length = base_len + (1 if rank < extra else 0)
        return start, length

    my_start, my_len = chunk_bounds(myrank)
    my_chunk = vector[my_start:my_start + my_len]

    # Scatter phase: ship every chunk I do not own to its owning worker.
    for owner in range(num_workers):
        if owner == myrank:
            continue
        start, length = chunk_bounds(owner)
        # key indicates the chunk number and which worker it comes from
        chunk_key = "{}_{}".format(owner, myrank)
        put_object(tmp_bucket, chunk_key + '_' + postfix,
                   vector[start:start + length].tobytes())

    # Reduce phase: poll for the num_workers-1 foreign copies of my chunk
    # for the current epoch/batch, summing and deleting each as it arrives.
    received = 0
    while received < num_workers - 1:
        listing = list_bucket_objects(tmp_bucket)
        if listing is None:
            continue
        for entry in listing:
            obj_key = urllib.parse.unquote_plus(entry["Key"], encoding='utf-8')
            parts = obj_key.split("_")
            # key layout in tmp bucket: chunkID_workerID_epoch_batch
            if parts[0] == str(myrank) and parts[2] == epoch_tag \
                    and parts[3] == batch_tag:
                payload = get_object(tmp_bucket, obj_key).read()
                my_chunk = my_chunk + np.frombuffer(payload, dtype=vector.dtype)
                received += 1
                delete_object(tmp_bucket, obj_key)

    # Publish my reduced chunk; key layout in merged bucket: chunkID_epoch_batch
    put_object(merged_bucket, str(myrank) + '_' + postfix, my_chunk.tobytes())

    # All-gather phase: collect everyone else's reduced chunk.
    chunks = {myrank: my_chunk}
    seen = []
    while len(chunks) < num_workers:
        listing = list_bucket_objects(merged_bucket)
        if listing is None:
            continue
        for entry in listing:
            obj_key = urllib.parse.unquote_plus(entry["Key"], encoding='utf-8')
            parts = obj_key.split("_")
            if parts[0] != str(myrank) and parts[1] == epoch_tag \
                    and parts[2] == batch_tag and obj_key not in seen:
                payload = get_object(merged_bucket, obj_key).read()
                chunks[int(parts[0])] = np.frombuffer(payload,
                                                      dtype=vector.dtype)
                seen.append(obj_key)

    # Reassemble the full vector in chunk (rank) order.
    result = chunks[0]
    for rank in range(1, num_workers):
        result = np.concatenate((result, chunks[rank]))
    return result
def reduce_scatter_batch_multi_bucket(vector, tmp_bucket_prefix,
                                      merged_bucket_prefix, num_buckets,
                                      num_workers, myrank, postfix):
    """All-reduce a 1-d numpy vector across workers, sharding keys over
    `num_buckets` tmp/merged buckets (named ``<prefix>-<i>``) to spread
    S3 request load.

    Same scatter-reduce scheme as `scatter_reduce`: each worker ships the
    chunks it does not own to their owners (bucket chosen by
    ``owner % num_buckets``), sums the copies of its own chunk, publishes
    the result to its merged bucket, gathers everyone else's reduced chunk,
    and concatenates them in rank order.

    BUG FIXES in the uneven-distribution bookkeeping:
      * ``range(num_buckets % num_buckets)`` is always empty — the intended
        expression is ``num_workers % num_buckets``;
      * the loop appended to ``num_buckets`` (an int → AttributeError)
        instead of ``bucket_num_objs``;
      * ``num_workers / num_buckets`` produced floats — use ``//`` so the
        per-bucket counters are ints.
    """
    num_all_values = vector.size
    num_values_per_worker = num_all_values // num_workers
    residue = num_all_values % num_workers
    curr_epoch = postfix.split("_")[0]
    curr_batch = postfix.split("_")[1]

    my_offset = (num_values_per_worker * myrank) + min(residue, myrank)
    my_length = num_values_per_worker + (1 if myrank < residue else 0)
    my_chunk = vector[my_offset:my_offset + my_length]

    # write partitioned vector to shared storage, except my own chunk;
    # format of key in tmp bucket: chunkID_workerID_epoch_batch
    for i in range(num_workers):
        if i != myrank:
            offset = (num_values_per_worker * i) + min(residue, i)
            length = num_values_per_worker + (1 if i < residue else 0)
            key = "{}_{}".format(i, myrank)
            tmp_bucket_ind = i % num_buckets
            tmp_bucket = "{}-{}".format(tmp_bucket_prefix, tmp_bucket_ind)
            put_object(tmp_bucket, key + '_' + postfix,
                       vector[offset:offset + length].tobytes())

    # read and aggregate the copies of my chunk from my tmp bucket
    num_files = 0
    tmp_bucket_ind = myrank % num_buckets
    tmp_bucket = "{}-{}".format(tmp_bucket_prefix, tmp_bucket_ind)
    print("worker [{}] read and aggregate the corresponding chunks in bucket {}".
          format(myrank, tmp_bucket))
    while num_files < num_workers - 1:
        objects = list_bucket_objects(tmp_bucket)
        if objects is not None:
            for obj in objects:
                file_key = urllib.parse.unquote_plus(obj["Key"],
                                                     encoding='utf-8')
                key_splits = file_key.split("_")
                # only chunks addressed to me, for the current step
                if key_splits[0] == str(myrank) and key_splits[2] == curr_epoch \
                        and key_splits[3] == curr_batch:
                    print("get obj = {}".format(file_key))
                    data = get_object(tmp_bucket, file_key).read()
                    bytes_data = np.frombuffer(data, dtype=vector.dtype)
                    my_chunk = my_chunk + bytes_data
                    num_files += 1
                    delete_object(tmp_bucket, file_key)

    # publish the aggregated chunk; key format: chunkID_epoch_batch
    merged_bucket_ind = myrank % num_buckets
    my_merged_bucket = "{}-{}".format(merged_bucket_prefix, merged_bucket_ind)
    put_object(my_merged_bucket, str(myrank) + '_' + postfix,
               my_chunk.tobytes())

    # read other workers' aggregated chunks
    merged_value = {myrank: my_chunk}

    # expected object count per merged bucket (round-robin assignment)
    if num_workers % num_buckets == 0:
        bucket_num_objs = [
            num_workers // num_buckets for _ in range(num_buckets)
        ]
    else:
        bucket_num_objs = []
        remainder = num_workers % num_buckets
        for i in range(remainder):
            bucket_num_objs.append(num_workers // num_buckets + 1)
        for i in range(remainder, num_buckets):
            bucket_num_objs.append(num_workers // num_buckets)
    # do not count the chunk I am responsible for
    bucket_num_objs[myrank % num_buckets] -= 1
    print("bucket num objs = {}".format(bucket_num_objs))

    num_merged_files = 0
    already_read = []
    bucket_num_merged = [0 for _ in range(num_buckets)]
    while num_merged_files < num_workers - 1:
        for i in range(num_buckets):
            if bucket_num_merged[i] < bucket_num_objs[i]:
                merged_bucket = "{}-{}".format(merged_bucket_prefix, i)
                objects = list_bucket_objects(merged_bucket)
                if objects is not None:
                    for obj in objects:
                        file_key = urllib.parse.unquote_plus(obj["Key"],
                                                             encoding='utf-8')
                        key_splits = file_key.split("_")
                        # key format in merged bucket: chunkID_epoch_batch
                        if key_splits[0] != str(myrank) and key_splits[1] == curr_epoch \
                                and key_splits[2] == curr_batch and file_key not in already_read:
                            print("merge obj = {}".format(file_key))
                            data = get_object(merged_bucket, file_key).read()
                            bytes_data = np.frombuffer(data,
                                                       dtype=vector.dtype)
                            merged_value[int(key_splits[0])] = bytes_data
                            already_read.append(file_key)
                            bucket_num_merged[i] += 1
                            num_merged_files += 1

    # reconstruct the whole vector in rank order
    result = merged_value[0]
    for k in range(1, num_workers):
        result = np.concatenate((result, merged_value[k]))
    return result
def handler(event, context):
    """S3-triggered Lambda worker: synchronous logistic-regression training
    with gradient averaging through S3.

    The triggering object's key encodes ``<worker_index>_<num_workers>...``.
    Per batch, every worker uploads its weight/bias gradients to
    `grad_bucket`; worker 0 merges them (via `merge_w_b_grads`), publishes
    the merged result to `model_bucket`, and evicts stale keys; all workers
    then install the merged gradients before `optimizer.step()`.  After
    training, the model is evaluated on the validation split.

    NOTE(review): relies on module-level config (num_features, num_classes,
    batch_size, validation_ratio, shuffle_dataset, random_seed, num_epochs,
    learning_rate, grad_bucket, model_bucket, w_grad_prefix, b_grad_prefix)
    and project helpers — confirm against the defining module.  The source
    was line-wrapped; the reconstructed indentation (bucket clearing and
    final test after the epoch loop) follows the visible statement order.
    """
    startTs = time.time()
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'],
                                    encoding='utf-8')
    print('bucket = {}'.format(bucket))
    print('key = {}'.format(key))
    # key layout: "<worker_index>_<num_workers>..."
    key_splits = key.split("_")
    worker_index = int(key_splits[0])
    num_worker = int(key_splits[1])
    sync_meta = SyncMeta(worker_index, num_worker)
    print("synchronization meta {}".format(sync_meta.__str__()))

    # read file from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - startTs))

    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, num_features)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)
    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    for epoch in range(num_epochs):
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(
                worker_index, epoch, batch_index))
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()
            print("forward and backward cost {} s".format(time.time() -
                                                          batch_start))

            w_grad = model.linear.weight.grad.data.numpy()
            b_grad = model.linear.bias.grad.data.numpy()
            #print("dtype of grad = {}".format(w_grad.dtype))
            print("w_grad before merge = {}".format(w_grad[0][0:5]))
            print("b_grad before merge = {}".format(b_grad))

            # publish this worker's gradients for the current batch
            sync_start = time.time()
            put_object(grad_bucket, w_grad_prefix + str(worker_index),
                       w_grad.tobytes())
            put_object(grad_bucket, b_grad_prefix + str(worker_index),
                       b_grad.tobytes())

            file_postfix = "{}_{}".format(epoch, batch_index)
            if worker_index == 0:
                # worker 0 merges all gradients and publishes the result
                w_grad_merge, b_grad_merge = \
                    merge_w_b_grads(grad_bucket, num_worker, w_grad.dtype,
                                    w_grad.shape, b_grad.shape,
                                    w_grad_prefix, b_grad_prefix)
                put_merged_w_b_grad(model_bucket, w_grad_merge, b_grad_merge,
                                    file_postfix, w_grad_prefix,
                                    b_grad_prefix)
                delete_expired_w_b(model_bucket, epoch, batch_index,
                                   w_grad_prefix, b_grad_prefix)
                model.linear.weight.grad = Variable(
                    torch.from_numpy(w_grad_merge))
                model.linear.bias.grad = Variable(
                    torch.from_numpy(b_grad_merge))
            else:
                # other workers wait for and install the merged gradients
                w_grad_merge, b_grad_merge = get_merged_w_b_grad(
                    model_bucket, file_postfix, w_grad.dtype, w_grad.shape,
                    b_grad.shape, w_grad_prefix, b_grad_prefix)
                model.linear.weight.grad = Variable(
                    torch.from_numpy(w_grad_merge))
                model.linear.bias.grad = Variable(
                    torch.from_numpy(b_grad_merge))
            print("w_grad after merge = {}".format(
                model.linear.weight.grad.data.numpy()[0][:5]))
            print("b_grad after merge = {}".format(
                model.linear.bias.grad.data.numpy()))
            print("synchronization cost {} s".format(time.time() - sync_start))

            optimizer.step()
            print("batch cost {} s".format(time.time() - batch_start))

            if (batch_index + 1) % 10 == 0:
                print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f'
                      % (epoch + 1, num_epochs, batch_index + 1,
                         len(train_indices) / batch_size, loss.data))

    # worker 0 cleans the shared buckets once training is done
    if worker_index == 0:
        clear_bucket(model_bucket)
        clear_bucket(grad_bucket)

    # Test the Model
    correct = 0
    total = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, num_features))
        # items = Variable(items)
        outputs = model(items)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy of the model on the %d test samples: %d %%'
          % (len(val_indices), 100 * correct / total))

    endTs = time.time()
    print("elapsed time = {} s".format(endTs - startTs))
def handler(event, context):
    """Train MobileNet on this worker's CIFAR-10 shard, syncing through Redis.

    Downloads ``training_<rank>.pt`` plus the shared test set from S3,
    builds data loaders, and runs `train`/`test` for `num_epochs` epochs.
    The timing record returned by the final epoch's `train` call is pickled
    to the ``time-record-s3`` bucket.

    NOTE(review): relies on module-level config (batch_size, learning_rate,
    num_epochs, sync_mode, sync_step, test_file) and project helpers
    (redis_init, download_file, train, test, put_object, MobileNet).
    """
    t0 = time.time()
    data_bucket = event['data_bucket']
    rank = event['rank']
    endpoint = redis_init(event['redis'])
    world_size = event['num_workers']
    shard_key = 'training_{}.pt'.format(rank)
    print('data_bucket = {}\n worker_index:{}\n num_worker:{}\n key:{}'.format(
        data_bucket, rank, world_size, shard_key))

    # fetch the worker's shard and the shared test set from S3
    fetch_start = time.time()
    shard_path = download_file(data_bucket, shard_key)
    eval_path = download_file(data_bucket, test_file)
    training_data = torch.load(shard_path)
    testing_data = torch.load(eval_path)
    print("read data cost {} s".format(time.time() - fetch_start))
    print(training_data)

    loader_train = torch.utils.data.DataLoader(training_data,
                                               batch_size=batch_size,
                                               shuffle=True)
    loader_test = torch.utils.data.DataLoader(testing_data,
                                              batch_size=100,
                                              shuffle=False)
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    device = 'cpu'

    # Build the model (several torchvision-style CIFAR architectures were
    # considered; MobileNet is the one in use).
    print('==> Building model..')
    model = MobileNet()
    print("Model: MobileNet")
    model = model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    sgd = optim.SGD(model.parameters(), lr=learning_rate,
                    momentum=0.9, weight_decay=5e-4)

    for epoch in range(num_epochs):
        time_record = train(endpoint, epoch, model, loader_train, sgd,
                            loss_fn, device, rank, world_size, sync_mode,
                            sync_step)
        test(epoch, model, loader_test, loss_fn, device)

    # only the record from the final epoch survives the loop
    put_object("time-record-s3", "time_{}".format(rank),
               pickle.dumps(time_record))
def train(endpoint, epoch, net, train_loader, optimizer, criterion, device,
          worker_index, num_worker, sync_mode, sync_step):
    """Run one training epoch with distributed synchronization.

    Two sync strategies are supported:
      * ``model_avg`` — each worker steps locally, then every ``sync_step``
        batches the workers average their *weights* through S3
        (``put_object``/``merge_w_b_layers``).
      * ``grad_avg``  — every batch the workers average their *gradients*
        through the key/value store (``hset_object``), then step.

    Worker 0 acts as the aggregator in both modes; the other workers poll for
    the merged result.

    Returns:
        (sync_epoch_time, write_local_epoch_time, calculation_epoch_time):
        three lists of per-batch timings (batch 0 is excluded from each as a
        warm-up batch).
    """
    # print('\nEpoch: %d' % epoch)
    net.train()
    # train_loss = 0
    # correct = 0
    # total = 0
    sync_epoch_time = []
    write_local_epoch_time = []
    calculation_epoch_time = []

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        print("------worker {} epoch {} batch {}------".format(
            worker_index, epoch + 1, batch_idx + 1))
        batch_start = time.time()
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()

        tmp_calculation_time = time.time() - batch_start
        print("forward and backward cost {} s".format(tmp_calculation_time))
        if batch_idx != 0:  # skip warm-up batch in the timing stats
            calculation_epoch_time.append(tmp_calculation_time)

        if sync_mode == 'model_avg':
            # apply local gradient to local model
            optimizer.step()
            # average model every sync_step batches
            if (batch_idx + 1) % sync_step == 0:
                sync_start = time.time()
                # get current weights
                weights = [param.data.numpy() for param in net.parameters()]
                # print("[Worker {}] Weights before sync = {}".format(worker_index, weights[0][0]))

                # upload updated weights to S3
                # hset_object(endpoint, tmp_bucket, weights_prefix + str(worker_index), pickle.dumps(weights))
                put_object_start = time.time()
                put_object(tmp_bucket, weights_prefix + str(worker_index),
                           pickle.dumps(weights))
                # BUG FIX: was assigned to a misspelled name
                # (tmp_write_local_peoch_time) and then read under the correct
                # spelling, raising NameError on the first sync.
                tmp_write_local_epoch_time = time.time() - put_object_start
                print("writing local gradients in s3 cost {}".format(
                    tmp_write_local_epoch_time))
                if batch_idx != 0:
                    write_local_epoch_time.append(tmp_write_local_epoch_time)

                file_postfix = "{}_{}".format(epoch, batch_idx)
                if worker_index == 0:
                    # merge all workers' weights
                    merged_value = \
                        merge_w_b_layers(endpoint, tmp_bucket, num_worker,
                                         weights_prefix)
                    # while sync_counter(endpoint, model_bucket, num_worker):
                    #     time.sleep(0.0001)
                    # upload merged value to S3
                    put_merged_w_b_layers(endpoint, merged_bucket, merged_value,
                                          weights_prefix, file_postfix)
                    delete_expired_w_b_layers(endpoint, merged_bucket, epoch,
                                              batch_idx, weights_prefix)
                else:
                    # get merged value from S3 (blocks until worker 0 wrote it)
                    merged_value = get_merged_w_b_layers(
                        endpoint, merged_bucket, weights_prefix, file_postfix)

                # update the model with the averaged model
                for layer_index, param in enumerate(net.parameters()):
                    param.data = torch.nn.Parameter(
                        torch.from_numpy(merged_value[layer_index]))

                tmp_sync_time = time.time() - sync_start
                print("synchronization cost {} s".format(tmp_sync_time))
                if batch_idx != 0:
                    sync_epoch_time.append(tmp_sync_time)

        if sync_mode == 'grad_avg':
            sync_start = time.time()
            gradients = [param.grad.data.numpy() for param in net.parameters()]
            # print("[Worker {}] Gradients before sync = {}".format(worker_index, gradients[0][0]))

            put_object_start = time.time()
            hset_object(endpoint, tmp_bucket,
                        gradients_prefix + str(worker_index),
                        pickle.dumps(gradients))
            tmp_write_local_epoch_time = time.time() - put_object_start
            print("writing local gradients in redis cost {}".format(
                tmp_write_local_epoch_time))
            if batch_idx != 0:
                write_local_epoch_time.append(tmp_write_local_epoch_time)

            file_postfix = "{}_{}".format(epoch, batch_idx)
            if worker_index == 0:
                # merge all workers' gradients
                merged_value_start = time.time()
                merged_value = \
                    merge_w_b_layers(endpoint, tmp_bucket, num_worker,
                                     gradients_prefix)
                print("merged_value cost {} s".format(
                    time.time() - merged_value_start))

                put_merged_start = time.time()
                # upload merged value to S3
                put_merged_w_b_layers(endpoint, merged_bucket, merged_value,
                                      gradients_prefix, file_postfix)
                print("put_merged cost {} s".format(
                    time.time() - put_merged_start))
                delete_expired_w_b_layers(endpoint, merged_bucket, epoch,
                                          batch_idx, gradients_prefix)
            else:
                read_merged_start = time.time()
                # get merged value from redis
                merged_value = get_merged_w_b_layers(endpoint, merged_bucket,
                                                     gradients_prefix,
                                                     file_postfix)
                print("read_merged cost {} s".format(
                    time.time() - read_merged_start))

            # replace local gradients with the averaged ones before stepping
            for layer_index, param in enumerate(net.parameters()):
                param.grad = Variable(
                    torch.from_numpy(merged_value[layer_index]))

            tmp_sync_time = time.time() - sync_start
            print("synchronization cost {} s".format(tmp_sync_time))
            if batch_idx != 0:
                sync_epoch_time.append(tmp_sync_time)

            if worker_index == 0:
                # NOTE(review): this repeats the delete_expired_w_b_layers call
                # already made above for the same (epoch, batch_idx); kept as-is
                # on the assumption the deletion is idempotent — confirm and
                # drop one of the two.
                delete_expired_w_b_layers(endpoint, merged_bucket, epoch,
                                          batch_idx, gradients_prefix)

            optimizer.step()

        # train_loss += loss.item()
        # _, predicted = outputs.max(1)
        # total += targets.size(0)
        # correct += predicted.eq(targets).sum().item()
        print("batch cost {} s".format(time.time() - batch_start))
        if (batch_idx + 1) % 1 == 0:
            print('Epoch: {}, Step: {}, Loss:{}'.format(
                epoch + 1, batch_idx + 1, loss.data))

    return sync_epoch_time, write_local_epoch_time, calculation_epoch_time
def handler(event, context):
    """Lambda entry point for one SVM model-averaging worker.

    Reads a dense text dataset from S3, splits it into train/validation,
    trains a linear SVM locally for each epoch, then averages the model's
    weight and bias across workers through a Memcached endpoint (worker 0
    merges; the others fetch the merged result).  Per-epoch train loss, test
    loss, test accuracy and cumulative training time are pickled to the
    ``model-average-loss`` bucket.

    NOTE(review): relies on module-level globals not visible in this chunk —
    ``num_features``, ``num_classes``, ``validation_ratio``,
    ``shuffle_dataset``, ``random_seed``, ``w_prefix``, ``b_prefix`` —
    confirm they are defined at file top.
    """
    start_time = time.time()
    # All run parameters arrive in the invocation payload.
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    merged_bucket = event['merged_bucket']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']
    elasti_location = event['elasticache']
    endpoint = memcached_init(elasti_location)

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('merged bucket = {}'.format(merged_bucket))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('num epochs = {}'.format(num_epochs))
    print('learning rate = {}'.format(learning_rate))
    print("batch size = {}".format(batch_size))

    # read file from s3 — one sample per line of text
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    dataset = DenseDatasetWithLines(file, num_features)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        # Fixed seed so every worker (and rerun) gets the same split.
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = SVM(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Per-epoch metric histories, uploaded at the end of the run.
    train_loss = []
    test_loss = []
    test_acc = []
    epoch_time = 0

    # Training the Model
    epoch_start = time.time()
    for epoch in range(num_epochs):
        tmp_train = 0
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(
                worker_index, epoch, batch_index))
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize (purely local step)
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            if (batch_index + 1) % 1 == 0:
                print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f'
                      % (epoch + 1, num_epochs, batch_index + 1,
                         len(train_indices) / batch_size, loss.data))
            tmp_train = tmp_train + loss.item()
        # Average training loss over the batches of this epoch.
        train_loss.append(tmp_train / (batch_index + 1))

        # sync model: publish this worker's weight/bias as raw bytes
        w_model = model.linear.weight.data.numpy()
        b_model = model.linear.bias.data.numpy()
        # Accumulate training time only (sync and validation excluded;
        # epoch_start is reset after validation below).
        epoch_time = time.time() - epoch_start + epoch_time
        # synchronization starts from that every worker writes their model after this epoch
        sync_start = time.time()
        hset_object(endpoint, merged_bucket, w_prefix + str(worker_index),
                    w_model.tobytes())
        hset_object(endpoint, merged_bucket, b_prefix + str(worker_index),
                    b_model.tobytes())
        tmp_write_local_epoch_time = time.time() - sync_start
        print("write local model cost = {}".format(tmp_write_local_epoch_time))

        # merge gradients among files; merged entries are keyed by epoch
        file_postfix = "{}".format(epoch)
        if worker_index == 0:
            # Worker 0 is the aggregator: merge all models and publish result.
            merge_start = time.time()
            w_model_merge, b_model_merge = merge_w_b_grads(
                endpoint, merged_bucket, num_workers, w_model.dtype,
                w_model.shape, b_model.shape, w_prefix, b_prefix)
            put_merged_w_b_grads(endpoint, merged_bucket,
                                 w_model_merge, b_model_merge,
                                 file_postfix, w_prefix, b_prefix)
        else:
            # Other workers wait for and fetch the merged model.
            w_model_merge, b_model_merge = get_merged_w_b_grads(
                endpoint, merged_bucket, file_postfix, w_model.dtype,
                w_model.shape, b_model.shape, w_prefix, b_prefix)

        # Overwrite the local model with the averaged weights/bias.
        model.linear.weight.data = Variable(torch.from_numpy(w_model_merge))
        model.linear.bias.data = Variable(torch.from_numpy(b_model_merge))
        tmp_sync_time = time.time() - sync_start
        print("synchronization cost {} s".format(tmp_sync_time))

        # Test the Model on the validation split
        correct = 0
        total = 0
        count = 0
        tmp_test = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, num_features))
            outputs = model(items)
            loss = criterion(outputs, labels)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
            tmp_test = tmp_test + loss.item()
            count = count + 1
        # print('Accuracy of the model on the %d test samples: %d %%' % (len(val_indices), 100 * correct / total))
        test_acc.append(100 * correct / total)
        test_loss.append(tmp_test / count)
        # Restart the training-time clock so sync/validation aren't counted.
        epoch_start = time.time()

    end_time = time.time()
    print("elapsed time = {} s".format(end_time - start_time))
    loss_record = [test_loss, test_acc, train_loss, epoch_time]
    put_object("model-average-loss",
               "average_loss{}".format(worker_index),
               pickle.dumps(loss_record))