def upload(filename, prefix=False, bucket_name=False, key=None, secret=None, host=None):
    """
    Upload a file to Amazon S3 and return the URL reported by the storage.

    ``filename`` is either a path string (the file is opened in binary mode
    here and closed again once the upload attempt is over) or an already
    open ``file``/``File`` object (left open for the caller).

    The stored key is the base name of the path (or the ``name`` attribute of
    the file object), optionally placed under ``prefix``; exactly one ``/``
    is inserted when ``prefix`` does not already end with one.

    Raises ``TypeError`` when no name can be derived from ``filename``.
    """
    s3 = S3Storage(bucket_name=bucket_name, key=key, secret=secret, host=host)

    name = None
    opened_here = False
    if isinstance(filename, basestring):
        # The base name has to be taken from the path string *before* the
        # variable is rebound to the open file object; basename() cannot
        # work on a file object.
        name = path.basename(filename)
        filename = open(filename, 'rb')
        opened_here = True
    elif isinstance(filename, (file, File)):
        name = filename.name

    try:
        if not name:
            raise TypeError('Filename must be file or string instance.')

        if prefix:
            if prefix.endswith('/'):
                full_path = prefix + name
            else:
                full_path = prefix + '/' + name
        else:
            full_path = name

        s3.save(full_path, filename)
    finally:
        # Only close handles created in this function; a file object passed
        # in by the caller stays under the caller's control.
        if opened_here:
            filename.close()

    return s3.url(full_path)
def storage_engine():
    """Yield an S3Storage engine that already holds the test content.

    Everything happens inside the ``mock_s3()`` context: the test bucket is
    created, the engine is built against it and ``_TEST_CONTENT`` is stored
    under ``_TEST_PATH`` before the engine is handed out.
    """
    with mock_s3():
        # The bucket has to exist before anything can be written to it.
        connection = boto.connect_s3()
        connection.create_bucket(_TEST_BUCKET)

        s3_engine = S3Storage(
            _TEST_CONTEXT,
            "some/path",
            _TEST_BUCKET,
            _TEST_USER,
            _TEST_PASSWORD,
        )
        s3_engine.put_content(_TEST_PATH, _TEST_CONTENT)

        yield s3_engine
def test_copy(bucket, username, password, storage_engine):
    """Content copied to a second engine must be readable from that engine."""
    # Destination engine: same bucket and credentials, different base path.
    destination = S3Storage(
        _TEST_CONTEXT,
        "another/path",
        _TEST_BUCKET,
        _TEST_USER,
        _TEST_PASSWORD,
    )
    boto.connect_s3().create_bucket("another_bucket")

    storage_engine.copy_to(destination, _TEST_PATH)

    # Read the content back through the destination engine.
    copied = destination.get_content(_TEST_PATH)
    assert copied == _TEST_CONTENT
def test_stream_write_error():
    """Writing a stream into a bucket that was never created raises IOError."""
    with mock_s3():
        # Only the engine is set up here; the bucket is deliberately missing.
        engine = S3Storage(
            _TEST_CONTEXT,
            "some/path",
            _TEST_BUCKET,
            _TEST_USER,
            _TEST_PASSWORD,
        )

        payload = StringIO("hello world")
        with pytest.raises(IOError):
            engine.stream_write(_TEST_PATH, payload, content_type="Cool/Type")

        # The failed write must not have left anything behind.
        present = engine.exists(_TEST_PATH)
        assert not present
def storage_engine():
    """Yield a region-configured S3Storage engine holding the test content.

    Inside the ``mock_s3()`` context the test bucket is created, the engine
    is built with ``_TEST_REGION``, its ``endpoint_url`` connect argument is
    checked against the regional endpoint, and ``_TEST_CONTENT`` is stored
    under ``_TEST_PATH`` before the engine is yielded.
    """
    with mock_s3():
        # The bucket has to exist before anything can be written to it.
        s3_client = boto3.client("s3")
        s3_client.create_bucket(Bucket=_TEST_BUCKET)

        s3_engine = S3Storage(
            _TEST_CONTEXT,
            "some/path",
            _TEST_BUCKET,
            _TEST_USER,
            _TEST_PASSWORD,
            _TEST_REGION,
        )

        # The region must have been turned into the regional endpoint URL.
        expected_endpoint = "https://s3.{}.amazonaws.com".format(_TEST_REGION)
        assert s3_engine._connect_kwargs["endpoint_url"] == expected_endpoint

        s3_engine.put_content(_TEST_PATH, _TEST_CONTENT)
        yield s3_engine
def storage_engine(request):
    """Yield the storage engine selected by ``request.param``.

    ``'test'`` yields the module-level ``test_storage``; any other value
    yields a ``DistributedStorage`` with a single ``'foo'`` location backed
    by an ``S3Storage`` inside the ``mock_s3()`` context.
    """
    if request.param == 'test':
        yield test_storage
        return

    with mock_s3():
        # The bucket has to exist before the engine is used.
        boto.connect_s3().create_bucket(_TEST_BUCKET)

        s3_location = S3Storage(
            _TEST_CONTEXT,
            'some/path',
            _TEST_BUCKET,
            _TEST_USER,
            _TEST_PASSWORD,
        )
        distributed = DistributedStorage({'foo': s3_location}, ['foo'])
        yield distributed
def remove(name=None, prefix=False, bucket_name=False, key=None, secret=None, host=None):
    """
    Delete a file from Amazon S3.

    The key is built from ``name`` and ``prefix`` by ``_get_name``; the
    remaining arguments are passed to ``S3Storage`` unchanged.
    """
    target = _get_name(name, prefix)
    connection_options = dict(
        bucket_name=bucket_name,
        key=key,
        secret=secret,
        host=host,
    )
    S3Storage(**connection_options).delete(target)
def test_stream_write_error():
    """Operations against a bucket that was never created must fail loudly.

    ``stream_write`` is expected to raise ``IOError`` and ``exists`` is
    expected to raise a botocore ``ClientError`` whose error code is
    ``NoSuchBucket``.
    """
    with mock_s3():
        # Create an engine but not the bucket.
        engine = S3Storage(_TEST_CONTEXT, "some/path", _TEST_BUCKET, _TEST_USER, _TEST_PASSWORD)

        # Attempt to write to the uncreated bucket, which should raise an error.
        with pytest.raises(IOError):
            engine.stream_write(_TEST_PATH, BytesIO(b"hello world"), content_type="Cool/Type")

        with pytest.raises(botocore.exceptions.ClientError) as excinfo:
            engine.exists(_TEST_PATH)

        # Inspect the captured exception through the name bound by
        # pytest.raises, and do it after the block: a statement placed after
        # the raising call inside the block would never execute.
        assert excinfo.value.response["Error"]["Code"] == "NoSuchBucket"
def download(name=None, prefix=False, bucket_name=False, key=None, secret=None, host=None):
    """
    Open a file stored on Amazon S3 and return whatever ``S3Storage.open``
    returns for it.

    The key is built from ``name`` and ``prefix`` by ``_get_name``; the
    remaining arguments are passed to ``S3Storage`` unchanged.
    """
    target = _get_name(name, prefix)
    connection_options = dict(
        bucket_name=bucket_name,
        key=key,
        secret=secret,
        host=host,
    )
    storage = S3Storage(**connection_options)
    return storage.open(target)
def storage_engine(request):
    """Yield the storage engine selected by ``request.param``.

    ``"test"`` yields the module-level ``test_storage``; any other value
    yields a ``DistributedStorage`` with a single ``"foo"`` location backed
    by an ``S3Storage`` inside the ``mock_s3()`` context.
    """
    if request.param == "test":
        yield test_storage
        return

    with mock_s3():
        # The bucket has to exist before the engine is used.
        s3_client = boto3.client("s3")
        s3_client.create_bucket(Bucket=_TEST_BUCKET)

        s3_location = S3Storage(
            _TEST_CONTEXT,
            "some/path",
            _TEST_BUCKET,
            _TEST_USER,
            _TEST_PASSWORD,
        )
        distributed = DistributedStorage({"foo": s3_location}, ["foo"])
        yield distributed
def get_url(name=None, prefix=False, bucket_name=False, key=None, secret=None, host=None, expires=30, query_auth=False, force_http=False):
    """
    Return the URL that ``S3Storage.url`` reports for a key on Amazon S3.

    The key is built from ``name`` and ``prefix`` by ``_get_name``;
    ``expires``, ``query_auth`` and ``force_http`` are forwarded
    positionally to ``S3Storage.url``.
    """
    target = _get_name(name, prefix)
    connection_options = dict(
        bucket_name=bucket_name,
        key=key,
        secret=secret,
        host=host,
    )
    storage = S3Storage(**connection_options)
    return storage.url(target, expires, query_auth, force_http)
def init(self):
    """Prepare directories, storage, file groups and the hasher from self.config.

    Reads ``input_dir``, ``output_dir``, ``storage``, ``js_groups``,
    ``css_groups`` and (optionally) ``hash_version`` from ``self.config``.
    """
    self.read_file_map()

    config = self.config
    self.input_dir = config['input_dir']
    self.output_dir = config['output_dir']
    self.storage = S3Storage(**config['storage'])

    # Bare attribute access kept as in the original: it only touches
    # self.file_map (presumably populated by read_file_map() - verify).
    self.file_map

    # Working directories below the output directory.
    self.minify_dir = os.path.join(self.output_dir, 'minify')
    self.hash_input_dir = os.path.join(self.output_dir, 'hash_input')
    self.hash_output_dir = os.path.join(self.output_dir, 'hash_output')
    for directory in (self.minify_dir, self.hash_input_dir, self.hash_output_dir):
        self.prepare_dir(directory)

    self.mini_js_ext = '.mini.js'
    self.mini_css_ext = '.mini.css'
    self.gzip_ext = '.gzip'

    # One FileConfig per asset kind (JS first, then CSS), each filled from
    # the matching list of group definitions; gzip defaults to True.
    for attribute, config_key in (('js_config', 'js_groups'),
                                  ('css_config', 'css_groups')):
        file_config = FileConfig(self.input_dir)
        setattr(self, attribute, file_config)
        for group in self.config[config_key]:
            file_config.add_group(group['name'], group['files'],
                                  group.get('gzip', True))

    hash_version = self.config.get('hash_version', '')
    self.hash_file = HashFile(self.hash_input_dir, self.hash_output_dir,
                              hash_version=hash_version)
def upload(filename, name=None, prefix=False, bucket_name=False, key=None, secret=None, host=None, expires=30, query_auth=False, force_http=False, policy=None, replace=True):
    """
    Upload a file to Amazon S3. Returns String (the URL of the stored key).

    ``filename`` is either a path string (the file is opened in binary mode
    here and closed again once the upload attempt is over) or an already
    open ``file``/``File`` object (left open for the caller).

    When ``name`` is not given, the ``name`` attribute of the file object is
    used. The key is built from ``name`` and ``prefix`` by ``_get_name``.
    ``expires``, ``query_auth`` and ``force_http`` are forwarded to
    ``S3Storage.url``; ``policy`` and ``replace`` to ``S3Storage``.

    Raises ``TypeError`` when ``filename`` is neither a string nor a file.
    """
    opened_here = False
    if isinstance(filename, basestring):
        fl = open(filename, 'rb')
        opened_here = True
    elif isinstance(filename, (file, File)):
        fl = filename
    else:
        raise TypeError('File must be file or string instance.')

    try:
        if not name:
            name = fl.name
        full_path = _get_name(name, prefix)
        s3 = S3Storage(bucket_name=bucket_name, key=key, secret=secret,
                       host=host, policy=policy, replace=replace)
        s3.save(full_path, fl)
    finally:
        # Only close handles created in this function; a file object passed
        # in by the caller stays under the caller's control.
        if opened_here:
            fl.close()

    return s3.url(full_path, expires, query_auth, force_http)
def handler(event, context):
    """Train a linear model with ADMM, synchronising workers through DynamoDB.

    Reads from ``event``: ``file``, ``data_bucket``, ``dataset_type`` (must
    be ``"dense_libsvm"``), ``n_features``, ``n_classes``, ``n_workers``,
    ``worker_index``, ``tmp_table_name``, ``merged_table_name``, ``key_col``,
    ``model``, ``optim``, ``sync_mode``, ``lr``, ``batch_size``,
    ``n_epochs``, ``valid_ratio``, ``n_admm_epochs``, ``lambda`` and ``rho``.
    ``context`` is not used.

    Each ADMM epoch runs ``n_epochs`` local SGD epochs (validating after
    each one), then merges ``u``, the weights and the bias across workers
    through the communicator and updates ``z`` and ``u``. Progress is
    printed; nothing is returned.
    """
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    assert dataset_type == "dense_libsvm"
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    tmp_table_name = event['tmp_table_name']
    merged_table_name = event['merged_table_name']
    key_col = event['key_col']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Linear_Models
    assert optim.lower() == Optimization.ADMM
    assert sync_mode.lower() in [
        Synchronization.Reduce, Synchronization.Reduce_Scatter
    ]

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']
    n_admm_epochs = event['n_admm_epochs']
    lam = event['lambda']
    rho = event['rho']

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    dynamo_client = dynamo_operator.get_client()
    tmp_table = DynamoTable(dynamo_client, tmp_table_name)
    merged_table = DynamoTable(dynamo_client, merged_table_name)
    communicator = DynamoCommunicator(dynamo_client, tmp_table, merged_table,
                                      key_col, n_workers, worker_index)

    # Read file from s3
    read_start = time.time()
    lines = s3_storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    shuffle_dataset = True
    random_seed = 100
    if shuffle_dataset:
        # Fixed seed: the shuffle (and so the split) is reproducible.
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    n_train_batch = len(train_loader)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = linear_models.get_model(model_name, n_features, n_classes)

    z, u = initialize_z_and_u(model.linear.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    train_start = time.time()
    for admm_epoch in range(n_admm_epochs):
        print(">>> ADMM Epoch[{}]".format(admm_epoch))
        admm_epoch_start = time.time()
        admm_epoch_cal_time = 0
        admm_epoch_comm_time = 0
        admm_epoch_test_time = 0

        # u and z are only rebound at the end of an ADMM epoch, so their
        # difference is constant for all local epochs and batches below.
        u_z = torch.from_numpy(u) - torch.from_numpy(z)

        for epoch in range(n_epochs):
            epoch_start = time.time()
            epoch_loss = 0.

            for batch_index, (items, labels) in enumerate(train_loader):
                items = Variable(items.view(-1, n_features))
                labels = Variable(labels)

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                outputs = model(items)
                classify_loss = criterion(outputs, labels)
                epoch_loss += classify_loss.item()

                # Add the rho / 2 * ||w + u - z|| term for every weight
                # parameter on top of the classification loss.
                loss = classify_loss
                for name, param in model.named_parameters():
                    if name.split('.')[-1] == "weight":
                        loss += rho / 2.0 * torch.norm(param + u_z, p=2)

                loss.backward(retain_graph=True)
                optimizer.step()

            epoch_cal_time = time.time() - epoch_start
            admm_epoch_cal_time += epoch_cal_time

            # Test the Model
            test_start = time.time()
            n_test_correct = 0
            n_test = 0
            test_loss = 0
            for items, labels in validation_loader:
                items = Variable(items.view(-1, n_features))
                labels = Variable(labels)
                outputs = model(items)
                test_loss += criterion(outputs, labels).item()
                _, predicted = torch.max(outputs.data, 1)
                n_test += labels.size(0)
                n_test_correct += (predicted == labels).sum()

            epoch_test_time = time.time() - test_start
            admm_epoch_test_time += epoch_test_time

            print(
                'Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                'cal cost %.4f s, test cost %.4f s, accuracy of the model on the %d test samples: %d %%, loss = %f'
                % (epoch + 1, n_epochs, batch_index + 1, n_train_batch,
                   time.time() - train_start, epoch_loss,
                   time.time() - epoch_start, epoch_cal_time, epoch_test_time,
                   n_test, 100. * n_test_correct / n_test, test_loss / n_test))

        sync_start = time.time()
        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        u_shape = u.shape

        # Pack u, w and b into one flat vector so a single reduce merges all.
        w_b = np.concatenate((w.flatten(), b.flatten()))
        u_w_b = np.concatenate((u.flatten(), w_b.flatten()))

        # admm does not support async
        # NOTE(review): the assert above compares sync_mode.lower() while
        # these branches compare the raw value - confirm callers always send
        # lower-case values, otherwise u_w_b_merge is never assigned.
        if sync_mode == "reduce":
            u_w_b_merge = communicator.reduce_epoch(u_w_b, admm_epoch)
        elif sync_mode == "reduce_scatter":
            u_w_b_merge = communicator.reduce_scatter_epoch(u_w_b, admm_epoch)

        # Unpack the merged vector and divide by the number of workers.
        u_size = u_shape[0] * u_shape[1]
        w_size = w_shape[0] * w_shape[1]
        u_mean = u_w_b_merge[:u_size].reshape(u_shape) / float(n_workers)
        w_mean = u_w_b_merge[u_size:u_size + w_size]\
            .reshape(w_shape) / float(n_workers)
        b_mean = u_w_b_merge[u_size + w_size:]\
            .reshape(b_shape[0]) / float(n_workers)
        model.linear.weight.data = torch.from_numpy(w_mean)
        model.linear.bias.data = torch.from_numpy(b_mean)
        admm_epoch_comm_time += time.time() - sync_start

        if worker_index == 0:
            delete_start = time.time()
            communicator.delete_expired_epoch(admm_epoch)
            admm_epoch_comm_time += time.time() - delete_start

        # z = num_workers * rho / (2 * lam + num_workers * rho) * (w + u_mean)
        z = update_z(w_mean, u_mean, rho, n_workers, lam)
        u = u + model.linear.weight.data.numpy() - z

        print(
            "ADMM Epoch[{}] finishes, cost {} s, cal cost {} s, sync cost {} s, test cost {} s"
            .format(admm_epoch,
                    time.time() - admm_epoch_start, admm_epoch_cal_time,
                    admm_epoch_comm_time, admm_epoch_test_time))

    # Test the Model
    n_test_correct = 0
    n_test = 0
    test_loss = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, n_features))
        labels = Variable(labels)
        outputs = model(items)
        test_loss += criterion(outputs, labels).item()
        _, predicted = torch.max(outputs.data, 1)
        n_test += labels.size(0)
        n_test_correct += (predicted == labels).sum()

    print(
        'Train finish, time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
        % (time.time() - train_start, n_test,
           100. * n_test_correct / n_test, test_loss / n_test))

    if worker_index == 0:
        # The intermediate state lives in the two DynamoDB tables, so they
        # are what has to be emptied; the names are table names, not S3
        # buckets, so they must not be handed to the S3 storage.
        tmp_table.clear(key_col)
        merged_table.clear(key_col)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def get_storage(config, verify=True):
    """Return the storage backend selected by ``config.USE_S3``.

    A truthy ``config.USE_S3`` gives ``S3Storage(config, verify=verify)``;
    otherwise a ``LocalStorage()`` is returned and ``verify`` is ignored.
    """
    if config.USE_S3:
        return S3Storage(config, verify=verify)
    return LocalStorage()
def handler(event, context):
    """Run distributed k-means, synchronising workers through DynamoDB.

    Reads from ``event``: ``data_bucket``, ``file``, ``dataset_type`` (must
    be ``"dense_libsvm"``), ``n_features``, ``n_workers``, ``worker_index``,
    ``tmp_table_name``, ``merged_table_name``, ``key_col``, ``n_clusters``,
    ``n_epochs``, ``threshold`` and ``sync_mode``. ``context`` is not used.

    Worker 0 publishes the initial centroids (the first ``n_clusters`` rows
    of its dataset) to the merged table; the other workers wait for them.
    Every epoch each worker assigns its points, then the local centroids and
    error are merged through the communicator and divided by ``n_workers``.
    Training stops early once the merged error drops below ``threshold``.
    Progress is printed; the function returns ``None``.
    """
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    assert dataset_type == "dense_libsvm"
    n_features = event['n_features']
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    tmp_table_name = event['tmp_table_name']
    merged_table_name = event['merged_table_name']
    key_col = event['key_col']

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    assert sync_mode.lower() in [
        Synchronization.Reduce, Synchronization.Reduce_Scatter
    ]

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('sync mode = {}'.format(sync_mode))

    # S3 is only used to read the input; all synchronisation state goes
    # through the two DynamoDB tables wrapped by the communicator.
    s3_storage = S3Storage()
    dynamo_client = dynamo_operator.get_client()
    tmp_table = DynamoTable(dynamo_client, tmp_table_name)
    merged_table = DynamoTable(dynamo_client, merged_table_name)
    communicator = DynamoCommunicator(dynamo_client, tmp_table, merged_table,
                                      key_col, n_workers, worker_index)

    # Reading data from S3
    read_start = time.time()
    lines = s3_storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type).ins_np
    data_type = dataset.dtype
    centroid_shape = (n_clusters, dataset.shape[1])
    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, dtype: {}, Centroids shape: {}, num_features: {}".
          format(dataset_type, data_type, centroid_shape, n_features))

    init_centroids_start = time.time()
    if worker_index == 0:
        # Worker 0 seeds the centroids with its first n_clusters rows and
        # publishes them for everybody else.
        centroids = dataset[0:n_clusters]
        merged_table.save(centroids.tobytes(), Prefix.KMeans_Init_Cent + "-1",
                          key_col)
    else:
        # Other workers wait for the entry written by worker 0.
        # NOTE(review): 0.1 is presumably the polling interval in seconds -
        # confirm against DynamoTable.load_or_wait.
        centroid_bytes = (merged_table.load_or_wait(
            Prefix.KMeans_Init_Cent + "-1", key_col, 0.1))['value'].value
        centroids = centroid_bytes2np(centroid_bytes, n_clusters, data_type)
        if centroid_shape != centroids.shape:
            raise Exception("The shape of centroids does not match.")
    print("initialize centroids takes {} s".format(time.time() -
                                                   init_centroids_start))

    model = cluster_models.get_model(dataset, centroids, dataset_type,
                                     n_features, n_clusters)

    train_start = time.time()
    for epoch in range(n_epochs):
        epoch_start = time.time()

        # rearrange data points
        model.find_nearest_cluster()

        # Flatten the local centroids and append the local error as the
        # last element so both travel in a single vector.
        local_cent = model.get_centroids("numpy").reshape(-1)
        local_cent_error = np.concatenate(
            (local_cent.flatten(), np.array([model.error], dtype=np.float32)))
        epoch_cal_time = time.time() - epoch_start

        # sync local centroids and error
        epoch_comm_start = time.time()
        # NOTE(review): the assert above compares sync_mode.lower() while
        # these branches compare the raw value - confirm callers always send
        # lower-case values, otherwise cent_error_merge is never assigned.
        if sync_mode == "reduce":
            cent_error_merge = communicator.reduce_epoch(
                local_cent_error, epoch)
        elif sync_mode == "reduce_scatter":
            cent_error_merge = communicator.reduce_scatter_epoch(
                local_cent_error, epoch)

        # Split the merged vector back into centroids and error and divide
        # both by the number of workers.
        cent_merge = cent_error_merge[:-1].reshape(centroid_shape) / float(
            n_workers)
        error_merge = cent_error_merge[-1] / float(n_workers)

        model.centroids = cent_merge
        model.error = error_merge
        epoch_comm_time = time.time() - epoch_comm_start

        print("one {} round cost {} s".format(sync_mode, epoch_comm_time))
        print(
            "Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
            .format(epoch, worker_index, model.error,
                    time.time() - epoch_start, epoch_cal_time,
                    epoch_comm_time))

        # Stop as soon as the merged error is below the threshold.
        if model.error < threshold:
            break

    # Worker 0 empties both tables once training is over.
    if worker_index == 0:
        tmp_table.clear(key_col)
        merged_table.clear(key_col)

    print("Worker[{}] finishes training: Error = {}, cost {} s".format(
        worker_index, model.error,
        time.time() - train_start))
    return
def handler(event, context):
    """Train a linear model with gradient or model averaging over DynamoDB.

    Reads from ``event``: ``file``, ``data_bucket``, ``dataset_type`` (must
    be ``"dense_libsvm"``), ``n_features``, ``n_classes``, ``n_workers``,
    ``worker_index``, ``tmp_table_name``, ``merged_table_name``, ``key_col``,
    ``model``, ``optim``, ``sync_mode``, ``lr``, ``batch_size``,
    ``n_epochs`` and ``valid_ratio``. ``context`` is not used.

    With ``optim == "grad_avg"`` gradients (or, in ``async`` mode, the
    stepped weights) are exchanged after every batch; with
    ``optim == "model_avg"`` the weights are exchanged once per epoch. The
    model is validated after every epoch. Progress is printed; nothing is
    returned.
    """
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    assert dataset_type == "dense_libsvm"
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    tmp_table_name = event['tmp_table_name']
    merged_table_name = event['merged_table_name']
    key_col = event['key_col']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Linear_Models
    assert optim.lower() in Optimization.All
    assert sync_mode.lower() in Synchronization.All

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']

    shuffle_dataset = True
    random_seed = 100

    print('bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    dynamo_client = dynamo_operator.get_client()
    tmp_table = DynamoTable(dynamo_client, tmp_table_name)
    merged_table = DynamoTable(dynamo_client, merged_table_name)
    communicator = DynamoCommunicator(dynamo_client, tmp_table, merged_table,
                                      key_col, n_workers, worker_index)

    # Read file from s3
    read_start = time.time()
    lines = s3_storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    if shuffle_dataset:
        # Fixed seed: the shuffle (and so the split) is reproducible.
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    n_train_batch = len(train_loader)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = linear_models.get_model(model_name, n_features, n_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_start = time.time()
    # Training the Model
    for epoch in range(n_epochs):
        epoch_start = time.time()
        epoch_cal_time = 0
        epoch_comm_time = 0
        epoch_loss = 0

        for batch_idx, (items, labels) in enumerate(train_loader):
            batch_start = time.time()
            items = Variable(items.view(-1, n_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            epoch_loss += loss.item()
            loss.backward()

            if optim == "grad_avg":
                if sync_mode == "reduce" or sync_mode == "reduce_scatter":
                    # Pack weight and bias gradients into one flat vector.
                    w_grad = model.linear.weight.grad.data.numpy()
                    w_grad_shape = w_grad.shape
                    b_grad = model.linear.bias.grad.data.numpy()
                    b_grad_shape = b_grad.shape
                    w_b_grad = np.concatenate(
                        (w_grad.flatten(), b_grad.flatten()))
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time

                    batch_comm_start = time.time()
                    if sync_mode == "reduce":
                        w_b_grad_merge = communicator.reduce_batch(
                            w_b_grad, epoch, batch_idx)
                    elif sync_mode == "reduce_scatter":
                        w_b_grad_merge = communicator.reduce_scatter_batch(
                            w_b_grad, epoch, batch_idx)

                    # Unpack and divide by the number of workers.
                    w_grad_merge = w_b_grad_merge[:w_grad_shape[0] * w_grad_shape[1]]\
                        .reshape(w_grad_shape) / float(n_workers)
                    b_grad_merge = w_b_grad_merge[w_grad_shape[0] * w_grad_shape[1]:]\
                        .reshape(b_grad_shape[0]) / float(n_workers)

                    model.linear.weight.grad = Variable(
                        torch.from_numpy(w_grad_merge))
                    model.linear.bias.grad = Variable(
                        torch.from_numpy(b_grad_merge))
                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(
                        sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time
                elif sync_mode == "async":
                    # async does step before sync
                    optimizer.step()
                    w = model.linear.weight.data.numpy()
                    w_shape = w.shape
                    b = model.linear.bias.data.numpy()
                    b_shape = b.shape
                    w_b = np.concatenate((w.flatten(), b.flatten()))
                    # Per-batch compute time is measured from the start of
                    # this batch (as in the synchronous branch); measuring
                    # from the epoch start would re-count earlier batches.
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time

                    batch_comm_start = time.time()
                    # init model
                    if worker_index == 0 and epoch == 0 and batch_idx == 0:
                        merged_table.save(w_b.tobytes(), Prefix.w_b_prefix,
                                          key_col)
                    w_b_merge = communicator.async_reduce(
                        w_b, Prefix.w_b_prefix)
                    # do not need average
                    w_merge = w_b_merge[:w_shape[0] * w_shape[1]].reshape(w_shape)
                    b_merge = w_b_merge[w_shape[0] * w_shape[1]:].reshape(
                        b_shape[0])
                    model.linear.weight.data = torch.from_numpy(w_merge)
                    model.linear.bias.data = torch.from_numpy(b_merge)
                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(
                        sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time

            # NOTE(review): with optim == "model_avg" and sync_mode ==
            # "async" no optimizer.step() is executed anywhere - confirm
            # whether that combination is meant to be supported.
            if sync_mode != "async":
                step_start = time.time()
                optimizer.step()
                epoch_cal_time += time.time() - step_start

            if batch_idx % 10 == 0:
                print(
                    "Epoch: [%d/%d], Step: [%d/%d], Time: %.4f s, Loss: %.4f, batch cost %.4f s"
                    % (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                       time.time() - train_start, loss.item(),
                       time.time() - batch_start))

        if optim == "model_avg":
            w = model.linear.weight.data.numpy()
            w_shape = w.shape
            b = model.linear.bias.data.numpy()
            b_shape = b.shape
            w_b = np.concatenate((w.flatten(), b.flatten()))
            # NOTE(review): the per-batch step time was already added above,
            # so it is counted twice here - confirm this is intended.
            epoch_cal_time += time.time() - epoch_start

            epoch_comm_start = time.time()
            if sync_mode == "reduce":
                w_b_merge = communicator.reduce_epoch(w_b, epoch)
            elif sync_mode == "reduce_scatter":
                w_b_merge = communicator.reduce_scatter_epoch(w_b, epoch)
            elif sync_mode == "async":
                if worker_index == 0 and epoch == 0:
                    merged_table.save(w_b.tobytes(), Prefix.w_b_prefix,
                                      key_col)
                w_b_merge = communicator.async_reduce(w_b, Prefix.w_b_prefix)

            w_merge = w_b_merge[:w_shape[0] * w_shape[1]].reshape(w_shape)
            b_merge = w_b_merge[w_shape[0] * w_shape[1]:].reshape(b_shape[0])
            # Only the synchronous reductions are divided by n_workers.
            if sync_mode == "reduce" or sync_mode == "reduce_scatter":
                w_merge = w_merge / float(n_workers)
                b_merge = b_merge / float(n_workers)
            model.linear.weight.data = torch.from_numpy(w_merge)
            model.linear.bias.data = torch.from_numpy(b_merge)
            print("one {} round cost {} s".format(
                sync_mode, time.time() - epoch_comm_start))
            epoch_comm_time += time.time() - epoch_comm_start

        if worker_index == 0:
            delete_start = time.time()
            # model avg delete by epoch
            if optim == "model_avg" and sync_mode != "async":
                communicator.delete_expired_epoch(epoch)
            elif optim == "grad_avg" and sync_mode != "async":
                communicator.delete_expired_batch(epoch, batch_idx)
            epoch_comm_time += time.time() - delete_start

        # Test the Model
        test_start = time.time()
        n_test_correct = 0
        n_test = 0
        test_loss = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, n_features))
            labels = Variable(labels)
            outputs = model(items)
            # .item() keeps the accumulator a plain Python number.
            test_loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1)
            n_test += labels.size(0)
            n_test_correct += (predicted == labels).sum()
        test_time = time.time() - test_start

        print(
            'Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f: '
            'calculation cost = %.4f s, communication cost %.4f s, test cost %.4f s, '
            'accuracy of the model on the %d test samples: %d %%, loss = %f'
            % (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
               time.time() - train_start, epoch_loss,
               time.time() - epoch_start, epoch_cal_time, epoch_comm_time,
               test_time, n_test, 100. * n_test_correct / n_test,
               test_loss / n_test))

    # Worker 0 empties both tables once training is over.
    if worker_index == 0:
        tmp_table.clear(key_col)
        merged_table.clear(key_col)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    """Train a deep model for one round of epochs, then chain the next round.

    Reads from ``event``: ``train_file``, ``test_file``, ``data_bucket``,
    ``n_features``, ``n_classes``, ``n_workers``, ``worker_index``,
    ``tmp_bucket``, ``merged_bucket``, ``cp_bucket``, ``model``, ``optim``,
    ``sync_mode``, ``lr``, ``batch_size``, ``n_epochs``, ``start_epoch``,
    ``run_epochs`` and ``function_name``. ``context`` is not used.

    The data files are downloaded from S3 to ``/tmp``; when ``start_epoch``
    is not 0 the checkpoint of the previous epoch is loaded first. Epochs
    ``start_epoch`` up to ``min(start_epoch + run_epochs, n_epochs)`` are
    run. If epochs remain afterwards, worker 0 uploads a checkpoint and the
    function named ``function_name`` is invoked asynchronously with the same
    payload and an advanced ``start_epoch``. Progress is printed; nothing is
    returned.
    """
    start_time = time.time()

    # dataset setting
    train_file = event['train_file']
    test_file = event['test_file']
    data_bucket = event['data_bucket']
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    cp_bucket = event['cp_bucket']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Deep_Models
    assert optim.lower() in Optimization.All
    assert sync_mode.lower() in Synchronization.All

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    start_epoch = event['start_epoch']
    run_epochs = event['run_epochs']

    function_name = event['function_name']

    print('data bucket = {}'.format(data_bucket))
    print("train file = {}".format(train_file))
    print("test file = {}".format(test_file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))
    print('start epoch = {}'.format(start_epoch))
    print('run epochs = {}'.format(run_epochs))

    print("Run function {}, round: {}/{}, epoch: {}/{} to {}/{}"
          .format(function_name, int(start_epoch/run_epochs) + 1, math.ceil(n_epochs / run_epochs),
                  start_epoch + 1, n_epochs, start_epoch + run_epochs, n_epochs))

    storage = S3Storage()
    communicator = S3Communicator(storage, tmp_bucket, merged_bucket, n_workers, worker_index)

    # download file from s3
    local_dir = "/tmp"
    read_start = time.time()
    storage.download(data_bucket, train_file, os.path.join(local_dir, train_file))
    storage.download(data_bucket, test_file, os.path.join(local_dir, test_file))
    print("download file from s3 cost {} s".format(time.time() - read_start))

    # NOTE(review): torch.load() unpickles the downloaded files, so the
    # contents of data_bucket must be trusted.
    train_set = torch.load(os.path.join(local_dir, train_file))
    test_set = torch.load(os.path.join(local_dir, test_file))
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=100, shuffle=False)
    # NOTE(review): `classes` is not read anywhere in this function.
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

    print("read data cost {} s".format(time.time() - read_start))

    random_seed = 100
    torch.manual_seed(random_seed)

    device = 'cpu'
    net = deep_models.get_models(model_name).to(device)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)

    # load checkpoint model if it is not the first round
    if start_epoch != 0:
        # The checkpoint is named after the last epoch of the previous round.
        checked_file = 'checkpoint_{}.pt'.format(start_epoch - 1)
        storage.download(cp_bucket, checked_file, os.path.join(local_dir, checked_file))
        checkpoint_model = torch.load(os.path.join(local_dir, checked_file))

        net.load_state_dict(checkpoint_model['model_state_dict'])
        optimizer.load_state_dict(checkpoint_model['optimizer_state_dict'])
        print("load checkpoint model at epoch {}".format(start_epoch - 1))

    # Run at most run_epochs epochs, never past n_epochs.
    for epoch in range(start_epoch, min(start_epoch + run_epochs, n_epochs)):

        train_loss, train_acc = train_one_epoch(epoch, net, train_loader, optimizer, worker_index,
                                                communicator, optim, sync_mode)
        test_loss, test_acc = test(epoch, net, test_loader)

        print('Epoch: {}/{},'.format(epoch + 1, n_epochs),
              'train loss: {}'.format(train_loss),
              'train acc: {},'.format(train_acc),
              'test loss: {}'.format(test_loss),
              'test acc: {}.'.format(test_acc), )

    # Worker 0 empties the two synchronisation buckets after the round.
    if worker_index == 0:
        storage.clear(tmp_bucket)
        storage.clear(merged_bucket)

    # training is not finished yet, invoke next round
    # NOTE(review): `epoch` is the loop variable of the loop above; it is
    # unbound here if that loop ran zero times - confirm callers never pass
    # start_epoch >= n_epochs or run_epochs <= 0.
    if epoch < n_epochs - 1:
        checkpoint_model = {
            'epoch': epoch,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss.average
        }

        checked_file = 'checkpoint_{}.pt'.format(epoch)

        # Only worker 0 writes and uploads the checkpoint.
        if worker_index == 0:
            torch.save(checkpoint_model, os.path.join(local_dir, checked_file))
            storage.upload_file(cp_bucket, checked_file, os.path.join(local_dir, checked_file))
            print("checkpoint model at epoch {} saved!".format(epoch))

        print("Invoking the next round of functions. round: {}/{}, start epoch: {}, run epoch: {}"
              .format(int((epoch + 1) / run_epochs) + 1, math.ceil(n_epochs / run_epochs),
                      epoch + 1, run_epochs))
        lambda_client = boto3.client('lambda')
        # Same settings as this invocation; only start_epoch advances.
        payload = {
            'train_file': event['train_file'],
            'test_file': event['test_file'],
            'data_bucket': event['data_bucket'],
            'n_features': event['n_features'],
            'n_classes': event['n_classes'],
            'n_workers': event['n_workers'],
            'worker_index': event['worker_index'],
            'tmp_bucket': event['tmp_bucket'],
            'merged_bucket': event['merged_bucket'],
            'cp_bucket': event['cp_bucket'],
            'model': event['model'],
            'optim': event['optim'],
            'sync_mode': event['sync_mode'],
            'lr': event['lr'],
            'batch_size': event['batch_size'],
            'n_epochs': event['n_epochs'],
            'start_epoch': epoch + 1,
            'run_epochs': event['run_epochs'],
            'function_name': event['function_name']
        }
        # InvocationType='Event' requests an asynchronous invocation, so
        # this call does not wait for the next round to finish.
        lambda_client.invoke(FunctionName=function_name,
                             InvocationType='Event',
                             Payload=json.dumps(payload))

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    """Lambda entry point: ADMM training of a sparse linear model.

    Every worker loads one libsvm shard from S3, runs ``n_epochs`` local
    epochs per ADMM round, and then averages its ``u`` / weight / bias
    vectors with the other workers through the Memcached communicator.

    Keys read from ``event``: ``file``, ``data_bucket``, ``dataset_type``,
    ``n_features``, ``n_classes``, ``n_workers``, ``worker_index``, ``host``,
    ``port``, ``tmp_bucket``, ``merged_bucket``, ``model``, ``optim``,
    ``sync_mode``, ``lr``, ``batch_size``, ``n_epochs``, ``valid_ratio``,
    ``n_admm_epochs``, ``lambda`` and ``rho``.

    Raises:
        AssertionError: if model / optim / sync_mode are not supported here.
        ValueError: if ``sync_mode`` passes validation but matches neither
            "reduce" nor "reduce_scatter".
    """
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    host = event['host']
    port = event['port']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Sparse_Linear_Models
    assert optim.lower() == Optimization.ADMM
    assert sync_mode.lower() in [
        Synchronization.Reduce, Synchronization.Reduce_Scatter
    ]
    # Validation above is case-insensitive, so dispatch below must be too;
    # the raw value is kept for log output only.
    sync_key = sync_mode.lower()

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']
    n_admm_epochs = event['n_admm_epochs']
    lam = event['lambda']
    rho = event['rho']

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    mem_storage = MemcachedStorage(host, port)
    communicator = MemcachedCommunicator(mem_storage, tmp_bucket,
                                         merged_bucket, n_workers,
                                         worker_index)
    if worker_index == 0:
        mem_storage.clear()

    # Read file from s3
    read_start = time.time()
    lines = s3_storage.load(file,
                            data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    shuffle_dataset = True
    random_seed = 100
    if shuffle_dataset:
        # Fixed seed: every invocation produces the same permutation.
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # split train set and test set
    train_set = [dataset[i] for i in train_indices]
    n_train_batch = math.floor(len(train_set) / batch_size)
    val_set = [dataset[i] for i in val_indices]
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = linear_models.get_sparse_model(model_name, train_set, val_set,
                                           n_features, n_epochs,
                                           learning_rate, batch_size)

    z, u = initialize_z_and_u(model.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

    # Training the Model
    train_start = time.time()
    for admm_epoch in range(n_admm_epochs):
        print(">>> ADMM Epoch[{}]".format(admm_epoch + 1))
        admm_epoch_start = time.time()
        admm_epoch_cal_time = 0
        admm_epoch_comm_time = 0
        admm_epoch_test_time = 0

        for epoch in range(n_epochs):
            epoch_start = time.time()
            epoch_loss = 0.

            for batch_idx in range(n_train_batch):
                batch_start = time.time()
                batch_loss, batch_acc = model.one_batch()

                # Extra step on the weights from the ADMM penalty term:
                # weight -= lr * rho * (weight + u - z).
                u_z = torch.from_numpy(u) - torch.from_numpy(z)
                new_grad = torch.add(model.weight, u_z).mul(rho)
                new_grad.mul_(-1.0 * learning_rate)
                model.weight.add_(new_grad)

                batch_loss = batch_loss.average + rho / 2.0 * torch.norm(
                    model.weight + u_z, p=2).item()
                epoch_loss += batch_loss

                if batch_idx % 10 == 0:
                    print(
                        "ADMM Epoch: [{}/{}], Epoch: [{}/{}], Batch: [{}/{}], "
                        "time: {:.4f} s, batch cost {:.4f} s, loss: {}, accuracy: {}"
                        .format(admm_epoch + 1, n_admm_epochs, epoch + 1,
                                n_epochs, batch_idx + 1, n_train_batch,
                                time.time() - train_start,
                                time.time() - batch_start, batch_loss,
                                batch_acc))

            epoch_cal_time = time.time() - epoch_start
            admm_epoch_cal_time += epoch_cal_time

            # Test the Model
            test_start = time.time()
            test_loss, test_acc = model.evaluate()
            epoch_test_time = time.time() - test_start
            admm_epoch_test_time += epoch_test_time

            # After a complete pass batch_idx + 1 == n_train_batch; using
            # n_train_batch directly also works when the shard yields zero
            # batches (the loop variable would be unbound in that case).
            print(
                "ADMM Epoch: [{}/{}] Epoch: [{}/{}] finishes, Batch: [{}/{}], "
                "Time: {:.4f}, Loss: {:.4f}, epoch cost {:.4f} s, "
                "calculation cost = {:.4f} s, test cost {:.4f} s, "
                "accuracy of the model on the {} test samples: {}, loss = {}".
                format(admm_epoch + 1, n_admm_epochs, epoch + 1, n_epochs,
                       n_train_batch, n_train_batch,
                       time.time() - train_start, epoch_loss,
                       time.time() - epoch_start, epoch_cal_time,
                       epoch_test_time, len(val_set), test_acc, test_loss))

        # Pack u, weight and bias into one flat vector so that a single
        # reduction exchanges all three.
        sync_start = time.time()
        w = model.weight.numpy()
        w_shape = w.shape
        b = np.array([model.bias], dtype=np.float32)
        b_shape = b.shape
        u_shape = u.shape

        w_b = np.concatenate((w.flatten(), b.flatten()))
        u_w_b = np.concatenate((u.flatten(), w_b.flatten()))

        # admm does not support async
        if sync_key == "reduce":
            u_w_b_merge = communicator.reduce_epoch(u_w_b, admm_epoch)
        elif sync_key == "reduce_scatter":
            u_w_b_merge = communicator.reduce_scatter_epoch(u_w_b, admm_epoch)
        else:
            raise ValueError(
                "unsupported sync mode for ADMM: {}".format(sync_mode))

        # Unpack in the order used for packing and divide by the worker
        # count (presumably the communicator returns the element-wise sum;
        # verify against MemcachedCommunicator).
        u_len = u_shape[0] * u_shape[1]
        w_len = w_shape[0] * w_shape[1]
        u_mean = u_w_b_merge[:u_len].reshape(u_shape) / float(n_workers)
        w_mean = u_w_b_merge[u_len:u_len + w_len].reshape(w_shape) / float(
            n_workers)
        b_mean = u_w_b_merge[u_len + w_len:].reshape(
            b_shape[0]) / float(n_workers)
        model.weight = torch.from_numpy(w_mean)
        model.bias = torch.from_numpy(b_mean)
        admm_epoch_comm_time += time.time() - sync_start
        print("one {} round cost {} s".format(sync_mode,
                                              admm_epoch_comm_time))

        if worker_index == 0:
            delete_start = time.time()
            communicator.delete_expired_epoch(admm_epoch)
            admm_epoch_comm_time += time.time() - delete_start

        # Author's note on the intended closed form:
        # z = num_workers * rho / (2 * lam + num_workers * rho) * (w + u_mean)
        # A residual-based stopping check (update_z_u / check_stop) was
        # sketched in the original but is not active.
        z = update_z(w_mean, u_mean, rho, n_workers, lam)
        u = u + model.weight.data.numpy() - z

        # Report the 1-based epoch, matching the ">>> ADMM Epoch" header.
        print(
            "ADMM Epoch[{}] finishes, cost {} s, cal cost {} s, comm cost {} s, test cost {} s"
            .format(admm_epoch + 1,
                    time.time() - admm_epoch_start, admm_epoch_cal_time,
                    admm_epoch_comm_time, admm_epoch_test_time))

    # Test the Model
    test_loss, test_acc = model.evaluate()
    print(
        "Train finish, cost {} s, accuracy of the model on the {} test samples = {}, loss = {}"
        .format(time.time() - train_start, len(val_set), test_acc, test_loss))

    if worker_index == 0:
        mem_storage.clear()

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def handler(event, context):
    """Lambda entry point: reset shared state and launch the CIFAR-10 workers.

    Clears the checkpoint bucket and both DynamoDB parameter tables, then
    asynchronously invokes ``function_name`` once per worker with a payload
    describing the training job. ``event`` and ``context`` are not read; all
    settings are the constants below.
    """
    function_name = "lambda_core"

    # dataset setting
    dataset_name = 'cifar10'
    data_bucket = "cifar10dataset"
    n_features = 32 * 32
    n_classes = 10
    tmp_table_name = "tmp-params"
    merged_table_name = "merged-params"
    cp_bucket = "cp-model"
    key_col = "key"

    # training setting
    model = "mobilenet"  # mobilenet or resnet
    optim = "grad_avg"  # grad_avg or model_avg
    sync_mode = "reduce"  # async, reduce or reduce_scatter
    n_workers = 10

    # hyper-parameters
    lr = 0.01
    batch_size = 256
    n_epochs = 5
    start_epoch = 0
    run_epochs = 3

    # clear the checkpoint bucket and both dynamodb tables
    s3_storage = S3Storage()
    s3_storage.clear(cp_bucket)
    dynamo_client = dynamo_operator.get_client()
    tmp_tb = DynamoTable(dynamo_client, tmp_table_name)
    # Bug fix: this handle used to be built from tmp_table_name as well, so
    # the merged-params table was never cleared between runs.
    merged_tb = DynamoTable(dynamo_client, merged_table_name)
    tmp_tb.clear(key_col)
    merged_tb.clear(key_col)

    # lambda payload shared by every worker
    payload = dict()
    payload['dataset'] = dataset_name
    payload['data_bucket'] = data_bucket
    payload['n_features'] = n_features
    payload['n_classes'] = n_classes
    payload['n_workers'] = n_workers
    payload['tmp_table_name'] = tmp_table_name
    payload['merged_table_name'] = merged_table_name
    payload['key_col'] = key_col
    payload['cp_bucket'] = cp_bucket
    payload['model'] = model
    payload['optim'] = optim
    payload['sync_mode'] = sync_mode
    payload['lr'] = lr
    payload['batch_size'] = batch_size
    payload['n_epochs'] = n_epochs
    payload['start_epoch'] = start_epoch
    payload['run_epochs'] = run_epochs
    payload['function_name'] = function_name

    # invoke functions
    lambda_client = boto3.client('lambda')
    for i in range(n_workers):
        # Only these three entries differ per worker; the dict is serialized
        # on every iteration, so reusing it is safe.
        payload['worker_index'] = i
        payload['train_file'] = 'training_{}.pt'.format(i)
        payload['test_file'] = 'test.pt'
        # InvocationType='Event' requests an asynchronous invocation.
        lambda_client.invoke(FunctionName=function_name,
                             InvocationType='Event',
                             Payload=json.dumps(payload))
def handler(event, context):
    """Lambda entry point: distributed k-means over a dense libsvm shard.

    Worker 0 publishes the first ``n_clusters`` rows of its shard as the
    initial centroids through Memcached; the other workers wait for them.
    Each epoch every worker reassigns its points, then the local centroids
    and error are merged across workers and divided by ``n_workers``.
    Training stops early once the merged error drops below ``threshold``.

    Keys read from ``event``: ``data_bucket``, ``file``, ``dataset_type``,
    ``n_features``, ``host``, ``port``, ``tmp_bucket``, ``merged_bucket``,
    ``n_clusters``, ``n_epochs``, ``threshold``, ``sync_mode``,
    ``n_workers`` and ``worker_index``.

    Raises:
        AssertionError: if ``dataset_type`` is not "dense_libsvm" or
            ``sync_mode`` is not a supported reduce mode.
        ValueError: if ``sync_mode`` passes validation but matches neither
            "reduce" nor "reduce_scatter".
        Exception: if the centroids received from worker 0 have an
            unexpected shape.
    """
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    assert dataset_type == "dense_libsvm"
    n_features = event['n_features']
    host = event['host']
    port = event['port']
    tmp_bucket = event["tmp_bucket"]
    merged_bucket = event["merged_bucket"]

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    assert sync_mode.lower() in [
        Synchronization.Reduce, Synchronization.Reduce_Scatter
    ]
    # Validation is case-insensitive, so dispatch must be as well; the raw
    # value is kept for log output only.
    sync_key = sync_mode.lower()

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    mem_storage = MemcachedStorage(host, port)
    communicator = MemcachedCommunicator(mem_storage, tmp_bucket,
                                         merged_bucket, n_workers,
                                         worker_index)
    if worker_index == 0:
        mem_storage.clear()

    # Reading data from S3
    read_start = time.time()
    lines = s3_storage.load(file,
                            data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    # NOTE(review): the sparse branches below cannot run while the assert
    # above restricts dataset_type to "dense_libsvm"; they are kept as-is.
    if dataset_type == "dense_libsvm":
        dataset = dataset.ins_np
        data_type = dataset.dtype
        centroid_shape = (n_clusters, dataset.shape[1])
    elif dataset_type == "sparse_libsvm":
        dataset = dataset.ins_list
        first_entry = dataset[0].to_dense().numpy()
        data_type = first_entry.dtype
        centroid_shape = (n_clusters, first_entry.shape[1])
    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, dtype: {}, Centroids shape: {}, num_features: {}".
          format(dataset_type, data_type, centroid_shape, n_features))

    init_centroids_start = time.time()
    if worker_index == 0:
        # Worker 0 seeds the centroids from the head of its own shard and
        # publishes them for everyone else.
        if dataset_type == "dense_libsvm":
            centroids = dataset[0:n_clusters]
        elif dataset_type == "sparse_libsvm":
            centroids = sparse_centroid_to_numpy(dataset[0:n_clusters],
                                                 n_clusters)
        mem_storage.save_v2(centroids.tobytes(),
                            Prefix.KMeans_Init_Cent + "-1", merged_bucket)
        print("generate initial centroids takes {} s".format(
            time.time() - init_centroids_start))
    else:
        centroid_bytes = mem_storage.load_or_wait_v2(
            Prefix.KMeans_Init_Cent + "-1", merged_bucket)
        centroids = centroid_bytes2np(centroid_bytes, n_clusters, data_type)
        if centroid_shape != centroids.shape:
            raise Exception("The shape of centroids does not match.")
        print("Waiting for initial centroids takes {} s".format(
            time.time() - init_centroids_start))

    model = cluster_models.get_model(dataset, centroids, dataset_type,
                                     n_features, n_clusters)

    train_start = time.time()
    for epoch in range(n_epochs):
        epoch_start = time.time()

        # rearrange data points
        model.find_nearest_cluster()

        # Append the scalar error to the flattened centroids so that one
        # reduction carries both.
        local_cent = model.get_centroids("numpy").reshape(-1)
        local_cent_error = np.concatenate(
            (local_cent.flatten(), np.array([model.error])))
        epoch_cal_time = time.time() - epoch_start

        # sync local centroids and error
        epoch_sync_start = time.time()
        if sync_key == "reduce":
            cent_error_merge = communicator.reduce_epoch(
                local_cent_error, epoch)
        elif sync_key == "reduce_scatter":
            cent_error_merge = communicator.reduce_scatter_epoch(
                local_cent_error, epoch)
        else:
            raise ValueError(
                "unsupported sync mode for k-means: {}".format(sync_mode))

        cent_merge = cent_error_merge[:-1].reshape(centroid_shape) / float(
            n_workers)
        error_merge = cent_error_merge[-1] / float(n_workers)

        model.centroids = cent_merge
        model.error = error_merge
        print("one {} round cost {} s".format(sync_mode,
                                              time.time() - epoch_sync_start))
        epoch_sync_time = time.time() - epoch_sync_start

        print(
            "Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
            .format(epoch, worker_index, model.error,
                    time.time() - epoch_start, epoch_cal_time,
                    epoch_sync_time))

        if model.error < threshold:
            break

    # NOTE(review): the original has a disabled final cleanup here
    # (worker 0 calling mem_storage.clear()); it stays disabled.

    print("Worker[{}] finishes training: Error = {}, cost {} s".format(
        worker_index, model.error, time.time() - train_start))
    return
def handler(event, context):
    """Lambda entry point: sparse linear model trained with grad/model averaging.

    Every worker loads one sparse libsvm shard from S3 and trains locally.
    With ``optim == "grad_avg"`` the packed weight/bias vector is exchanged
    after every batch; with ``"model_avg"`` it is exchanged once per epoch.
    ``sync_mode`` selects reduce, reduce_scatter or async exchange through
    the Memcached communicator.

    Keys read from ``event``: ``file``, ``data_bucket``, ``dataset_type``,
    ``n_features``, ``n_classes``, ``n_workers``, ``worker_index``, ``host``,
    ``port``, ``tmp_bucket``, ``merged_bucket``, ``model``, ``optim``,
    ``sync_mode``, ``lr``, ``batch_size``, ``n_epochs`` and ``valid_ratio``.

    Raises:
        AssertionError: if dataset type, model, optim or sync_mode are not
            supported by this handler.
        ValueError: if ``sync_mode`` passes validation but is not one of
            "reduce", "reduce_scatter" or "async" during model averaging.
    """
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    assert dataset_type == "sparse_libsvm"
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    host = event['host']
    port = event['port']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Sparse_Linear_Models
    assert optim.lower() in [Optimization.Grad_Avg, Optimization.Model_Avg]
    assert sync_mode.lower() in Synchronization.All
    # Validation is case-insensitive, so every later comparison uses the
    # lowered keys; the raw values are kept for log output only.
    optim_key = optim.lower()
    sync_key = sync_mode.lower()

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']

    shuffle_dataset = True
    random_seed = 100

    print('bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    mem_storage = MemcachedStorage(host, port)
    communicator = MemcachedCommunicator(mem_storage, tmp_bucket,
                                         merged_bucket, n_workers,
                                         worker_index)
    if worker_index == 0:
        mem_storage.clear()

    # Read file from s3
    read_start = time.time()
    lines = s3_storage.load(file,
                            data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    if shuffle_dataset:
        # Fixed seed: every invocation produces the same permutation.
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # split train set and test set
    train_set = [dataset[i] for i in train_indices]
    n_train_batch = math.floor(len(train_set) / batch_size)
    val_set = [dataset[i] for i in val_indices]
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = linear_models.get_sparse_model(model_name, train_set, val_set,
                                           n_features, n_epochs,
                                           learning_rate, batch_size)

    train_start = time.time()
    # Training the Model
    for epoch in range(n_epochs):
        epoch_start = time.time()
        epoch_cal_time = 0
        epoch_comm_time = 0
        epoch_loss = 0.

        for batch_idx in range(n_train_batch):
            batch_start = time.time()
            batch_loss, batch_acc = model.one_batch()
            epoch_loss += batch_loss.average

            if optim_key == "grad_avg":
                if sync_key == "reduce" or sync_key == "reduce_scatter":
                    # Pack weight and bias into one flat float vector.
                    w_b = np.concatenate((model.weight.numpy().flatten(),
                                          np.array([model.bias],
                                                   dtype=np.float32)))
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time
                    print("batch cal cost {} s".format(batch_cal_time))

                    batch_comm_start = time.time()
                    if sync_key == "reduce":
                        w_b_merge = communicator.reduce_batch(
                            w_b, epoch, batch_idx)
                    else:
                        w_b_merge = communicator.reduce_scatter_batch(
                            w_b, epoch, batch_idx)
                    w_merge = w_b_merge[:n_features] / float(n_workers)
                    b_merge = w_b_merge[-1] / float(n_workers)
                    model.weight = torch.from_numpy(w_merge).reshape(
                        n_features, 1)
                    model.bias = float(b_merge)
                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(
                        sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time
                elif sync_key == "async":
                    w_b = np.concatenate((model.weight.numpy().flatten(),
                                          np.array([model.bias],
                                                   dtype=np.float32)))
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time

                    batch_comm_start = time.time()
                    # init model: worker 0 seeds the shared copy on the very
                    # first batch.
                    if worker_index == 0 and epoch == 0 and batch_idx == 0:
                        mem_storage.save_v2(w_b.tobytes(), Prefix.w_b_prefix,
                                            merged_bucket)
                    w_b_merge = communicator.async_reduce(
                        w_b, Prefix.w_b_prefix)
                    # async does not need average
                    w_merge = w_b_merge[:n_features]
                    b_merge = w_b_merge[-1]
                    model.weight = torch.from_numpy(w_merge).reshape(
                        n_features, 1)
                    model.bias = float(b_merge)
                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(
                        sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time

            if batch_idx % 10 == 0:
                print(
                    'Epoch: [%d/%d], Batch: [%d/%d], Time: %.4f s, Loss: %.4f, Accuracy: %.4f, batch cost %.4f s'
                    % (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                       time.time() - train_start, batch_loss.average,
                       batch_acc.accuracy, time.time() - batch_start))

        if optim_key == "model_avg":
            w_b = np.concatenate((model.weight.numpy().flatten(),
                                  np.array([model.bias], dtype=np.float32)))
            epoch_cal_time += time.time() - epoch_start

            epoch_sync_start = time.time()
            if sync_key == "reduce":
                w_b_merge = communicator.reduce_epoch(w_b, epoch)
            elif sync_key == "reduce_scatter":
                w_b_merge = communicator.reduce_scatter_epoch(w_b, epoch)
            elif sync_key == "async":
                if worker_index == 0 and epoch == 0:
                    mem_storage.save_v2(w_b.tobytes(), Prefix.w_b_prefix,
                                        merged_bucket)
                w_b_merge = communicator.async_reduce(w_b, Prefix.w_b_prefix)
            else:
                raise ValueError(
                    "unsupported sync mode: {}".format(sync_mode))

            w_merge = w_b_merge[:n_features]
            b_merge = w_b_merge[-1]
            # async does not need average
            if sync_key == "reduce" or sync_key == "reduce_scatter":
                w_merge = w_merge / float(n_workers)
                b_merge = b_merge / float(n_workers)
            model.weight = torch.from_numpy(w_merge).reshape(n_features, 1)
            model.bias = float(b_merge)
            print("one {} round cost {} s".format(
                sync_mode, time.time() - epoch_sync_start))
            epoch_comm_time += time.time() - epoch_sync_start

        if worker_index == 0:
            delete_start = time.time()
            # model avg delete by epoch
            if optim_key == "model_avg" and sync_key != "async":
                communicator.delete_expired_epoch(epoch)
            elif (optim_key == "grad_avg" and sync_key != "async"
                  and n_train_batch > 0):
                # n_train_batch - 1 is the last batch index; the guard avoids
                # reading an unbound loop variable when no batch ran.
                communicator.delete_expired_batch(epoch, n_train_batch - 1)
            epoch_comm_time += time.time() - delete_start

        # Test the Model
        test_start = time.time()
        test_loss, test_acc = model.evaluate()
        test_time = time.time() - test_start

        # After a complete pass batch_idx + 1 == n_train_batch; using
        # n_train_batch directly also works with zero batches.
        print(
            "Epoch: [{}/{}] finishes, Batch: [{}/{}], Time: {:.4f}, Loss: {:.4f}, epoch cost {:.4f} s, "
            "calculation cost = {:.4f} s, synchronization cost {:.4f} s, test cost {:.4f} s, "
            "accuracy of the model on the {} test samples: {}, loss = {}".
            format(epoch + 1, n_epochs, n_train_batch, n_train_batch,
                   time.time() - train_start, epoch_loss,
                   time.time() - epoch_start, epoch_cal_time,
                   epoch_comm_time, test_time, len(val_set),
                   test_acc.accuracy, test_loss.average))

    if worker_index == 0:
        mem_storage.clear()

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def test_copy_with_error(storage_engine):
    """copy_to must raise IOError when the destination engine cannot be written.

    The destination points at "anotherbucket" with credentials "foo"/"bar";
    presumably that bucket does not exist in the mocked S3, which is what
    triggers the error (confirm against the storage_engine fixture).
    """
    destination = S3Storage(
        _TEST_CONTEXT,
        "another/path",
        "anotherbucket",
        "foo",
        "bar",
    )

    with pytest.raises(IOError):
        storage_engine.copy_to(destination, _TEST_PATH)
def handler(event, context):
    """Lambda entry point: deep-model training against a Thrift parameter server.

    Each invocation ("round") trains at most ``run_epochs`` epochs starting
    at ``start_epoch``. Per batch the worker pulls the latest flat model from
    the parameter server, runs forward/backward, and pushes its flattened
    gradient scaled by ``-lr / n_workers``. If epochs remain after the round,
    a checkpoint is built (worker 0 uploads it) and the function re-invokes
    itself asynchronously with ``start_epoch`` advanced.

    Keys read from ``event``: ``train_file``, ``test_file``, ``data_bucket``,
    ``n_features``, ``n_classes``, ``n_workers``, ``worker_index``,
    ``cp_bucket``, ``host``, ``port``, ``model``, ``optim``, ``sync_mode``,
    ``lr``, ``batch_size``, ``n_epochs``, ``start_epoch``, ``run_epochs`` and
    ``function_name``.

    Raises:
        AssertionError: if model / optim / sync_mode are not supported here.
    """
    start_time = time.time()

    # dataset setting
    train_file = event['train_file']
    test_file = event['test_file']
    data_bucket = event['data_bucket']
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    cp_bucket = event['cp_bucket']

    # ps setting
    host = event['host']
    port = event['port']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Deep_Models
    # Equality, not `in`: the sibling handlers compare these constants as
    # plain strings, so `in` was a substring test that also accepted values
    # such as "" or "avg".
    assert optim.lower() == Optimization.Grad_Avg
    assert sync_mode.lower() == Synchronization.Reduce

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    start_epoch = event['start_epoch']
    run_epochs = event['run_epochs']

    function_name = event['function_name']

    print('data bucket = {}'.format(data_bucket))
    print("train file = {}".format(train_file))
    print("test file = {}".format(test_file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))
    print('start epoch = {}'.format(start_epoch))
    print('run epochs = {}'.format(run_epochs))
    print('host = {}'.format(host))
    print('port = {}'.format(port))

    print("Run function {}, round: {}/{}, epoch: {}/{} to {}/{}".format(
        function_name,
        int(start_epoch / run_epochs) + 1, math.ceil(n_epochs / run_epochs),
        start_epoch + 1, n_epochs, start_epoch + run_epochs, n_epochs))

    # download file from s3
    storage = S3Storage()
    local_dir = "/tmp"
    read_start = time.time()
    storage.download(data_bucket, train_file,
                     os.path.join(local_dir, train_file))
    storage.download(data_bucket, test_file,
                     os.path.join(local_dir, test_file))
    print("download file from s3 cost {} s".format(time.time() - read_start))

    # NOTE(security): torch.load unpickles the downloaded files, so the data
    # and checkpoint buckets must only contain trusted objects.
    train_set = torch.load(os.path.join(local_dir, train_file))
    test_set = torch.load(os.path.join(local_dir, test_file))
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    n_train_batch = len(train_loader)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=100,
                                              shuffle=False)
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')
    print("read data cost {} s".format(time.time() - read_start))

    random_seed = 100
    torch.manual_seed(random_seed)

    device = 'cpu'
    model = deep_models.get_models(model_name).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # load checkpoint model if it is not the first round
    if start_epoch != 0:
        checked_file = 'checkpoint_{}.pt'.format(start_epoch - 1)
        storage.download(cp_bucket, checked_file,
                         os.path.join(local_dir, checked_file))
        checkpoint_model = torch.load(os.path.join(local_dir, checked_file))
        model.load_state_dict(checkpoint_model['model_state_dict'])
        optimizer.load_state_dict(checkpoint_model['optimizer_state_dict'])
        print("load checkpoint model at epoch {}".format(start_epoch - 1))

    # Set thrift connection
    # Make socket
    transport = TSocket.TSocket(host, port)
    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)
    # Wrap in a protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    # Create a client to use the protocol encoder
    t_client = ParameterServer.Client(protocol)
    # Connect!
    transport.open()
    try:
        # test thrift connection
        ps_client.ping(t_client)
        print("create and ping thrift server >>> HOST = {}, PORT = {}".format(
            host, port))

        # register model: record each parameter's shape and element count
        # so the flat vector from the server can be sliced back into layers.
        parameter_shape = []
        parameter_length = []
        model_length = 0
        for param in model.parameters():
            n_elements = param.data.numel()
            parameter_shape.append(tuple(param.data.shape))
            parameter_length.append(n_elements)
            model_length += n_elements

        ps_client.register_model(t_client, worker_index, model_name,
                                 model_length, n_workers)
        ps_client.exist_model(t_client, model_name)
        print("register and check model >>> name = {}, length = {}".format(
            model_name, model_length))

        # Training the Model
        train_start = time.time()
        iter_counter = 0
        for epoch in range(start_epoch,
                           min(start_epoch + run_epochs, n_epochs)):
            model.train()
            epoch_start = time.time()

            train_acc = Accuracy()
            train_loss = Average()

            for batch_idx, (inputs, targets) in enumerate(train_loader):
                batch_start = time.time()
                batch_cal_time = 0
                batch_comm_time = 0

                # pull latest model
                ps_client.can_pull(t_client, model_name, iter_counter,
                                   worker_index)
                latest_model = ps_client.pull_model(t_client, model_name,
                                                    iter_counter,
                                                    worker_index)
                pos = 0
                for layer_index, param in enumerate(model.parameters()):
                    layer_len = parameter_length[layer_index]
                    param.data = Variable(
                        torch.from_numpy(
                            np.asarray(latest_model[pos:pos + layer_len],
                                       dtype=np.float32).reshape(
                                           parameter_shape[layer_index])))
                    pos += layer_len
                batch_comm_time += time.time() - batch_start

                batch_cal_start = time.time()
                outputs = model(inputs)
                loss = F.cross_entropy(outputs, targets)
                optimizer.zero_grad()
                loss.backward()

                # flatten and concat gradients of weight and bias
                # Bug fix: the original flattened param.data (the weights)
                # although the vector goes to push_grad scaled by the
                # learning rate. A parameter without a gradient contributes
                # zeros so the vector keeps length model_length.
                grad_chunks = []
                for param in model.parameters():
                    if param.grad is None:
                        grad_chunks.append(np.zeros(param.data.numel()))
                    else:
                        grad_chunks.append(
                            param.grad.data.numpy().flatten())
                # Single concatenate instead of growing the array per layer;
                # float64 matches the dtype the original loop produced.
                param_grad = np.concatenate(grad_chunks).astype(np.float64)
                batch_cal_time += time.time() - batch_cal_start

                # push gradient to PS
                batch_push_start = time.time()
                ps_client.can_push(t_client, model_name, iter_counter,
                                   worker_index)
                ps_client.push_grad(t_client, model_name, param_grad,
                                    -1. * learning_rate / n_workers,
                                    iter_counter, worker_index)
                ps_client.can_pull(t_client, model_name, iter_counter + 1,
                                   worker_index)  # sync all workers
                batch_comm_time += time.time() - batch_push_start

                train_acc.update(outputs, targets)
                train_loss.update(loss.item(), inputs.size(0))

                optimizer.step()
                iter_counter += 1

                if batch_idx % 10 == 0:
                    print(
                        'Epoch: [%d/%d], Batch: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                        'batch cost %.4f s: cal cost %.4f s and communication cost %.4f s'
                        % (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                           time.time() - train_start, loss.item(),
                           time.time() - epoch_start,
                           time.time() - batch_start, batch_cal_time,
                           batch_comm_time))

            test_loss, test_acc = test(epoch, model, test_loader)

            print(
                'Epoch: {}/{},'.format(epoch + 1, n_epochs),
                'train loss: {},'.format(train_loss),
                'train acc: {},'.format(train_acc),
                'test loss: {},'.format(test_loss),
                'test acc: {}.'.format(test_acc),
            )
    finally:
        # The parameter-server connection is only needed while training;
        # close it even when an error aborts the round.
        transport.close()

    # training is not finished yet, invoke next round
    if epoch < n_epochs - 1:
        checkpoint_model = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss.average
        }

        checked_file = 'checkpoint_{}.pt'.format(epoch)

        if worker_index == 0:
            torch.save(checkpoint_model,
                       os.path.join(local_dir, checked_file))
            storage.upload(cp_bucket, checked_file,
                           os.path.join(local_dir, checked_file))
            print("checkpoint model at epoch {} saved!".format(epoch))

        print(
            "Invoking the next round of functions. round: {}/{}, start epoch: {}, run epoch: {}"
            .format(
                int((epoch + 1) / run_epochs) + 1,
                math.ceil(n_epochs / run_epochs), epoch + 1, run_epochs))
        lambda_client = boto3.client('lambda')
        # Same job description as this round, with start_epoch advanced.
        payload = {
            'train_file': event['train_file'],
            'test_file': event['test_file'],
            'data_bucket': event['data_bucket'],
            'n_features': event['n_features'],
            'n_classes': event['n_classes'],
            'n_workers': event['n_workers'],
            'worker_index': event['worker_index'],
            'cp_bucket': event['cp_bucket'],
            'host': event['host'],
            'port': event['port'],
            'model': event['model'],
            'optim': event['optim'],
            'sync_mode': event['sync_mode'],
            'lr': event['lr'],
            'batch_size': event['batch_size'],
            'n_epochs': event['n_epochs'],
            'start_epoch': epoch + 1,
            'run_epochs': event['run_epochs'],
            'function_name': event['function_name']
        }
        # InvocationType='Event' requests an asynchronous invocation.
        lambda_client.invoke(FunctionName=function_name,
                             InvocationType='Event',
                             Payload=json.dumps(payload))

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))