def create_sgd_optimizer(model):
    """Create the analog-aware optimizer.

    Args:
        model (nn.Module): model to be trained.
    """
    optimizer = AnalogSGD(model.parameters(), lr=0.05)
    optimizer.regroup_param_groups(model)

    return optimizer
def create_sgd_optimizer(model, learning_rate):
    """Create the analog-aware optimizer.

    Args:
        model (nn.Module): model to be trained.
        learning_rate (float): global parameter to define learning rate.
    """
    optimizer = AnalogSGD(model.parameters(), lr=learning_rate)
    optimizer.regroup_param_groups(model)

    return optimizer
def get_optimizer(self, learning_rate: float, model: Module) -> Optimizer:
    """Return the `Optimizer` for the experiment.

    Args:
        learning_rate: the learning rate used by the optimizer.
        model: the neural network to be trained.

    Returns:
        the optimizer to be used in the experiment.
    """
    optimizer = AnalogSGD(model.parameters(), lr=learning_rate)
    optimizer.regroup_param_groups(model)

    return optimizer
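# The three factory functions above differ only in how the learning rate is
# supplied. A minimal usage sketch, assuming the two-argument
# create_sgd_optimizer variant defined above; the AnalogLinear layer size, toy
# tensors and lr value are illustrative, borrowed from other snippets in this
# section:
from torch import Tensor
from torch.nn.functional import mse_loss

from aihwkit.nn import AnalogLinear

model = AnalogLinear(4, 2, bias=True)
opt = create_sgd_optimizer(model, learning_rate=0.05)

x = Tensor([[0.1, 0.2, 0.4, 0.3]])
y = Tensor([[1.0, 0.5]])

# One training step with the analog-aware optimizer.
opt.zero_grad()
loss = mse_loss(model(x), y)
loss.backward()
opt.step()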
def test_learning_rate_update(self):
    """Check the learning rate update is applied to the tile."""
    loss_func = mse_loss

    x_b = Tensor([[0.1, 0.2], [0.2, 0.4]])
    y_b = Tensor([[0.3], [0.6]])

    layer1 = self.get_layer(2, 3)
    layer2 = self.get_layer(3, 1)
    model = Sequential(layer1, layer2)
    if self.use_cuda:
        x_b = x_b.cuda()
        y_b = y_b.cuda()
        model = model.cuda()

    opt = AnalogSGD(model.parameters(), lr=0.5)
    opt.regroup_param_groups(model)
    opt.zero_grad()

    new_lr = 0.07
    for param_group in opt.param_groups:
        param_group['lr'] = new_lr

    pred = model(x_b)
    loss = loss_func(pred, y_b)
    loss.backward()
    opt.step()

    if not layer1.analog_tile.get_analog_ctx().use_torch_update:
        self.assertAlmostEqual(layer1.analog_tile.get_learning_rate(), new_lr)
def train_model(model, loss_func, x_b, y_b):
    """Train the model."""
    opt = AnalogSGD(model.parameters(), lr=0.1)
    opt.regroup_param_groups(model)

    epochs = 10
    for _ in range(epochs):
        opt.zero_grad()
        pred = model(x_b)
        loss = loss_func(pred, y_b)
        loss.backward()
        opt.step()
def train_once(model, y_in, y_out, analog_if, use_cuda=False):
    """Train once."""
    criterion = MSELoss()
    optimizer = AnalogSGD(model.parameters(), lr=0.5, momentum=0.0, nesterov=0.0)
    optimizer.regroup_param_groups(model)

    if analog_if:
        # Why is this format so different?
        # TODO: better to use the same state format as native PyTorch's LSTM?
        if use_cuda:
            states = [
                LSTMState(
                    zeros(y_in.size()[1], model.hidden_size).cuda(),
                    zeros(y_in.size()[1], model.hidden_size).cuda())
                for _ in range(model.num_layers)
            ]
        else:
            states = [
                LSTMState(zeros(y_in.size()[1], model.hidden_size),
                          zeros(y_in.size()[1], model.hidden_size))
                for _ in range(model.num_layers)
            ]
    else:
        if use_cuda:
            states = (zeros(model.num_layers, y_in.size()[1], model.hidden_size).cuda(),
                      zeros(model.num_layers, y_in.size()[1], model.hidden_size).cuda())
        else:
            states = (zeros(model.num_layers, y_in.size()[1], model.hidden_size),
                      zeros(model.num_layers, y_in.size()[1], model.hidden_size))

    for _ in range(2):
        optimizer.zero_grad()
        pred, _ = model(y_in, states)
        loss = criterion(pred.mean(axis=2, keepdim=True), y_out)
        loss.backward()
        optimizer.step()

    return pred.detach().cpu().numpy()
def test_learning_rate_update_fn(self):
    """Check the learning rate update is applied to the tile."""
    layer1 = self.get_layer(2, 3)
    layer2 = self.get_layer(3, 1)
    model = Sequential(layer1, layer2)
    if self.use_cuda:
        model = model.cuda()

    opt = AnalogSGD(model.parameters(), lr=0.5)
    opt.regroup_param_groups(model)
    opt.zero_grad()

    new_lr = 0.07
    opt.set_learning_rate(new_lr)

    self.assertAlmostEqual(layer1.analog_tile.get_learning_rate(), new_lr)
    self.assertAlmostEqual(layer2.analog_tile.get_learning_rate(), new_lr)
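# The two tests above exercise the two supported ways of changing the learning
# rate of an AnalogSGD optimizer mid-training: writing to param_groups (plain
# PyTorch style, picked up by the tiles on the next step) or calling
# set_learning_rate(), which updates the analog tiles right away. A minimal
# sketch; the AnalogLinear layer size is an illustrative assumption:
from aihwkit.nn import AnalogLinear
from aihwkit.optim import AnalogSGD

model = AnalogLinear(2, 1, bias=True)
opt = AnalogSGD(model.parameters(), lr=0.5)
opt.regroup_param_groups(model)

# Option 1: update the PyTorch parameter groups; the tiles see the new rate
# after the next optimizer step.
for param_group in opt.param_groups:
    param_group['lr'] = 0.07

# Option 2: set the rate explicitly, pushing it to the analog tiles directly.
opt.set_learning_rate(0.07)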
def get_model_and_x(self):
    """Train a simple model and return the model and its input."""
    # Prepare the datasets (input and expected output).
    x = Tensor([[0.1, 0.2, 0.4, 0.3], [0.2, 0.1, 0.1, 0.3]])
    y = Tensor([[1.0, 0.5], [0.7, 0.3]])

    # Define a single-layer network, using a constant step device type.
    rpu_config = self.get_rpu_config()
    rpu_config.forward.out_res = -1.  # Turn off (output) ADC discretization.
    rpu_config.forward.w_noise_type = WeightNoiseType.ADDITIVE_CONSTANT
    rpu_config.forward.w_noise = 0.02
    rpu_config.noise_model = PCMLikeNoiseModel(g_max=25.0)

    model = AnalogLinear(4, 2, bias=True, rpu_config=rpu_config)

    # Move the model and tensors to cuda if it is available.
    if self.use_cuda:
        x = x.cuda()
        y = y.cuda()
        model.cuda()

    # Define an analog-aware optimizer, preparing it for using the layers.
    opt = AnalogSGD(model.parameters(), lr=0.1)
    opt.regroup_param_groups(model)

    for _ in range(100):
        opt.zero_grad()

        # Add the training Tensor to the model (input).
        pred = model(x)
        # Add the expected output Tensor.
        loss = mse_loss(pred, y)
        # Run training (backward propagation).
        loss.backward()

        opt.step()

    return model, x
def main():
    """Train a PyTorch GAN analog model to generate fake characters in the style of the MNIST dataset."""
    # Make sure the directory where the results are saved exists.
    # Results include examples of the generated fake images.
    os.makedirs(RESULTS, exist_ok=True)
    torch.manual_seed(SEED)

    # Load the MNIST dataset as tensors.
    dataloader = DataLoader(
        MNIST(PATH_DATASET, download=True, transform=transforms.ToTensor()),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

    print(f'\n{datetime.now().time().replace(microsecond=0)} --- '
          f'Started GAN Example')

    gen = Generator(Z_DIM).to(DEVICE)
    gen_opt = AnalogSGD(gen.parameters(), lr=LR)
    gen_opt.regroup_param_groups(gen)

    disc = Discriminator().to(DEVICE)
    disc_opt = AnalogSGD(disc.parameters(), lr=LR)
    disc_opt.regroup_param_groups(disc)

    print(RPU_CONFIG)
    print(gen)
    print(disc)

    criterion = nn.BCEWithLogitsLoss()

    training_loop(gen, disc, gen_opt, disc_opt, criterion, dataloader,
                  N_EPOCHS, DISPLAY_STEP)

    show_animation_fake_images()

    print(f'{datetime.now().time().replace(microsecond=0)} --- '
          f'Completed GAN Example')
def train_model(self, model, in_vectors, out_vectors):
    """Train a model."""
    opt = AnalogSGD(model.parameters(), lr=0.1)

    for _ in range(10):
        opt.zero_grad()

        # Add the training Tensor to the model (input).
        pred_value = model(in_vectors)
        # Add the expected output Tensor.
        loss_value = mse_loss(pred_value, out_vectors)
        # Run training (backward propagation).
        loss_value.backward()

        opt.step()

    return loss_value
def train_once_bidir(model, y_in, y_out, analog_if):
    """Train once."""
    criterion = MSELoss()
    optimizer = AnalogSGD(model.parameters(), lr=0.5, momentum=0.0, nesterov=0.0)
    batch_size = y_in.size()[1]

    if analog_if:
        states = model.get_zero_state(batch_size)
    else:
        states = None

    for _ in range(2):
        optimizer.zero_grad()
        pred, _ = model(y_in, states)
        loss = criterion(pred.mean(axis=2, keepdim=True), y_out)
        loss.backward()
        optimizer.step()

    return pred.detach().cpu().numpy()
from torch import Tensor
from torch.nn.functional import mse_loss

from aihwkit.nn import AnalogLinear
from aihwkit.optim import AnalogSGD
from aihwkit.simulator.configs import SingleRPUConfig
from aihwkit.simulator.configs.devices import ConstantStepDevice
from aihwkit.simulator.rpu_base import cuda

# Prepare the datasets (input and expected output).
x = Tensor([[0.1, 0.2, 0.4, 0.3], [0.2, 0.1, 0.1, 0.3]])
y = Tensor([[1.0, 0.5], [0.7, 0.3]])

# Define a single-layer network, using a constant step device type.
rpu_config = SingleRPUConfig(device=ConstantStepDevice())
model = AnalogLinear(4, 2, bias=True, rpu_config=rpu_config)

# Move the model and tensors to cuda if it is available.
if cuda.is_compiled():
    x = x.cuda()
    y = y.cuda()
    model.cuda()

# Define an analog-aware optimizer, preparing it for using the layers.
opt = AnalogSGD(model.parameters(), lr=0.1)
opt.regroup_param_groups(model)

for epoch in range(100):
    # Delete the old gradients before the next pass.
    opt.zero_grad()
    # Add the training Tensor to the model (input).
    pred = model(x)
    # Add the expected output Tensor.
    loss = mse_loss(pred, y)
    # Run training (backward propagation).
    loss.backward()

    opt.step()
    print('Loss error: {:.16f}'.format(loss))
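# After the training loop above, the analog layer behaves like any other torch
# module. A minimal sketch of inspecting the result, using only APIs already
# seen in this section (no_grad is standard torch):
from torch import no_grad

with no_grad():
    pred = model(x)
print('Prediction after training: {}'.format(pred))

# Read back the weights and bias learned by the analog tile.
weights, biases = model.get_weights()
print('Trained weights: {}'.format(weights))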
def test_against_fp(self):
    """Test whether FP is the same as an is_perfect inference tile."""
    # pylint: disable-msg=too-many-locals

    # Prepare the datasets (input and expected output).
    x = Tensor([[0.1, 0.2, 0.4, 0.3], [0.2, 0.1, 0.1, 0.3]])
    y = Tensor([[1.0, 0.5], [0.7, 0.3]])

    # Define a single-layer network, using a constant step device type.
    rpu_config = self.get_rpu_config()
    rpu_config.forward.is_perfect = True

    model_torch = Linear(4, 2, bias=True)
    model = AnalogLinear(4, 2, bias=True, rpu_config=rpu_config)
    model.set_weights(model_torch.weight, model_torch.bias)
    model_fp = AnalogLinear(4, 2, bias=True, rpu_config=FloatingPointRPUConfig())
    model_fp.set_weights(model_torch.weight, model_torch.bias)

    self.assertTensorAlmostEqual(model.get_weights()[0], model_torch.weight)
    self.assertTensorAlmostEqual(model.get_weights()[0], model_fp.get_weights()[0])

    # Move the model and tensors to cuda if it is available.
    if self.use_cuda:
        x = x.cuda()
        y = y.cuda()
        model.cuda()
        model_fp.cuda()
        model_torch.cuda()

    # Define an analog-aware optimizer, preparing it for using the layers.
    opt = AnalogSGD(model.parameters(), lr=0.1)
    opt_fp = AnalogSGD(model_fp.parameters(), lr=0.1)
    opt_torch = SGD(model_torch.parameters(), lr=0.1)

    for _ in range(100):
        # Analog (is_perfect) model.
        opt.zero_grad()
        pred = model(x)
        loss = mse_loss(pred, y)
        loss.backward()
        opt.step()

        # Same for the FP model.
        opt_fp.zero_grad()
        pred_fp = model_fp(x)
        loss_fp = mse_loss(pred_fp, y)
        loss_fp.backward()
        opt_fp.step()

        # Same for the torch model.
        opt_torch.zero_grad()
        pred_torch = model_torch(x)
        loss_torch = mse_loss(pred_torch, y)
        loss_torch.backward()
        opt_torch.step()

        self.assertTensorAlmostEqual(pred_torch, pred)
        self.assertTensorAlmostEqual(loss_torch, loss)
        self.assertTensorAlmostEqual(model.get_weights()[0], model_torch.weight)

        self.assertTensorAlmostEqual(pred_fp, pred)
        self.assertTensorAlmostEqual(loss_fp, loss)
        self.assertTensorAlmostEqual(model.get_weights()[0], model_fp.get_weights()[0])
y_in = torch.stack(y_in_2d, dim=0).transpose(0, 1).unsqueeze(2)
y_out = torch.stack(y_out_2d, dim=0).transpose(0, 1).unsqueeze(2)

if WITH_EMBEDDING:
    if WITH_BIDIR:
        model = AnalogBidirRNNNetwork()
    else:
        model = AnalogRNNNetwork()
else:
    if WITH_BIDIR:
        model = AnalogBidirRNNNetwork_noEmbedding()
    else:
        model = AnalogRNNNetwork_noEmbedding()

model = model.to(DEVICE)
optimizer = AnalogSGD(model.parameters(), lr=LEARNING_RATE)
optimizer.regroup_param_groups(model)
criterion = nn.MSELoss()

# Train.
losses = []
for i in range(EPOCHS):
    optimizer.zero_grad()

    pred, states = model(y_in, None)
    loss = criterion(pred, y_out)
    print('Epoch = %d: Train Perplexity = %f'
          % (i, np.exp(loss.detach().cpu().numpy())))

    loss.backward()
    # Apply the analog update for this epoch.
    optimizer.step()
def test_decay(self):
    """Test hidden parameter set."""
    # pylint: disable=invalid-name, too-many-locals
    lifetime = 100.  # initial setting (needs to be larger than 1)
    gamma = 0.1
    reset_bias = 0.1  # decay shift

    rpu_config = self.get_transfer_compound(gamma=gamma, lifetime=lifetime,
                                            lifetime_dtod=0.0, reset=reset_bias)
    model = self.get_layer(in_features=2, out_features=1, rpu_config=rpu_config)

    weight, bias = model.get_weights()
    model.set_weights(weight * 0.0, bias * 0.0 if bias is not None else None)

    params = model.analog_tile.get_hidden_parameters()
    shape = params['hidden_weights_0_0'].shape

    # just dummy settings
    a, b, c, d = 0.47, 0.21, 0.64, 0.12
    params['hidden_weights_0_0'] = a * ones(*shape)  # A
    params['hidden_weights_1_0'] = b * ones(*shape)  # A ref
    params['hidden_weights_0_1'] = c * ones(*shape)  # C
    params['hidden_weights_1_1'] = d * ones(*shape)  # C ref

    # explicitly set the decay scales (which is 1 - 1/lifetime)
    a_dcy, b_dcy, c_dcy, d_dcy = 0.95, 0.78, 0.93, 0.92
    params['decay_scales_0_0'] = a_dcy * ones(*shape)  # A
    params['decay_scales_1_0'] = b_dcy * ones(*shape)  # A ref
    params['decay_scales_0_1'] = c_dcy * ones(*shape)  # C
    params['decay_scales_1_1'] = d_dcy * ones(*shape)  # C ref

    model.analog_tile.set_hidden_parameters(params)

    # LR set to zero: only the lifetime (decay) will be applied.
    opt = AnalogSGD(model.parameters(), lr=0.0)

    x_b = Tensor([[0.1, 0.2], [0.2, 0.4]])
    y_b = Tensor([[0.3], [0.6]])

    if self.use_cuda:
        model = model.cuda()
        x_b = x_b.cuda()
        y_b = y_b.cuda()

    epochs = 2
    for _ in range(epochs):
        opt.zero_grad()
        pred = model(x_b)
        loss = mse_loss(pred, y_b)
        loss.backward()
        opt.step()

    weight, bias = model.get_weights()

    # reference values
    a = (a - reset_bias) * pow(a_dcy, epochs) + reset_bias
    b = (b - reset_bias) * pow(b_dcy, epochs) + reset_bias
    c = (c - reset_bias) * pow(c_dcy, epochs) + reset_bias
    d = (d - reset_bias) * pow(d_dcy, epochs) + reset_bias

    if self.digital_bias:
        self.assertAlmostEqual(bias[0].item(), 0.0)
    if self.bias and not self.digital_bias:
        self.assertAlmostEqual(bias[0].item(), gamma * (a - b) + c - d, 5)

    self.assertAlmostEqual(weight[0][0].item(), gamma * (a - b) + c - d, 5)
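# The reference values in test_decay follow from applying the per-step decay
# w <- (w - reset_bias) * decay_scale + reset_bias once per epoch (the learning
# rate is zero, so only the lifetime/decay acts). A small plain-Python sketch of
# that recurrence and its closed form, using values taken from the test above:
def apply_decay(w, decay_scale, reset_bias, steps):
    """Iterate w <- (w - reset_bias) * decay_scale + reset_bias."""
    for _ in range(steps):
        w = (w - reset_bias) * decay_scale + reset_bias
    return w

# Matches the closed form used in the test:
# (a - reset_bias) * decay_scale**steps + reset_bias.
a, a_dcy, reset_bias, epochs = 0.47, 0.95, 0.1, 2
assert abs(apply_decay(a, a_dcy, reset_bias, epochs)
           - ((a - reset_bias) * a_dcy ** epochs + reset_bias)) < 1e-12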