import torch
import torchvision
import torch.backends.cudnn as cudnn

# `utils`, `Network`, `train`, `test` and `device` are assumed to be defined
# elsewhere in this project.


def evaluate(individual, num_classes, num_epochs, batch_size, learning_rate):
    train_transform, test_transform = utils._data_transforms_cifar10()
    train_dataset = torchvision.datasets.CIFAR10(root='../../data', train=True,
                                                 transform=train_transform)
    test_dataset = torchvision.datasets.CIFAR10(root='../../data', train=False,
                                                transform=test_transform)

    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        # pin_memory=True,
        # num_workers=2
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        shuffle=False,
        # pin_memory=True,
        # num_workers=2
    )

    # Build the candidate network from the individual's encoded structure and
    # record its parameter count (in MB) on the individual.
    structure = Network(individual.structure,
                        [(3, 32), (32, 128), (128, 128)],
                        num_classes, (32, 32)).to(device)
    individual.size = utils.count_parameters_in_MB(structure)

    parameters = filter(lambda p: p.requires_grad, structure.parameters())

    cudnn.enabled = True
    cudnn.benchmark = True

    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(parameters, lr=learning_rate,
                                momentum=0.9, weight_decay=3e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs,
                                                           eta_min=0.0)

    best_acc = 0
    for epoch in range(num_epochs):
        print('epoch[{}/{}]:'.format(epoch + 1, num_epochs))
        train(train_loader, structure, criterion, optimizer)
        scheduler.step()
        valid_acc = test(test_loader, structure, criterion)
        print()
        if valid_acc > best_acc:
            best_acc = valid_acc
    individual.accuracy = best_acc
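The per-epoch `train` and `test` helpers called by `evaluate` are not shown here. Below is a minimal sketch of what they might look like, assuming standard PyTorch mini-batch loops and top-1 accuracy; only the signatures follow the calls above, the bodies are an assumption rather than the project's actual code.

def train(train_loader, model, criterion, optimizer):
    # Assumed helper: one training epoch over the loader.
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()


def test(test_loader, model, criterion):
    # Assumed helper: evaluate top-1 accuracy on the test set.
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            _, predicted = model(images).max(1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    # Returned value is compared against best_acc in evaluate().
    return 100.0 * correct / total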
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# `Network`, `PrioritizedMemory`, `device` and the hyperparameter constants
# (MAX_BUFFER_SIZE, BATCH_SIZE, ACTUALIZATION_INTERVAL, MIN_UPDATE, TAU) are
# assumed to be defined elsewhere in this project.


class Agent():

    def __init__(self, action_number, state_size, seed=0, gamma=0.99):
        self.action_number = action_number
        self.state_size = state_size
        self.targetNetwork = Network(self.state_size, self.action_number, seed).to(device)
        self.localNetwork = Network(self.state_size, self.action_number, seed).to(device)
        self.memoryBuffer = PrioritizedMemory(MAX_BUFFER_SIZE, BATCH_SIZE)
        self.current_step = 0
        self.gamma = gamma
        self.optimizer = optim.Adam(self.localNetwork.parameters(), lr=0.001)

    def choose_action(self, state, eps):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.localNetwork.eval()
        with torch.no_grad():
            action_values = self.localNetwork(state)
        self.localNetwork.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_number))

    def step(self, state, action, reward, next_state, done):
        self.memoryBuffer.add(state, action, reward, next_state, done)
        self.current_step += 1
        if self.current_step % ACTUALIZATION_INTERVAL == 0 and len(self.memoryBuffer) >= BATCH_SIZE:
            buffer_data = self.memoryBuffer.get_batch()
            self.learn(buffer_data)

    def learn(self, buffer_data):
        """
        Learning using:
            - prioritized experience replay
            - double Q-learning
            - dueling Q-learning
            - delayed (soft) target-network update
        """
        output_indexes, IS_weights, states, actions, rewards, next_states, dones = buffer_data

        # Double Q-learning: the local network selects the best next action,
        # the target network evaluates it.
        best_predicted_action_number = self.localNetwork(next_states).detach().max(1)[1].unsqueeze(1)
        predicted_action_value = self.targetNetwork(next_states).detach().gather(
            1, best_predicted_action_number.view(-1, 1))

        # y_j calculation (TD target)
        output_action_value = rewards + predicted_action_value * self.gamma * (1 - dones)

        # Expected values from the local network
        predicted_expected_action_value = self.localNetwork(states).gather(1, actions)

        # (y_j - expected value)**2, weighted by the importance-sampling
        # weights from the prioritized replay buffer
        losses = F.mse_loss(predicted_expected_action_value,
                            output_action_value,
                            reduction='none') * IS_weights

        # Update the priorities of the sampled transitions
        abs_error = losses + MIN_UPDATE
        self.memoryBuffer.update_batch(output_indexes, abs_error)

        self.optimizer.zero_grad()
        loss = losses.mean()
        loss.backward()
        self.optimizer.step()

        # Soft update of the target network towards the local network
        for target_param, local_param in zip(self.targetNetwork.parameters(),
                                             self.localNetwork.parameters()):
            target_param.data.copy_(TAU * local_param.data + (1.0 - TAU) * target_param.data)
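One possible way to drive this agent is a standard episode loop over a Gym-style environment. The sketch below assumes the classic Gym API (`env.reset()` returning the state, `env.step()` returning four values), the `CartPole-v1` environment, and an exponential epsilon decay; all of these are illustrative assumptions, not taken from the source.

import gym

# Hypothetical training loop; environment name, episode count and epsilon
# schedule are assumptions for illustration only.
env = gym.make('CartPole-v1')
agent = Agent(action_number=env.action_space.n,
              state_size=env.observation_space.shape[0])

eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        action = agent.choose_action(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
    eps = max(0.01, eps * 0.995)  # decay exploration over time
    print('episode {}: reward {:.1f}'.format(episode, episode_reward))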