def evaluate(individual, num_classes, num_epochs, batch_size, learning_rate):
    train_transform, test_transform = utils._data_transforms_cifar10()

    train_dataset = torchvision.datasets.CIFAR10(root='../../data',
                                                 train=True,
                                                 download=True,
                                                 transform=train_transform)
    test_dataset = torchvision.datasets.CIFAR10(root='../../data',
                                                train=False,
                                                download=True,
                                                transform=test_transform)

    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        # pin_memory=True,
        # num_workers=2
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        shuffle=False,
        # pin_memory=True,
        # num_workers=2
    )

    # Build the network described by the individual's encoding and record its
    # parameter count (in MB) on the individual.
    structure = Network(individual.structure, [(3, 32), (32, 128), (128, 128)],
                        num_classes, (32, 32)).to(device)

    individual.size = utils.count_parameters_in_MB(structure)

    # Only trainable parameters are handed to the optimizer.
    parameters = filter(lambda p: p.requires_grad, structure.parameters())

    # cuDNN autotuning is safe here because the input size is fixed at 32x32.
    cudnn.enabled = True
    cudnn.benchmark = True

    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(parameters,
                                lr=learning_rate,
                                momentum=0.9,
                                weight_decay=3e-4)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max=num_epochs,
                                                           eta_min=0.0)
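    # CosineAnnealingLR decays the learning rate over T_max = num_epochs epochs:
    #   lr_t = eta_min + 0.5 * (learning_rate - eta_min) * (1 + cos(pi * t / T_max))
    # i.e. it falls smoothly from learning_rate down to eta_min (0.0 here).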

    best_acc = 0

    for epoch in range(num_epochs):
        print('epoch[{}/{}]:'.format(epoch + 1, num_epochs))
        train(train_loader, structure, criterion, optimizer)
        scheduler.step()
        valid_acc = test(test_loader, structure, criterion)
        print()
        if valid_acc > best_acc:
            best_acc = valid_acc

    individual.accuracy = best_acc
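
# Minimal usage sketch (illustrative only): evaluate() fills in individual.size
# and individual.accuracy in place. The Individual constructor and the
# hyperparameter values below are assumptions, not part of this module.
#
#     individual = Individual(structure=sampled_structure)
#     evaluate(individual, num_classes=10, num_epochs=20,
#              batch_size=96, learning_rate=0.025)
#     print(individual.size, individual.accuracy)
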
class Agent:
    def __init__(self, action_number, state_size, seed=0, gamma=0.99):

        self.action_number = action_number
        self.state_size = state_size
        # The local network is optimised every learning step; the target network
        # is moved towards it slowly (soft update with TAU) to stabilise the
        # bootstrapped targets. Both are constructed with the same seed.
        self.targetNetwork = Network(self.state_size, self.action_number,
                                     seed).to(device)
        self.localNetwork = Network(self.state_size, self.action_number,
                                    seed).to(device)
        self.memoryBuffer = PrioritizedMemory(MAX_BUFFER_SIZE, BATCH_SIZE)
        self.current_step = 0
        self.gamma = gamma

        self.optimizer = optim.Adam(self.localNetwork.parameters(), lr=0.001)

    def choose_action(self, state, eps):

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.localNetwork.eval()
        with torch.no_grad():
            action_values = self.localNetwork(state)
        self.localNetwork.train()

        # Epsilon-greedy action selection: exploit the greedy action with
        # probability 1 - eps, otherwise explore uniformly at random.
        if random.random() > eps:
            return np.argmax(action_values.cpu().numpy())
        else:
            return random.choice(np.arange(self.action_number))

    def step(self, state, action, reward, next_state, done):

        self.memoryBuffer.add(state, action, reward, next_state, done)

        self.current_step += 1

        # Delayed update: learn only every ACTUALIZATION_INTERVAL steps and only
        # once the buffer holds at least one full batch.
        if (self.current_step % ACTUALIZATION_INTERVAL == 0
                and len(self.memoryBuffer) >= BATCH_SIZE):
            buffer_data = self.memoryBuffer.get_batch()
            self.learn(buffer_data)

    def learn(self, buffer_data):
        """
        learning using:
            Experience Replay
            Double DQLearning
            dueling DQLearning
            delayed update
        """

        output_indexes, IS_weights, states, actions, rewards, next_states, dones = buffer_data

        # Double Q-learning: the local network selects the greedy next action,
        # the target network evaluates it.
        best_predicted_action_number = self.localNetwork(
            next_states).detach().max(1)[1].unsqueeze(1)
        predicted_action_value = self.targetNetwork(
            next_states).detach().gather(1, best_predicted_action_number)
        # TD target y_j
        output_action_value = rewards + predicted_action_value * self.gamma * (
            1 - dones)
        # Q(s, a) predicted by the local network for the actions actually taken
        predicted_expected_action_value = self.localNetwork(states).gather(
            1, actions)

        # Per-sample squared TD error (y_j - Q_local(s_j, a_j))**2, weighted by
        # the importance-sampling weights from the prioritized replay buffer.
        losses = F.mse_loss(predicted_expected_action_value,
                            output_action_value,
                            reduction='none') * IS_weights
        # New priorities; detached so the buffer update carries no gradient.
        abs_error = losses.detach() + MIN_UPDATE
        self.memoryBuffer.update_batch(output_indexes, abs_error)

        self.optimizer.zero_grad()
        loss = losses.mean()
        loss.backward()
        self.optimizer.step()

        # Soft update of the target network:
        #   theta_target <- TAU * theta_local + (1 - TAU) * theta_target
        for target_param, local_param in zip(self.targetNetwork.parameters(),
                                             self.localNetwork.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)
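
# Minimal training-loop sketch (illustrative only). It assumes the classic gym
# API (env.reset() returns the state, env.step() returns a 4-tuple); the
# environment name, episode count and epsilon schedule are assumptions, not
# part of this module.
#
#     env = gym.make('LunarLander-v2')
#     agent = Agent(action_number=env.action_space.n,
#                   state_size=env.observation_space.shape[0])
#     eps = 1.0
#     for episode in range(1000):
#         state = env.reset()
#         done = False
#         while not done:
#             action = agent.choose_action(state, eps)
#             next_state, reward, done, _ = env.step(action)
#             agent.step(state, action, reward, next_state, done)
#             state = next_state
#         eps = max(0.01, eps * 0.995)   # decay exploration between episodes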