def test(num=500):
    agent = DuelingDQN(env.observation_space.shape[0], env.action_space.n, e=0)
    batch_size = 32
    agent.load_model()
    steps = []
    for i_episode in range(num):
        old_observation = env.reset()
        old_action = env.action_space.sample()
        done = False
        step = 0
        while not done:
            step = step + 1
            # env.render()
            observation, reward, done, info = env.step(old_action)
            if done:
                reward = -200
            old_observation = observation
            old_action = agent.get_action(
                np.reshape(observation, [1, env.observation_space.shape[0]]))
            if done:
                steps.append(step)
                print("{}:{} steps".format(i_episode, step))
                break
        # if the average number of steps over the last 200 episodes reaches 195,
        # we consider the method to have solved the environment
        if len(steps) > 200 and sum(steps[-200:]) / 200 >= 195:
            print(sum(steps[-200:]) / 200)
            break
class Agent:
    def __init__(self, gamma, epsilon, learning_rate, n_actions, input_dimensions,
                 memory_size, size_of_batch, min_epsilon=0.01, decrement_epsilon=5e-7,
                 replace=1000, mdl_checkpoint='temp/duelingDDQN'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.n_actions = n_actions
        self.input_dimensions = input_dimensions
        self.size_of_batch = size_of_batch
        self.min_epsilon = min_epsilon
        self.decrement_epsilon = decrement_epsilon
        self.cnt_target_replace = replace
        self.mdl_checkpoint = mdl_checkpoint
        self.action_space = [i for i in range(self.n_actions)]
        self.memory = ExperienceReplay(memory_size, input_dimensions)
        self.counter_learn = 0

        # instance of the deep q-network - tells the value of the current state
        self.q_network_eval = DuelingDQN(
            self.learning_rate, self.n_actions,
            input_dimensions=self.input_dimensions,
            name='lunarLanderDuelingDDQN_q_network_eval',
            mdl_checkpoint=self.mdl_checkpoint)

        # instance of the deep q-network - tells the value of the next actions
        self.q_network_next = DuelingDQN(
            self.learning_rate, self.n_actions,
            input_dimensions=self.input_dimensions,
            name='lunarLanderDuelingDDQN_q_network_next',
            mdl_checkpoint=self.mdl_checkpoint)

    # choosing an action (epsilon-greedy)
    def action_choice(self, observe):
        # exploitation
        if np.random.random() > self.epsilon:
            # convert the observation to a PyTorch tensor, send it to our device and
            # feed it forward through the network to get the advantage function out
            state = torch.tensor([observe], dtype=torch.float).to(
                self.q_network_eval.device)
            # we only need the advantage function here; the value function is a
            # per-state constant, so it does not affect the argmax and _ discards it
            _, advntge = self.q_network_eval.forward(state)
            # the advantage function is used to pick the maximal action;
            # argmax returns a PyTorch tensor, which OpenAI Gym will not accept as
            # input to its step function, so .item() converts it to a plain integer
            action = torch.argmax(advntge).item()
        # exploration
        else:
            # random choice from the action space
            action = np.random.choice(self.action_space)
        return action

    # storing state-action transitions (interface with the agent's memory)
    def transition_store(self, state, action, reward, new_state, flag_done):
        self.memory.transition_store(state, action, reward, new_state, flag_done)

    def target_network_replace(self):
        # counter_learn is how many times the agent has executed the learning function
        if self.counter_learn % self.cnt_target_replace == 0:
            # load the state dictionary from the evaluation network onto the Q-next network
            self.q_network_next.load_state_dict(self.q_network_eval.state_dict())

    # linear epsilon decay
    def epsilon_decay(self):
        self.epsilon = (self.epsilon - self.decrement_epsilon
                        if self.epsilon > self.min_epsilon else self.min_epsilon)

    # agent's network-saving functionality
    def models_save(self):
        self.q_network_eval.checkpoint_save()
        self.q_network_next.checkpoint_save()

    # agent's network-loading functionality
    def models_load(self):
        self.q_network_eval.checkpoint_load()
        self.q_network_next.checkpoint_load()

    # learning functionality
    def learn(self):
        # handle the case where the agent has not yet filled up enough memory to perform
        # learning, e.g. with a batch size of 64 the agent may only have completed 10
        # steps (or even 1), so there is not yet enough memory to satisfy the batch size;
        # we simply wait until the memory holds at least one full batch
        if self.memory.memory_counter < self.size_of_batch:
            return

        # in PyTorch the first thing to do in a learning function is to zero the
        # gradients on the optimizer
        self.q_network_eval.optimizer.zero_grad()
        self.target_network_replace()

        # sampling of memory
        state, action, reward, next_state, flag_done = self.memory.buffer_sample(
            self.size_of_batch)

        # converting numpy arrays to PyTorch tensors
        states = torch.tensor(state).to(self.q_network_eval.device)
        actions = torch.tensor(action).to(self.q_network_eval.device)
        flag_dones = torch.tensor(flag_done).to(self.q_network_eval.device)
        rewards = torch.tensor(reward).to(self.q_network_eval.device)
        next_states = torch.tensor(next_state).to(self.q_network_eval.device)

        # array from 0 to size_of_batch-1 that handles array indexing and slicing later on
        indices = np.arange(self.size_of_batch)

        # passing the states and next states to the respective networks
        value_s, advantage_s = self.q_network_eval.forward(states)
        value_s_new, advantage_s_new = self.q_network_next.forward(next_states)
        # this line comes from the methodology of the paper introducing Double Deep Q-learning
        value_s_eval, advantage_s_eval = self.q_network_eval.forward(next_states)
        # these three (value, advantage) pairs are needed to perform the update rule
        # from the paper introducing Double Deep Q-learning

        # dueling aggregation of the value and advantage functions:
        # the paper introducing Dueling Deep Q-learning settles on summing the value and
        # advantage functions while normalizing by subtracting off the mean of the
        # advantage stream; summing them without this normalization leads to a problem
        # called "identifiability", which is discussed in the report.
        # the array indexing below uses the batch indices together with the actions the
        # agent actually took to select the corresponding Q-values
        q_network_pred = torch.add(
            value_s, (advantage_s - advantage_s.mean(dim=1, keepdim=True)))[indices, actions]
        # no action indexing below, since we want the values for all actions
        q_network_next = torch.add(
            value_s_new, (advantage_s_new - advantage_s_new.mean(dim=1, keepdim=True)))
        q_network_eval = torch.add(
            value_s_eval, (advantage_s_eval - advantage_s_eval.mean(dim=1, keepdim=True)))

        # maximal actions of the next state according to the evaluation network
        maximum_actions = torch.argmax(q_network_eval, dim=1)

        # do not value future states that are flagged as terminal
        q_network_next[flag_dones] = 0.0

        # the target value uses q_network_next evaluated at the actions chosen by the
        # evaluation network
        q_network_target = rewards + self.gamma * q_network_next[indices, maximum_actions]

        # calculation of the loss function
        loss = self.q_network_eval.loss_func(
            q_network_target, q_network_pred).to(self.q_network_eval.device)
        # back-propagation
        loss.backward()
        # stepping the optimiser
        self.q_network_eval.optimizer.step()
        # increment the learn-function counter
        self.counter_learn += 1
        # epsilon decay
        self.epsilon_decay()
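For readability, the aggregation and target computed inside learn() can be restated compactly; this is only a restatement of what the code above already does, using the standard dueling/double-DQN notation:

\[
Q(s, a) = V(s) + \Big( A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a'} A(s, a') \Big)
\]
\[
y = r + \gamma \, Q_{\text{next}}\big(s',\ \arg\max_{a'} Q_{\text{eval}}(s', a')\big)
\]

with the bootstrap term set to zero whenever \(s'\) is terminal (the `q_network_next[flag_dones] = 0.0` line).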
class DuelingAgent:
    def __init__(self, env, learning_rate, gamma, buffer_size, prioritized):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.prioritized = prioritized
        if self.prioritized == False:
            self.replay_buffer = BasicBuffer(max_size=buffer_size)
        else:
            self.replay_buffer = PrioritizedBuffer(max_size=buffer_size)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = DuelingDQN(env.observation_space.shape,
                                env.action_space.n).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()

    def get_action(self, state, eps=0.20):
        # state = torch.FloatTensor(state).float().unsqueeze(0).to(self.device)
        # explore with probability eps (np.random.rand draws from a uniform [0, 1),
        # so the comparison is a proper epsilon-greedy check)
        if np.random.rand() < eps:
            return self.env.action_space.sample()
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())
        return action

    def tensor_states(self, states_list):
        # build a (batch, obs_dim) tensor on the agent's device; the unsqueeze adds
        # the batch dimension for each individual state before concatenation
        states_tensor = torch.Tensor().to(self.device)
        for state in states_list:
            state_float = torch.FloatTensor(state).to(self.device)
            states_tensor = torch.cat((states_tensor, state_float.unsqueeze(0)))
        return states_tensor

    def compute_loss(self, batch):
        if self.prioritized == False:
            states, actions, rewards, next_states, dones = batch
        else:
            states, actions, rewards, next_states, dones = batch[0]
        states = self.tensor_states(states)
        next_states = self.tensor_states(next_states)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1))
        curr_Q = curr_Q.squeeze(1)
        next_Q = self.model.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        # mask out the bootstrap term for terminal transitions
        expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q * (1 - dones)

        loss = self.MSE_loss(curr_Q, expected_Q)
        return loss

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
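Both agents rely on a DuelingDQN network whose definition is not reproduced in this section. As a point of reference, a minimal sketch of such a network is given below, assuming a PyTorch nn.Module whose forward pass aggregates a value stream and an advantage stream into Q-values (matching the interface DuelingAgent expects, where forward returns Q-values directly). The class name, layer sizes, and stream names are illustrative, not the project's actual definition.

import numpy as np
import torch
import torch.nn as nn


class DuelingDQNSketch(nn.Module):
    """Illustrative dueling network head; not the repository's actual DuelingDQN."""

    def __init__(self, input_dim, n_actions, hidden=128):
        super().__init__()
        # shared feature extractor over the flattened observation
        self.feature = nn.Sequential(
            nn.Linear(int(np.prod(input_dim)), hidden), nn.ReLU())
        # state-value stream V(s)
        self.value_stream = nn.Sequential(
            nn.Linear(hidden, hidden), nn.ReLU(), nn.Linear(hidden, 1))
        # advantage stream A(s, a)
        self.advantage_stream = nn.Sequential(
            nn.Linear(hidden, hidden), nn.ReLU(), nn.Linear(hidden, n_actions))

    def forward(self, state):
        features = self.feature(state)
        value = self.value_stream(features)
        advantage = self.advantage_stream(features)
        # dueling aggregation with mean-subtraction to keep V and A identifiable
        return value + advantage - advantage.mean(dim=1, keepdim=True)

Note the interface difference between the two agents above: DuelingAgent treats the forward output as aggregated Q-values, whereas the earlier Agent class expects forward to return the (value, advantage) pair and performs the aggregation itself inside learn().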
import numpy as np
import os

from IHiterEnv.parameter import TP
from IHiterEnv.policy import RandomPolicy
from IHiterEnv.agent import TeamAction

MaxEpisode = 2000
MaxEpisodeSteps = 100000

# DuelingDQN and ICRA_Env are assumed to be imported from the project's own modules
train_file = os.path.abspath('.') + '/train_data' + '/'
if not os.path.exists(os.path.abspath('.') + '/train_data'):
    os.mkdir(train_file)

RLBrain = DuelingDQN(train_dir=train_file)
env = ICRA_Env()
team_action = TeamAction()

# record the hyper-parameters used for this training run
with open(train_file + 'param.txt', 'w') as f:
    f.writelines([
        "learning rate : " + str(RLBrain.LearningRate) + '\n',
        "ReplaceTargetIter : " + str(RLBrain.ReplaceTargetIter) + '\n',
        "BatchSize : " + str(RLBrain.BatchSize) + '\n',
        "Epsilon : " + str(RLBrain.Epsilon) + '\n',
        "EpsilonMin : " + str(RLBrain.EpsilonMin) + '\n',
        "Gamma : " + str(RLBrain.Gamma)
    ])
import gym

from DuelingDQN import DuelingDQN

env_name = 'CartPole-v0'
episodes = 3000
steps = 300
test = 10
lr = 0.0001
ini_epsilon = 0.3
decay_steps = 200000  # after the agent's learn() has been called decay_steps times, ini_epsilon has decayed to 0
replay_size = 100
gamma = 0.9
batch_size = 32
update_frequency = 10  # the target network parameters are updated once every update_frequency training steps

env = gym.make(env_name)
agent = DuelingDQN(env, lr, ini_epsilon, decay_steps, replay_size, gamma,
                   batch_size, update_frequency)


def train():
    for episode in range(episodes):
        state = env.reset()
        for step in range(steps):
            action = agent.greedy_action(state)
            next_state, reward, done, _ = env.step(action)
            # reshape the reward: penalize termination, small bonus for surviving
            reward = -1 if done else 0.1
            agent.store_transition(state, action, reward, next_state, done)
            agent.learn()
            state = next_state
            if done:
                break
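The script above only shows the training loop; an evaluation pass is not included. Below is a minimal sketch of one, reusing only the env, agent.greedy_action, steps, and test names that already appear in the script (it assumes greedy_action is also usable at evaluation time, once epsilon has decayed); the helper name run_test_episodes is hypothetical.

def run_test_episodes(n_episodes=test):
    """Roll out a few episodes without learning and report the average return."""
    returns = []
    for _ in range(n_episodes):
        state = env.reset()
        episode_return = 0
        for _ in range(steps):
            action = agent.greedy_action(state)
            state, reward, done, _ = env.step(action)
            episode_return += reward
            if done:
                break
        returns.append(episode_return)
    print("average return over {} episodes: {}".format(
        n_episodes, sum(returns) / n_episodes))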