Code Example #1
def trainSQL0(file_name="SQL0",
              env=GridworldEnv(1),
              batch_size=128,
              gamma=0.999,
              beta=5,
              eps_start=0.9,
              eps_end=0.05,
              eps_decay=1000,
              is_plot=False,
              num_episodes=200,
              max_num_steps_per_episode=1000,
              learning_rate=0.0001,
              memory_replay_size=10000,
              n_step=10,
              target_update=10):
    """
    Soft Q-learning training routine for environments that return an observation vector.
    Returns the trained model plus reward and duration logs.
    """

    num_actions = env.action_space.n
    input_size = env.observation_space.shape[0]
    model = DQN(input_size, num_actions)
    target_model = DQN(input_size, num_actions)
    target_model.load_state_dict(model.state_dict())
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
        target_model.cuda()

    memory = ReplayMemory(memory_replay_size, n_step, gamma)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0

    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        if i_episode != 0:
            print("Cur episode:", i_episode, "steps done:", episode_durations[-1],
                    "exploration factor:", eps_end + (eps_start - eps_end) * \
                    math.exp(-1. * steps_done / eps_decay), "reward:", env.episode_total_reward)
        # Initialize the environment and state
        state = torch.from_numpy(env.reset()).type(torch.FloatTensor).view(
            -1, input_size)

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            memory.push(model, target_model, state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the policy network)
            optimize_model(model, target_model, optimizer, memory, batch_size,
                           gamma, beta)  #### Difference w.r.t DQN
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(
                    env.episode_total_reward
                )  ##### Modify for OpenAI envs such as CartPole
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break
        if i_episode % target_update == 0 and i_episode != 0:
            target_model.load_state_dict(model.state_dict())

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-sql0-rewards', episode_rewards)
    np.save(file_name + '-sql0-durations', episode_durations)

    return model, episode_rewards, episode_durations
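
The `optimize_model` call above is not shown in this excerpt; the "Difference w.r.t DQN" it marks is the soft Bellman backup, in which the inverse temperature `beta` replaces the hard max over next-state Q-values with a log-sum-exp. A minimal sketch of that target computation, assuming `next_q` holds the target network's Q-values for the next states and `reward_batch` the sampled rewards (both names are placeholders):

# Soft state value: V(s') = (1 / beta) * log sum_a exp(beta * Q(s', a)).
# As beta grows, this approaches the hard max used by vanilla DQN.
next_state_values = torch.logsumexp(beta * next_q, dim=1) / beta
expected_q_values = reward_batch + gamma * next_state_values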
Code Example #2
File: trainingDQN.py  Project: mxxhcm/code
def trainDQN(file_name="DQN",
             env=GridworldEnv(1),
             batch_size=128,
             gamma=0.999,
             eps_start=0.9,
             eps_end=0.05,
             eps_decay=1000,
             is_plot=False,
             num_episodes=500,
             max_num_steps_per_episode=1000,
             learning_rate=0.0001,
             memory_replay_size=10000):
    """
    DQN training routine. Returns reward and duration logs.
    Optionally plots the environment screen.
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.title("")
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    model = DQN(num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []
    steps_done = 0  # total steps
    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        print("Cur episode:", i_episode, "steps done:", steps_done,
                "exploration factor:", eps_end + (eps_start - eps_end) * \
                math.exp(-1. * steps_done / eps_decay))
        # Initialize the environment and state
        env.reset()
        # last_screen = env.current_grid_map
        # (1, 1, 8, 8)
        current_screen = get_screen(env)
        state = current_screen  # - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            steps_done += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the policy network)
            optimize_model(model, optimizer, memory, batch_size, gamma)
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-dqn-rewards', episode_rewards)
    np.save(file_name + '-dqn-durations', episode_durations)

    return model, episode_rewards, episode_durations
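
A minimal usage sketch for the routine above, assuming `GridworldEnv`, `DQN` and the helper functions it calls are importable from the surrounding project:

model, rewards, durations = trainDQN(file_name="grid1",
                                     env=GridworldEnv(1),
                                     num_episodes=300,
                                     is_plot=False)
print("mean reward over the last 50 episodes:", np.mean(rewards[-50:]))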
Code Example #3
class DQNAgent():
    """Deep Q-learning agent."""

    # def __init__(self,
    # env, device=DEVICE, summary_writer=writer,  # noqa
    # hyperparameters=DQN_HYPERPARAMS):  # noqa

    rewards = []
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    # Memory = namedtuple(
    # 'Memory', ['obs', 'action', 'new_obs', 'reward', 'done'],
    # verbose=False, rename=False)
    Memory = namedtuple('Memory',
                        ['obs', 'action', 'new_obs', 'reward', 'done'],
                        rename=False)

    def __init__(self, env, hyperparameters, device, summary_writer=None):
        """Set parameters, initialize network."""

        state_space_shape = env.observation_space.shape
        action_space_size = env.action_space.n

        self.env = env

        self.online_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        self.target_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        # XXX maybe not really necessary?
        self.update_target_network()

        self.experience_replay = None

        self.accumulated_loss = []
        self.device = device

        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=hyperparameters['learning_rate'])

        self.double_DQN = hyperparameters['double_DQN']

        # Discount factor
        self.gamma = hyperparameters['gamma']

        # Number of steps for the n-step return used by the replay buffer
        self.n_multi_step = hyperparameters['n_multi_step']

        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.summary_writer = summary_writer

        # Greedy search hyperparameters
        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

    def get_max_action(self, obs):
        '''
        Forward pass of the NN to obtain the greedy action for the given observation
        '''
        # convert the observation to a tensor
        state_t = torch.tensor(np.array([obs])).to(self.device)

        # forward pass
        q_values_t = self.online_network(state_t)

        # get the maximum value of the output (i.e. the best action to take)
        _, act_t = torch.max(q_values_t, dim=1)

        return int(act_t.item())

    def act(self, obs):
        '''
        Greedy action output by the NN
        '''
        return self.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
        ε-greedy action: random with probability epsilon, greedy otherwise
        '''

        # In case of a noisy net, it takes a greedy action
        # if self.noisy_net:
        # return self.act(obs)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def update_target_network(self):
        """Update target network weights with current online network values."""

        self.target_network.load_state_dict(self.online_network.state_dict())

    def set_optimizer(self, learning_rate):
        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=learning_rate)

    def sample_and_optimize(self, batch_size):
        '''
        Sample batch_size memories from the buffer and run one optimization step
        '''

        # Only start learning once the buffer holds enough experience
        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            # l_loss = self.cc.optimize(mini_batch)
            l_loss = self.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.update_target_network()

    def optimize(self, mini_batch):
        '''
        Optimize the NN
        '''
        # reset the grads
        self.optimizer.zero_grad()
        # calculate the loss of the mini batch
        loss = self._calculate_loss(mini_batch)
        loss_v = loss.item()

        # do backpropagation
        loss.backward()
        # one step of optimization
        self.optimizer.step()

        return loss_v

    def _calculate_loss(self, mini_batch):
        '''
        Calculate the mini batch's MSE loss.
        Also supports the double DQN variant.
        '''

        states, actions, next_states, rewards, dones = mini_batch

        # convert the data to tensors
        states_t = torch.as_tensor(states, device=self.device)
        next_states_t = torch.as_tensor(next_states, device=self.device)
        actions_t = torch.as_tensor(actions, device=self.device)
        rewards_t = torch.as_tensor(rewards,
                                    dtype=torch.float32,
                                    device=self.device)

        done_t = torch.as_tensor(dones, dtype=torch.uint8,
                                 device=self.device)  # noqa

        # Value of the action taken previously (recorded in actions_v)
        # in state_t
        state_action_values = self.online_network(states_t).gather(
            1, actions_t[:, None]).squeeze(-1)

        # NB gather is a differentiable function

        # Next state value with Double DQN. (i.e. get the value predicted
        # by the target nn, of the best action predicted by the online nn)
        if self.double_DQN:
            double_max_action = self.online_network(next_states_t).max(1)[1]
            double_max_action = double_max_action.detach()
            target_output = self.target_network(next_states_t)

            # NB: [:, None] adds an extra dimension
            next_state_values = torch.gather(
                target_output, 1, double_max_action[:, None]).squeeze(-1)

        # Next state value in the normal configuration
        else:
            next_state_values = self.target_network(next_states_t).max(1)[0]

        next_state_values = next_state_values.detach()  # No backprop

        # Use the Bellman equation
        expected_state_action_values = rewards_t + \
            (self.gamma**self.n_multi_step) * next_state_values

        # compute the loss
        return nn.MSELoss()(state_action_values, expected_state_action_values)

    def reset_stats(self):
        '''
        Reset the agent's statistics
        '''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
        Store new feedback from the environment. The feedback consists of
        the new observation, the reward and the done flag.
        '''

        # Create the new memory and update the buffer
        new_memory = self.Memory(obs=obs,
                                 action=action,
                                 new_obs=new_obs,
                                 reward=reward,
                                 done=done)

        # Append it to the replay buffer
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1

        # TODO check this...
        # decrease epsilon
        self.epsilon = max(
            self.epsilon_final,
            self.epsilon_start - self.n_iter / self.epsilon_decay)

        self.total_reward += reward

    def print_info(self):
        '''
        Print information about the agent
        '''

        fps = (self.n_iter - self.ts_frame) / (time.time() - self.ts)

        # TODO replace with proper logger
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' %
              (self.n_iter, self.n_games, self.total_reward,
               np.mean(self.rewards[-40:]), self.epsilon, fps,
               np.mean(self.accumulated_loss)))

        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward,
                                           self.n_games)
            self.summary_writer.add_scalar('mean_reward',
                                           np.mean(self.rewards[-40:]),
                                           self.n_games)
            self.summary_writer.add_scalar('10_mean_reward',
                                           np.mean(self.rewards[-10:]),
                                           self.n_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon,
                                           self.n_games)
            self.summary_writer.add_scalar('loss',
                                           np.mean(self.accumulated_loss),
                                           self.n_games)
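
A minimal training-loop sketch for this agent, assuming an old-style Gym environment and a hyperparameter dict containing the keys read in __init__ (the environment name, DQN_HYPERPARAMS contents and batch size below are placeholders):

env = gym.make("PongNoFrameskip-v4")
agent = DQNAgent(env, hyperparameters=DQN_HYPERPARAMS, device="cuda")

for episode in range(500):
    obs = env.reset()
    done = False
    while not done:
        action = agent.act_eps_greedy(obs)
        new_obs, reward, done, _ = env.step(action)
        agent.add_env_feedback(obs, action, new_obs, reward, done)
        agent.sample_and_optimize(batch_size=32)
        obs = new_obs
    agent.print_info()
    agent.reset_stats()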
Code Example #4
class Agent:
    def __init__(
        self,
        state_size,
        action_size,
        n_agents,
        buffer_size: int = int(1e5),
        batch_size: int = 256,
        gamma: float = 0.995,
        tau: float = 1e-3,
        learning_rate: float = 7e-4,
        update_every: int = 4,
    ):
        """
        Initialize DQN agent using the agent-experience buffer

        Args:
            state_size (int): Size of the state observation returned by the
                environment
            action_size (int): Action space size
            n_agents (int): Number of agents in the environment
            buffer_size (int): Desired total experience buffer size
            batch_size (int): Mini-batch size
            gamma (float): Discount factor
            tau (float): For soft update of target parameters
            learning_rate (float): Learning rate
            update_every (int): Number of steps before target network update
        """

        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents

        # Q-Networks
        self.policy_net = DQN(state_size, action_size).to(device)
        self.target_net = DQN(state_size, action_size).to(device)

        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=learning_rate)
        self.memory = AgentReplayMemory(buffer_size, n_agents, state_size,
                                        device)

        self.t_step = 0

        self.update_every = update_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

    def step(self, states, actions, rewards, next_steps, done):

        self.memory.push_agent_actions(states, actions, rewards, next_steps,
                                       done)

        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if self.memory.at_capacity():
                experience = self.memory.sample(self.batch_size)
                self.learn(experience, self.gamma)

    def act(self, states, eps=0):
        states = torch.from_numpy(states).float().to(device)
        self.policy_net.eval()

        with torch.no_grad():
            action_values = self.policy_net(states)
        self.policy_net.train()

        r = np.random.random(size=self.n_agents)

        action_values = np.argmax(action_values.cpu().data.numpy(), axis=1)
        random_choices = np.random.randint(0,
                                           self.action_size,
                                           size=self.n_agents)

        return np.where(r > eps, action_values, random_choices)

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        criterion = torch.nn.MSELoss()
        self.policy_net.train()
        self.target_net.eval()

        # shape of the model output: (batch_size, action_dim)
        predicted_targets = self.policy_net(states).gather(1, actions)

        with torch.no_grad():
            labels_next = self.target_net(next_states).detach().max(
                1)[0].unsqueeze(1)

        # .detach() ->  Returns a new Tensor, detached from the current graph.
        labels = rewards + (gamma * labels_next * (1 - dones))

        loss = criterion(predicted_targets, labels).to(device)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.policy_net, self.target_net, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)
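
A minimal interaction sketch for this multi-agent wrapper, assuming a vectorized environment that returns one observation, reward and done flag per agent (`vec_env` and the sizes below are placeholders):

agent = Agent(state_size=37, action_size=4, n_agents=20)

eps = 1.0
for episode in range(1000):
    states = vec_env.reset()                      # shape: (n_agents, state_size)
    dones = np.zeros(agent.n_agents, dtype=bool)
    while not dones.all():
        actions = agent.act(states, eps)          # one action per agent
        next_states, rewards, dones, _ = vec_env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
    eps = max(0.01, eps * 0.995)                  # simple epsilon decay (an assumption)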
Code Example #5
File: train.py  Project: egordon9dev/MineRL-Project
n_steps = 8
n_actions = env.action_space.n
img_height = 64
img_width = 64
policy_net = None
network_path = "target_net.pt"
if os.path.exists(network_path):
    policy_net = torch.load(network_path).to(device)
    print("successfully loaded existing network from file: " + network_path)
else:
    policy_net = DQN(img_height, img_width, n_actions).to(device)
target_net = DQN(img_height, img_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.RMSprop(policy_net.parameters(), lr=LR)
memory = ReplayMemory(10000)

steps_done = 0
logfile = "train_log.txt"
with open(logfile, "w+") as f:
    f.write("CS4803 MineRL Project Logs:\n")
def append_log(s):
    with open(logfile, "a") as f:
        f.write(s + "\n")

def state_from_obs(obs):
    # get the camera image from the observation dict and convert the image
    # to the correct shape: (C, H, W)
    img = torch.tensor(obs["pov"] / 255.0, dtype=torch.float32)
    flat_img = torch.transpose(torch.transpose(img, 0, 1), 0, 2).reshape(64*64*3)
Code Example #6
File: DQN.py  Project: moustafa-7/DRL-in-120-days

def pick_action(observation, net):
    if random.random() < epsilon:
        return random.randint(0, num_actions - 1)

    action = torch.argmax(
        net(torch.tensor(observation).float().unsqueeze(0)))

    return action


net = DQN()
net.load_state_dict(torch.load("model.h5", map_location="cpu"))
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.01)
starttime = time.time()
buffer = collections.deque(maxlen=N)
lr = 1e-3


for i in range(num_episodes):
    observation = env.reset()
    observation = preprocess(observation)
    observation = [observation, observation, observation, observation]
    j = 0

    while(True):
        j += 1
        time.sleep(0.01 - ((time.time() - starttime) % 0.01))
        if j % 4:
Code Example #7
class AgentCartpole:
    def __init__(self, p):
        self.p = p
        self.target_dqn = DQN(self.p['HIDDEN_DIM'])
        self.eval_dqn = DQN(self.p['HIDDEN_DIM'])

        self.memory = ReplayMemory(self.p['MEMORY_SIZE'], [4])
        self.optimizer = torch.optim.Adam(self.eval_dqn.parameters(), self.p['LEARNING_RATE'])

        try:
            self.eval_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            self.target_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            print("Data has been loaded successfully")
        except:
            print("No data existing")

    def act(self, state):
        r = random.random()

        if r > self.p['EPSILON']:
            x = torch.FloatTensor(state)
            q_value = self.eval_dqn(x)
            action = torch.argmax(q_value).item()
            return action
        else:
            action = random.randint(0, self.p['N_ACTIONS']-1)
            return action

    def learn(self):
        if self.memory.index < self.p['BATCH_SIZE']:
            return

        # Get the state dicts of the evaluation and target networks
        eval_dict = self.eval_dqn.state_dict()
        target_dict = self.target_dqn.state_dict()

        # Updating the parameters of the target DQN
        for w in eval_dict:
            target_dict[w] = (1 - self.p['ALPHA']) * target_dict[w] + self.p['ALPHA'] * eval_dict[w]
        self.target_dqn.load_state_dict(target_dict)

        # Get a sample of size BATCH
        batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.memory.pop(self.p['BATCH_SIZE'])

        # Decay the epsilon threshold used by act() every time the agent learns
        if self.p["EPSILON"] > self.p["EPSILON_MIN"]:
            self.p["EPSILON"] *= self.p["EPSILON_DECAY"]

        loss = nn.MSELoss()

        # Compute q values for the current evaluation
        q_eval = self.eval_dqn(batch_state).gather(1, batch_action.long().unsqueeze(1)).reshape([self.p["BATCH_SIZE"]])

        # Compute the next state q values
        q_next = self.target_dqn(batch_next_state).detach()

        # Compute the target q values
        q_target = batch_reward + q_next.max(1)[0].reshape([self.p["BATCH_SIZE"]]) * self.p["GAMMA"]
        self.optimizer.zero_grad()
        l = loss(q_eval, q_target)
        l.backward()
        self.optimizer.step()

    def random(self):
        env = gym.make('CartPole-v1')
        env = env.unwrapped
        env.reset()
        rewards = []
        while True:
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            rewards.append(reward)
            if done:
                break

        env.close()
        plt.ylabel("Rewards")
        plt.xlabel("Nb interactions")
        plt.plot(rewards)
        plt.grid()
        plt.show()

    def dqn_cartpole(self):
        env = gym.make('CartPole-v1')
        env = env.unwrapped
        rewards = []
        for i in range(self.p['N_EPISODE']):
            state = env.reset()
            rewards.append(0)
            for s in range(self.p['N_STEPS']):
                # env.render()
                action = self.act(state)
                n_state, reward, done, _ = env.step(action)
                if done:
                    reward = -1
                rewards[-1] += reward

                self.memory.push(state, action, n_state, reward, done)
                self.learn()
                state = n_state

            print('Episode : ', i, ', Rewards : ', rewards[-1])

            # Save the eval model after each episode
            torch.save(self.eval_dqn.state_dict(), "Model/eval_dqn.data")

        # Display result
        n = 50
        res = sum(([a]*n for a in [sum(rewards[i:i+n])//n for i in range(0,len(rewards),n)]), [])
        print(rewards)
        plt.ylabel("Rewards")
        plt.xlabel("Episode")
        plt.plot(rewards)
        plt.plot(res)
        plt.grid()
        plt.legend(['Rewards per episode', 'Last 50 runs average'])
        plt.show()
        env.close()
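
A sketch of the hyperparameter dict this agent expects; the keys are the ones accessed above, while the values are placeholders rather than the original project's settings:

p = {
    'HIDDEN_DIM': 64,          # hidden layer size of the DQN
    'N_ACTIONS': 2,            # CartPole-v1 has two discrete actions
    'MEMORY_SIZE': 10000,
    'LEARNING_RATE': 1e-3,
    'BATCH_SIZE': 64,
    'GAMMA': 0.99,
    'ALPHA': 0.01,             # soft-update rate for the target DQN
    'EPSILON': 1.0,
    'EPSILON_MIN': 0.05,
    'EPSILON_DECAY': 0.995,
    'N_EPISODE': 300,
    'N_STEPS': 500,
}

agent = AgentCartpole(p)
agent.dqn_cartpole()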
Code Example #8
class TrainNQL:
    def __init__(self, epi, cfg=dcfg, validation=False):
        #cpu or cuda
        torch.cuda.empty_cache()
        self.device = cfg.device  #torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.state_dim = cfg.proc_frame_size  #State dimensionality 84x84.
        self.state_size = cfg.state_size
        #self.t_steps= tsteps
        self.t_eps = cfg.t_eps
        self.minibatch_size = cfg.minibatch_size
        # Q-learning parameters
        self.discount = cfg.discount  #Discount factor.
        self.replay_memory = cfg.replay_memory
        self.bufferSize = cfg.bufferSize
        self.target_q = cfg.target_q
        self.validation = validation
        if (validation):
            self.episode = epi
        else:
            self.episode = int(epi) - 1
        self.cfg = cfg

        modelGray = 'results/ep' + str(self.episode) + '/modelGray.net'
        modelDepth = 'results/ep' + str(self.episode) + '/modelDepth.net'
        tModelGray = 'results/ep' + str(self.episode) + '/tModelGray.net'
        tModelDepth = 'results/ep' + str(self.episode) + '/tModelDepth.net'

        if os.path.exists(modelGray) and os.path.exists(modelDepth):
            print("Loading model")
            self.gray_policy_net = torch.load(modelGray).to(self.device)
            self.gray_target_net = torch.load(tModelGray).to(self.device)
            self.depth_policy_net = torch.load(modelDepth).to(self.device)
            self.depth_target_net = torch.load(tModelDepth).to(self.device)

        else:
            print("New model")
            self.gray_policy_net = DQN(noutputs=cfg.noutputs,
                                       nfeats=cfg.nfeats,
                                       nstates=cfg.nstates,
                                       kernels=cfg.kernels,
                                       strides=cfg.strides,
                                       poolsize=cfg.poolsize).to(self.device)
            self.gray_target_net = DQN(noutputs=cfg.noutputs,
                                       nfeats=cfg.nfeats,
                                       nstates=cfg.nstates,
                                       kernels=cfg.kernels,
                                       strides=cfg.strides,
                                       poolsize=cfg.poolsize).to(self.device)
            self.depth_policy_net = DQN(noutputs=cfg.noutputs,
                                        nfeats=cfg.nfeats,
                                        nstates=cfg.nstates,
                                        kernels=cfg.kernels,
                                        strides=cfg.strides,
                                        poolsize=cfg.poolsize).to(self.device)
            self.depth_target_net = DQN(noutputs=cfg.noutputs,
                                        nfeats=cfg.nfeats,
                                        nstates=cfg.nstates,
                                        kernels=cfg.kernels,
                                        strides=cfg.strides,
                                        poolsize=cfg.poolsize).to(self.device)

        # Periodically clone the policy networks into the target networks
        if not validation and self.target_q and self.episode % self.target_q == 0:
            print("cloning")
            self.gray_target_net.load_state_dict(
                self.gray_policy_net.state_dict())
            self.depth_target_net.load_state_dict(
                self.depth_policy_net.state_dict())

        self.gray_target_net.eval()
        self.depth_target_net.eval()

        self.gray_optimizer = optim.RMSprop(self.gray_policy_net.parameters())
        self.depth_optimizer = optim.RMSprop(
            self.depth_policy_net.parameters())
        self.memory = ReplayMemory(self.replay_memory)

    def get_tensor_from_image(self, file):
        convert = T.Compose([
            T.ToPILImage(),
            T.Resize((self.state_dim, self.state_dim),
                     interpolation=Image.BILINEAR),
            T.ToTensor()
        ])
        screen = Image.open(file)
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)
        screen = convert(screen).unsqueeze(0).to(self.device)
        return screen

    def get_data(self, episode, tsteps):
        #images=torch.Tensor(tsteps,self.state_size,self.state_dim,self.state_dim).to(self.device)
        #depths=torch.Tensor(tsteps,self.state_size,self.state_dim,self.state_dim).to(self.device)
        images = []
        depths = []
        dirname_rgb = 'dataset/RGB/ep' + str(episode)
        dirname_dep = 'dataset/Depth/ep' + str(episode)
        for step in range(tsteps):
            #proc_image=torch.Tensor(self.state_size,self.state_dim,self.state_dim).to(self.device)
            #proc_depth=torch.Tensor(self.state_size,self.state_dim,self.state_dim).to(self.device)
            proc_image = []
            proc_depth = []

            dirname_rgb = 'dataset/RGB/ep' + str(episode)
            dirname_dep = 'dataset/Depth/ep' + str(episode)
            for i in range(self.state_size):
                grayfile = dirname_rgb + '/image_' + str(step + 1) + '_' + str(
                    i + 1) + '.png'
                depthfile = dirname_dep + '/depth_' + str(
                    step + 1) + '_' + str(i + 1) + '.png'
                #proc_image[i] = self.get_tensor_from_image(grayfile)
                #proc_depth[i] = self.get_tensor_from_image(depthfile)
                proc_image.append(grayfile)
                proc_depth.append(depthfile)
            #images[step]=proc_image
            #depths[step]=proc_depth
            images.append(proc_image)
            depths.append(proc_depth)
        return images, depths

    def load_data(self):

        rewards = torch.load('files/reward_history.dat')
        actions = torch.load('files/action_history.dat')
        ep_rewards = torch.load('files/ep_rewards.dat')

        print("Loading images")

        best_scores = range(len(actions))
        buffer_selection_mode = 'default'

        if (buffer_selection_mode == 'success_handshake'):
            eps_values = []
            for i in range(len(actions)):

                hspos = 0
                hsneg = 0
                for step in range(len(actions[i])):
                    if (len(actions[i]) > 0):
                        if actions[i][step] == 3:
                            if rewards[i][step] > 0:
                                hspos = hspos + 1
                            elif rewards[i][step] == -0.1:
                                hsneg = hsneg + 1
                accuracy = float(((hspos) / (hspos + hsneg)))
                eps_values.append(accuracy)

            best_scores = np.argsort(eps_values)

        for i in best_scores:
            print('Ep: ', i + 1)
            dirname_gray = 'dataset/RGB/ep' + str(i + 1)
            dirname_dep = 'dataset/Depth/ep' + str(i + 1)
            files = []
            if (os.path.exists(dirname_gray)):
                files = os.listdir(dirname_gray)

            k = 0
            for file in files:
                if re.match(r"image.*\.png", file):
                    k = k + 1
            k = int(k / 8)
            while (k % 4 != 0):
                k = k - 1
            if (k > self.bufferSize):
                k = self.bufferSize
            print(k)

            #os.system("free -h")
            #with torch.no_grad():
            images, depths = self.get_data(i + 1, k)
            print("Loading done")

            for step in range(k - 1):
                #print(len(rewards),i)
                #print(len(rewards[i]), step)
                reward = self.cfg.neutral_reward
                if rewards[i][step] >= 1:
                    reward = self.cfg.hs_success_reward
                elif rewards[i][step] < 0:
                    reward = self.cfg.hs_fail_reward
                reward = torch.tensor([reward], device=self.device)
                action = torch.tensor([[actions[i][step]]],
                                      device=self.device,
                                      dtype=torch.long)
                #image = images[step].unsqueeze(0).to(self.device)
                #depth = depths[step].unsqueeze(0).to(self.device)
                #next_image = images[step+1].unsqueeze(0).to(self.device)
                #next_depth = depths[step+1].unsqueeze(0).to(self.device)
                image = images[step]
                depth = depths[step]
                next_image = images[step + 1]
                next_depth = depths[step + 1]
                self.memory.push(image, depth, action, next_image, next_depth,
                                 reward)
                #print("Memory size: ",getsizeof(self.memory))
                #torch.cuda.empty_cache()

    def train(self):
        if len(self.memory) < self.minibatch_size:
            return
        for i in range(0, len(self.memory), self.minibatch_size):
            #transitions = self.memory.sample(self.minibatch_size)
            transitions = self.memory.pull(self.minibatch_size)

            print('Batch train: ' + str(int(i / self.minibatch_size) + 1) +
                  "/" + str(int(len(self.memory) / self.minibatch_size) + 1))

            aux_transitions = []
            for t in transitions:
                proc_sgray = torch.Tensor(self.state_size, self.state_dim,
                                          self.state_dim).to(self.device)
                proc_sdepth = torch.Tensor(self.state_size, self.state_dim,
                                           self.state_dim).to(self.device)
                proc_next_sgray = torch.Tensor(self.state_size, self.state_dim,
                                               self.state_dim).to(self.device)
                proc_next_sdepth = torch.Tensor(self.state_size,
                                                self.state_dim,
                                                self.state_dim).to(self.device)
                count = 0
                for sgray, sdepth, next_sgray, next_sdepth in zip(
                        t.sgray, t.sdepth, t.next_sgray, t.next_sdepth):
                    proc_sgray[count] = self.get_tensor_from_image(sgray)
                    proc_sdepth[count] = self.get_tensor_from_image(sdepth)
                    proc_next_sgray[count] = self.get_tensor_from_image(
                        next_sgray)
                    proc_next_sdepth[count] = self.get_tensor_from_image(
                        next_sdepth)
                    count += 1

                proc_sgray = proc_sgray.unsqueeze(0).to(self.device)
                proc_sdepth = proc_sdepth.unsqueeze(0).to(self.device)
                proc_next_sgray = proc_next_sgray.unsqueeze(0).to(self.device)
                proc_next_sdepth = proc_next_sdepth.unsqueeze(0).to(
                    self.device)
                #('sgray','sdepth','action','next_sgray','next_sdepth','reward')
                one_transition = Transition(proc_sgray, proc_sdepth, t.action,
                                            proc_next_sgray, proc_next_sdepth,
                                            t.reward)
                aux_transitions.append(one_transition)
            transitions = aux_transitions

            # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
            # detailed explanation). This converts batch-array of Transitions
            # to Transition of batch-arrays.
            batch = Transition(*zip(*transitions))
            #print(batch.sgray)

            # Compute a mask of non-final states and concatenate the batch elements
            # (a final state would've been the one after which simulation ended)
            gray_non_final_mask = torch.tensor(tuple(
                map(lambda s: s is not None, batch.next_sgray)),
                                               device=self.device,
                                               dtype=torch.bool)
            gray_non_final_next_states = torch.cat(
                [s for s in batch.next_sgray if s is not None])

            depth_non_final_mask = torch.tensor(tuple(
                map(lambda s: s is not None, batch.next_sdepth)),
                                                device=self.device,
                                                dtype=torch.bool)
            depth_non_final_next_states = torch.cat(
                [s for s in batch.next_sdepth if s is not None])
            sgray_batch = torch.cat(batch.sgray)
            sdepth_batch = torch.cat(batch.sdepth)

            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)

            # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
            # columns of actions taken. These are the actions which would've been taken
            # for each batch state according to policy_net
            sgray_action_values = self.gray_policy_net(sgray_batch).gather(
                1, action_batch)
            sdepth_action_values = self.depth_policy_net(sdepth_batch).gather(
                1, action_batch)

            # Compute V(s_{t+1}) for all next states.
            # Expected values of actions for non_final_next_states are computed based
            # on the "older" target_net; selecting their best reward with max(1)[0].
            # This is merged based on the mask, such that we'll have either the expected
            # state value or 0 in case the state was final.
            next_sgray_values = torch.zeros(self.minibatch_size,
                                            device=self.device)
            next_sgray_values[gray_non_final_mask] = self.gray_target_net(
                gray_non_final_next_states).max(1)[0].detach()

            next_sdepth_values = torch.zeros(self.minibatch_size,
                                             device=self.device)
            next_sdepth_values[depth_non_final_mask] = self.depth_target_net(
                depth_non_final_next_states).max(1)[0].detach()
            # Compute the expected Q values
            expected_sgray_action_values = (next_sgray_values *
                                            self.discount) + reward_batch
            expected_sdepth_action_values = (next_sdepth_values *
                                             self.discount) + reward_batch

            # Compute Huber loss
            gray_loss = F.smooth_l1_loss(
                sgray_action_values, expected_sgray_action_values.unsqueeze(1))
            depth_loss = F.smooth_l1_loss(
                sdepth_action_values,
                expected_sdepth_action_values.unsqueeze(1))

            # Optimize the model
            self.gray_optimizer.zero_grad()
            gray_loss.backward()
            for param in self.gray_policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.gray_optimizer.step()

            # Optimize the model
            self.depth_optimizer.zero_grad()
            depth_loss.backward()
            for param in self.depth_policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.depth_optimizer.step()
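
A minimal driver sketch for this trainer, assuming the default config object `dcfg` plus episode data on disk in the layout the loader expects (the episode number and save paths are placeholders mirroring the load paths above):

trainer = TrainNQL(epi="12", cfg=dcfg)   # loads the networks saved after episode 11
trainer.load_data()                      # fill the replay memory from the recorded episodes
trainer.train()                          # run one pass of minibatch updates

torch.save(trainer.gray_policy_net, 'results/ep12/modelGray.net')
torch.save(trainer.depth_policy_net, 'results/ep12/modelDepth.net')
torch.save(trainer.gray_target_net, 'results/ep12/tModelGray.net')
torch.save(trainer.depth_target_net, 'results/ep12/tModelDepth.net')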
Code Example #9
File: main.py  Project: Lanxiaozhi/tianshou
def test_dqn(args=get_args()):
    env = make_atari_env(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    # make environments
    train_envs = SubprocVectorEnv(
        [lambda: make_atari_env(args) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: make_atari_env_watch(args) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # define model
    net = DQN(*args.state_shape, args.action_shape,
              args.device).to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    # define policy
    policy = DQNPolicy(net,
                       optim,
                       args.gamma,
                       args.n_step,
                       target_update_freq=args.target_update_freq)
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # replay buffer: `save_last_obs` and `stack_num` can be removed together
    # when you have enough RAM
    buffer = VectorReplayBuffer(args.buffer_size,
                                buffer_num=len(train_envs),
                                ignore_obs_next=True,
                                save_only_last_obs=True,
                                stack_num=args.frames_stack)
    # collector
    train_collector = Collector(policy,
                                train_envs,
                                buffer,
                                exploration_noise=True)
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # log
    log_path = os.path.join(args.logdir, args.task, 'dqn')
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = BasicLogger(writer)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        if env.spec.reward_threshold:
            return mean_rewards >= env.spec.reward_threshold
        elif 'Pong' in args.task:
            return mean_rewards >= 20
        else:
            return False

    def train_fn(epoch, env_step):
        # nature DQN setting, linear decay in the first 1M steps
        if env_step <= 1e6:
            eps = args.eps_train - env_step / 1e6 * \
                (args.eps_train - args.eps_train_final)
        else:
            eps = args.eps_train_final
        policy.set_eps(eps)
        logger.write('train/eps', env_step, eps)

    def test_fn(epoch, env_step):
        policy.set_eps(args.eps_test)

    # watch agent's performance
    def watch():
        print("Setup test envs ...")
        policy.eval()
        policy.set_eps(args.eps_test)
        test_envs.seed(args.seed)
        if args.save_buffer_name:
            print(f"Generate buffer with size {args.buffer_size}")
            buffer = VectorReplayBuffer(args.buffer_size,
                                        buffer_num=len(test_envs),
                                        ignore_obs_next=True,
                                        save_only_last_obs=True,
                                        stack_num=args.frames_stack)
            collector = Collector(policy, test_envs, buffer)
            result = collector.collect(n_step=args.buffer_size)
            print(f"Save buffer into {args.save_buffer_name}")
            # Unfortunately, pickle will cause oom with 1M buffer size
            buffer.save_hdf5(args.save_buffer_name)
        else:
            print("Testing agent ...")
            test_collector.reset()
            result = test_collector.collect(n_episode=args.test_num,
                                            render=args.render)
        pprint.pprint(result)

    if args.watch:
        watch()
        exit(0)

    # test train_collector and start filling replay buffer
    train_collector.collect(n_step=args.batch_size * args.training_num)
    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.step_per_collect,
                               args.test_num,
                               args.batch_size,
                               train_fn=train_fn,
                               test_fn=test_fn,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               logger=logger,
                               update_per_step=args.update_per_step,
                               test_in_train=False)

    pprint.pprint(result)
    watch()
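
The script reads its configuration from get_args(); below is a sketch of an argparse-based get_args covering the attributes accessed above (the default values are placeholders, not the original project's settings):

import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--n-step', type=int, default=3)
    parser.add_argument('--target-update-freq', type=int, default=500)
    parser.add_argument('--buffer-size', type=int, default=100000)
    parser.add_argument('--frames-stack', type=int, default=4)
    parser.add_argument('--training-num', type=int, default=10)
    parser.add_argument('--test-num', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=100000)
    parser.add_argument('--step-per-collect', type=int, default=10)
    parser.add_argument('--update-per-step', type=float, default=0.1)
    parser.add_argument('--eps-train', type=float, default=1.0)
    parser.add_argument('--eps-train-final', type=float, default=0.05)
    parser.add_argument('--eps-test', type=float, default=0.005)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('--resume-path', type=str, default=None)
    parser.add_argument('--save-buffer-name', type=str, default=None)
    parser.add_argument('--watch', action='store_true', default=False)
    parser.add_argument('--render', type=float, default=0.)
    return parser.parse_args()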
Code Example #10
import math
import random

import torch.optim as optim

from network import Network, DQN
from memory import ReplayMemory

#%% hyper parameters
EPS_START = 0.9  # e-greedy threshold start value
EPS_END = 0.05  # e-greedy threshold end value
EPS_DECAY = 200  # e-greedy threshold decay
GAMMA = 0.8  # Q-learning discount factor
LR = 0.001  # NN optimizer learning rate
HIDDEN_LAYER = 256  # NN hidden layer size
BATCH_SIZE = 64  # Q-learning batch size

#%% DQN NETWORK ARCHITECTURE
model = DQN(4, 4, 4)
model.cuda()
optimizer = optim.Adam(model.parameters(), LR)

#%% SELECT ACTION USING GREEDY ALGORITHM
steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(
        -1. * steps_done / EPS_DECAY)
    steps_done += 1

    #print(state.shape)
    #print(eps_threshold)
    if sample > eps_threshold:
Code Example #11
import math
import os

import torch
import torch.optim as optim

from basic_game import Game, Direction
from dqn import DQNTrainer, next_epsilon
from network import DQN
from simple_play import display_game, print_screen
from tensor_helper import game2tensor

SLEEP = 0.1
SAVE_TIME = 1000
WIDTH = 20
HEIGHT = 10
DECAY_STEP = 2000
BATCH_SIZE = 1
SIDE = math.sqrt(WIDTH * HEIGHT)
MAX_DIST = math.ceil(math.sqrt(WIDTH**2 + HEIGHT**2))
policy_network = DQN(width=WIDTH, height=HEIGHT)
optimizer = optim.SGD(policy_network.parameters(), lr=1e-5)
POLICY_PATH = 'data/policy.pt'
if os.path.isfile(POLICY_PATH):
    policy_network = torch.load(POLICY_PATH)


class SnakeTrainer(DQNTrainer):
    def __init__(self, *args, **kwargs):
        super(SnakeTrainer, self).__init__(*args, **kwargs)
        self.snake_step_info = dict()

    def decide_epsilon(self, game: Game):
        snake_len = len(game.snake)
        decay = self.snake_step_info.get(snake_len, 1)
        epsilon, self.snake_step_info[snake_len] = next_epsilon(
            self.eps_start, self.eps_end, self.decay_step, decay)