Example #1
    def __init__(self, sess, state_size, action_size):
        self.sess = sess

        self.state_size = state_size
        self.action_size = action_size

        # hyperparameters
        self.batch_size = 32
        self.discount_factor = 0.99
        self.learning_rate = 0.00025

        # epsilon
        self.s_epsilon = 1.0
        self.e_epsilon = 0.01
        self.n_epsilon_decay = 100000
        self.epsilon = self.s_epsilon

        # replay buffer
        self.buffer = ReplayBuffer(50000)

        # placeholders
        self.actions = tf.placeholder(tf.int32, shape=None)
        self.targets = tf.placeholder(tf.float32, shape=None)

        # network
        self.policy_net = DQN({})
        self.target_net = DQN({})
        self.sess.run(tf.global_variables_initializer())
        self.update_target_network()

        # optimizer
        self.loss_op, self.train_op = self._build_op()
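The constructor above wires the `actions` and `targets` placeholders into `self._build_op()`, which is not shown in this snippet. A minimal TF1-style sketch of what such a method usually computes, assuming the policy network exposes its Q-output as `self.policy_net.q_value` (an assumption, not confirmed by the snippet):

    def _build_op(self):
        # Q-value of the action that was actually taken (one-hot mask over actions)
        one_hot = tf.one_hot(self.actions, self.action_size)
        q_value = tf.reduce_sum(self.policy_net.q_value * one_hot, axis=1)
        # Mean squared error against the TD targets fed through the placeholder
        loss_op = tf.reduce_mean(tf.square(self.targets - q_value))
        train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss_op)
        return loss_op, train_op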
Example #2
    def __init__(self, fps=50):
        self.GeneralReward = False
        self.net = Network(150, 450, 150, 650)
        self.updateRewardA = 0
        self.updateRewardB = 0
        self.updateIter = 0
        self.lossA = 0
        self.lossB = 0
        self.restart = False
        self.iteration = 0
        self.AgentA = DQN()
        self.AgentB = DQN()

        # Testing
        self.net = Network(150, 450, 150, 650)
        self.NetworkA = self.net.network(
            300, ysource=80, Ynew=650)  # Network A
        self.NetworkB = self.net.network(
            200, ysource=650, Ynew=80)  # Network B

        pygame.init()
        self.BLACK = (0, 0, 0)

        self.myFontA = pygame.font.SysFont("Times New Roman", 25)
        self.myFontB = pygame.font.SysFont("Times New Roman", 25)
        self.myFontIter = pygame.font.SysFont('Times New Roman', 25)

        self.FPS = fps
        self.fpsClock = pygame.time.Clock()

        self.nextplayer = np.random.choice(['A', 'B'])
Example #3
    def __init__(self):
        #init ROS node
        rospy.init_node('robot_control')

        #super calls to parent classes
        super(RobotController, self).__init__()

        #initializes the network with starting parameters
        self.network = DQN(.0003, .1, .25)
        self.network.start()
Example #4
    def __init__(self, name):
        self.name = name

        self.initializeProperties()
        self.QNetwork = DQN(self.imageSize, "QN", self.miniBatchSize)
        self.TDTarget = DQN(self.imageSize, "TD", self.miniBatchSize)
        self.sess = tf.Session()
        self.QNetwork.setSess(self.sess)
        self.TDTarget.setSess(self.sess)
        self.sess.run(tf.global_variables_initializer())
        self.synchronise()
Example #5
    def __init__(self, name, num_episodes=500):
        self.name = name
        self.num_episodes = num_episodes
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.env = gym.make(name).unwrapped
        self.env.reset()
        self.env_w = EnvWrapper(self.env, self.device)
        self.cfg = Config()
        self.cfg.n_actions = self.env.action_space.n
        self.cfg.policy_net = DQN(self.env_w.screen_height, self.env_w.screen_width,
                                  self.cfg.n_actions).to(self.device)
        self.cfg.target_net = DQN(self.env_w.screen_height, self.env_w.screen_width,
                                  self.cfg.n_actions).to(self.device)
        self.agent = Agent(self.env, self.env_w, self.device, self.cfg)
Example #6
    def __init__(self, p):
        self.p = p
        self.target_dqn = DQN(self.p['HIDDEN_DIM'])
        self.eval_dqn = DQN(self.p['HIDDEN_DIM'])

        self.memory = ReplayMemory(self.p['MEMORY_SIZE'], [4])
        self.optimizer = torch.optim.Adam(self.eval_dqn.parameters(), self.p['LEARNING_RATE'])

        try:
            self.eval_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            self.target_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            print("Data has been loaded successfully")
        except:
            print("No data existing")
Example #7
    def __init__(self, env, hyperparameters, device, summary_writer=None):
        """Set parameters, initialize network."""

        state_space_shape = env.observation_space.shape
        action_space_size = env.action_space.n

        self.env = env

        self.online_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        self.target_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        # XXX maybe not really necessary?
        self.update_target_network()

        self.experience_replay = None

        self.accumulated_loss = []
        self.device = device

        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=hyperparameters['learning_rate'])

        self.double_DQN = hyperparameters['double_DQN']

        # Discount factor
        self.gamma = hyperparameters['gamma']

        # XXX ???
        self.n_multi_step = hyperparameters['n_multi_step']

        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.summary_writer = summary_writer

        # Greedy search hyperparameters
        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']
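The constructor above reads every setting from a plain `hyperparameters` dict. The key names below are exactly the ones it looks up; the values are only illustrative placeholders:

    hyperparameters = {
        'learning_rate': 1e-4,          # Adam step size
        'double_DQN': True,             # use the Double DQN target
        'gamma': 0.99,                  # discount factor
        'n_multi_step': 2,              # n-step return length
        'buffer_capacity': 100000,      # replay buffer size
        'n_iter_update_target': 1000,   # target-network sync period (iterations)
        'buffer_start_size': 10000,     # warm-up transitions before learning
        'epsilon_start': 1.0,
        'epsilon_decay': 100000,
        'epsilon_final': 0.02,
    }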
Example #8
    def __init__(self, name, isBot):
        self.name = name
        self.isBot = isBot
        if not self.isBot:
            self.chosenAction = 0
            self.defineKeyboardListener()

        self.initializeProperties()
        self.QNetwork = DQN("QN{}".format(name), self.miniBatchSize)
        self.TDTarget = DQN("TD{}".format(name), self.miniBatchSize)
        self.sess = tf.Session()
        self.QNetwork.setSess(self.sess)
        self.TDTarget.setSess(self.sess)
        self.sess.run(tf.global_variables_initializer())
        self.synchronise()
Example #9
    def __init__(
        self,
        state_size,
        action_size,
        n_agents,
        buffer_size: int = int(1e5),
        batch_size: int = 256,
        gamma: float = 0.995,
        tau: float = 1e-3,
        learning_rate: float = 7e-4,
        update_every: int = 4,
    ):
        """
        Initialize DQN agent using the agent-experience buffer

        Args:
            state_size (int): Size of the state observation returned by the
                environment
            action_size (int): Action space size
            n_agents (int): Number of agents in the environment
            buffer_size (int): Desired total experience buffer size
            batch_size (int): Mini-batch size
            gamma (float): Discount factor
            tau (float): For soft update of target parameters
            learning_rate (float): Learning rate
            update_every (int): Number of steps before target network update
        """

        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents

        # Q-Networks
        self.policy_net = DQN(state_size, action_size).to(device)
        self.target_net = DQN(state_size, action_size).to(device)

        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=learning_rate)
        self.memory = AgentReplayMemory(buffer_size, n_agents, state_size,
                                        device)

        self.t_step = 0

        self.update_every = update_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
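`tau` is documented above as the soft-update coefficient but is only stored by the constructor. A sketch of how such agents commonly apply it later (a Polyak update of the target network; this method is an assumption, not part of the snippet):

    def soft_update(self, tau):
        # target <- tau * policy + (1 - tau) * target, parameter by parameter
        for t_param, p_param in zip(self.target_net.parameters(),
                                    self.policy_net.parameters()):
            t_param.data.copy_(tau * p_param.data + (1.0 - tau) * t_param.data)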
Example #10
def test():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            time.sleep(0.3)

        print('Games played: %d Score: %d' % (episode + 1, total_reward))
Example #11
class RobotController(CmdVelPublisher, ImageSubscriber, object):
    """
    Brain of the Robot, which inherits from ImageSubscriber and cmdVelPublisher, and initializes a network.
    Determines from ImageSubscriber how to move, which is executed in cmdVelPublisher, and these movements are
    optimized and learned through self.network
    """
    def __init__(self):
        #init ROS node
        rospy.init_node('robot_control')

        #super calls to parent classes
        super(RobotController, self).__init__()

        #initializes the network with starting parameters
        self.network = DQN(.0003, .1, .25)
        self.network.start()

    def robot_control(self, action):
        """
        Given an action, executes the specified behavior on the robot
        action:
         0 = forward
         1 = leftTurn
         2 = rightTurn
         3 = stop
        """
        try:
            if action < 0 or action > 3:
                raise ValueError("Action is invalid")
            self.state[action].__call__()
        except:
            # make robot stop
            print "Invalid action - stopping robot"
            self.state[3].__call__()

        self.sendMessage()
        rospy.sleep(.1)  # use desired action for 0.1 second
        self.state[3].__call__()  # set robot to stop for .1 second
        self.sendMessage()
        rospy.sleep(.1)

    def run(self):
        """
        The main run loop
        """
        r = rospy.Rate(10)
        while not rospy.is_shutdown():
            if self.cv_image is not None:
                #visualizes the binary image
                cv2.imshow('video_window', self.binary_image)
                cv2.waitKey(5)
                #feeds binary image into network to receive action with corresponding Q-values
                a, Q = self.network.feed_forward(self.binary_image)
                #moves based on the most probable action
                self.robot_control(a[0])
                #updates the network parameters based on what happened from the action step
                self.network.update(self.binary_image)
            r.sleep()

        self.network.stop()
Example #12
    def __init__(self, state_size, action_size, seed):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Initialize Q-Networks
        self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)

        # Initialize optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Initialize replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
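`t_step` is only initialized here; in agents written this way it is usually advanced in a `step()` method that learns every `UPDATE_EVERY` calls once the buffer holds a batch. A sketch of that pattern (the `add`, `sample` and `learn` helpers and the `GAMMA` constant are assumptions, not shown in this snippet):

    def step(self, state, action, reward, next_state, done):
        # Store the transition, then learn every UPDATE_EVERY steps
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)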
Example #13
EPS_DECAY = 50000
TARGET_UPDATE = 10
LR = 0.005
test_time = False

n_steps = 8
n_actions = env.action_space.n
img_height = 64
img_width = 64
policy_net = None
network_path = "target_net.pt"
if os.path.exists(network_path):
    policy_net = torch.load(network_path)
    print("successfully loaded existing network from file: " + network_path)
else:
    policy_net = DQN(img_height, img_width, n_actions)
target_net = DQN(img_height, img_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.RMSprop(policy_net.parameters(), lr=LR)
memory = ReplayMemory(10000)

steps_done = 0
logfile = "train_log.txt"
with open(logfile, "w+") as f:
    f.write("CS4803 MineRL Project Logs:\n")
def append_log(s):
    with open(logfile, "a") as f:
        f.write(s + "\n")

def state_from_obs(obs):
    pass  # body not included in this snippet
Example #14
def trainD(file_name="Distral_2col_SQL",
           list_of_envs=[GridworldEnv(5),
                         GridworldEnv(4),
                         GridworldEnv(6)],
           batch_size=128,
           gamma=0.999,
           alpha=0.8,
           beta=5,
           eps_start=0.9,
           eps_end=0.05,
           eps_decay=5,
           is_plot=False,
           num_episodes=200,
           max_num_steps_per_episode=1000,
           learning_rate=0.001,
           memory_replay_size=10000,
           memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    """
    num_actions = list_of_envs[0].action_space.n
    input_size = list_of_envs[0].observation_space.shape[0]
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(input_size, num_actions)
    models = [DQN(input_size, num_actions) for _ in range(0, num_envs)]
    memories = [
        ReplayMemory(memory_replay_size, memory_policy_size)
        for _ in range(0, num_envs)
    ]

    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    states = []
    for env in list_of_envs:
        states.append(
            torch.from_numpy(env.reset()).type(torch.FloatTensor).view(
                -1, input_size))

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode

        # Optimization is given by alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        for i_env, env in enumerate(list_of_envs):

            # select an action
            action = select_action(states[i_env], policy, models[i_env],
                                   num_actions, eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)

            steps_done[i_env] += 1
            current_time[i_env] += 1
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(states[i_env], action, next_state, reward,
                                 time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)

            # Update state
            states[i_env] = next_state

            # Check if agent reached target
            if done or current_time[i_env] >= max_num_steps_per_episode:
                if episodes_done[i_env] <= num_episodes:
                    print(
                        "ENV:", i_env, "iter:", episodes_done[i_env],
                        "\treward:{0:.2f}".format(env.episode_total_reward),
                        "\tit:", current_time[i_env], "\texp_factor:",
                        eps_end + (eps_start - eps_end) *
                        math.exp(-1. * episodes_done[i_env] / eps_decay))

                episode_rewards[i_env].append(env.episode_total_reward)
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0

                states[i_env] = torch.from_numpy(env.reset()).type(
                    torch.FloatTensor).view(-1, input_size)

                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        # Perform one step of the optimization on the Distilled policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()

    ## Store Results
    np.save(file_name + '-rewards', episode_rewards)
    np.save(file_name + '-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
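An illustrative call of the routine above, keeping its default environments and hyperparameters:

    models, policy, rewards, durations = trainD(file_name="Distral_2col_SQL",
                                                num_episodes=200,
                                                is_plot=False)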
Example #15
class DQNAgent():
    """Deep Q-learning agent."""

    # def __init__(self,
    # env, device=DEVICE, summary_writer=writer,  # noqa
    # hyperparameters=DQN_HYPERPARAMS):  # noqa

    rewards = []
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    # Memory = namedtuple(
    # 'Memory', ['obs', 'action', 'new_obs', 'reward', 'done'],
    # verbose=False, rename=False)
    Memory = namedtuple('Memory',
                        ['obs', 'action', 'new_obs', 'reward', 'done'],
                        rename=False)

    def __init__(self, env, hyperparameters, device, summary_writer=None):
        """Set parameters, initialize network."""

        state_space_shape = env.observation_space.shape
        action_space_size = env.action_space.n

        self.env = env

        self.online_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        self.target_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        # XXX maybe not really necessary?
        self.update_target_network()

        self.experience_replay = None

        self.accumulated_loss = []
        self.device = device

        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=hyperparameters['learning_rate'])

        self.double_DQN = hyperparameters['double_DQN']

        # Discount factor
        self.gamma = hyperparameters['gamma']

        # XXX ???
        self.n_multi_step = hyperparameters['n_multi_step']

        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.summary_writer = summary_writer

        # Greedy search hyperparameters
        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

    def get_max_action(self, obs):
        '''
        Forward pass of the NN to obtain the greedy action for the given observation
        '''
        # convert the observation into a tensor
        state_t = torch.tensor(np.array([obs])).to(self.device)

        # forward pass
        q_values_t = self.online_network(state_t)

        # get the maximum value of the output (i.e. the best action to take)
        _, act_t = torch.max(q_values_t, dim=1)

        return int(act_t.item())

    def act(self, obs):
        '''
        Greedy action outputted by the NN in the CentralControl
        '''
        return self.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
        E-greedy action
        '''

        # In case of a noisy net, it takes a greedy action
        # if self.noisy_net:
        # return self.act(obs)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def update_target_network(self):
        """Update target network weights with current online network values."""

        self.target_network.load_state_dict(self.online_network.state_dict())

    def set_optimizer(self, learning_rate):
        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=learning_rate)

    def sample_and_optimize(self, batch_size):
        '''
        Sample batch_size memories from the buffer and optimize them
        '''

        # This should be the part where it waits until it has enough
        # experience
        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            # l_loss = self.cc.optimize(mini_batch)
            l_loss = self.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.update_target_network()

    def optimize(self, mini_batch):
        '''
        Optimize the NN
        '''
        # reset the grads
        self.optimizer.zero_grad()
        # calculate the loss of the mini-batch
        loss = self._calulate_loss(mini_batch)
        loss_v = loss.item()

        # do backpropagation
        loss.backward()
        # one step of optimization
        self.optimizer.step()

        return loss_v

    def _calulate_loss(self, mini_batch):
        '''
        Calculate the mini-batch's MSE loss.
        It also supports the double DQN version.
        '''

        states, actions, next_states, rewards, dones = mini_batch

        # convert the data in tensors
        states_t = torch.as_tensor(states, device=self.device)
        next_states_t = torch.as_tensor(next_states, device=self.device)
        actions_t = torch.as_tensor(actions, device=self.device)
        rewards_t = torch.as_tensor(rewards,
                                    dtype=torch.float32,
                                    device=self.device)

        done_t = torch.as_tensor(dones, dtype=torch.uint8,
                                 device=self.device)  # noqa

        # Value of the action taken previously (recorded in actions_v)
        # in state_t
        state_action_values = self.online_network(states_t).gather(
            1, actions_t[:, None]).squeeze(-1)

        # NB gather is a differentiable function

        # Next state value with Double DQN. (i.e. get the value predicted
        # by the target nn, of the best action predicted by the online nn)
        if self.double_DQN:
            double_max_action = self.online_network(next_states_t).max(1)[1]
            double_max_action = double_max_action.detach()
            target_output = self.target_network(next_states_t)

            # NB: [:,None] add an extra dimension
            next_state_values = torch.gather(
                target_output, 1, double_max_action[:, None]).squeeze(-1)

        # Next state value in the normal configuration
        else:
            next_state_values = self.target_network(next_states_t).max(1)[0]

        next_state_values = next_state_values.detach()  # No backprop

        # Use the Bellman equation
        expected_state_action_values = rewards_t + \
            (self.gamma**self.n_multi_step) * next_state_values

        # compute the loss
        return nn.MSELoss()(state_action_values, expected_state_action_values)

    def reset_stats(self):
        '''
        Reset the agent's statistics
        '''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
        Acquire new feedback from the environment. The feedback consists of
        the new observation, the reward and the done flag.
        '''

        # Create the new memory and update the buffer
        new_memory = self.Memory(obs=obs,
                                 action=action,
                                 new_obs=new_obs,
                                 reward=reward,
                                 done=done)

        # Append it to the replay buffer
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1

        # TODO check this...
        # decrease epsilon
        self.epsilon = max(
            self.epsilon_final,
            self.epsilon_start - self.n_iter / self.epsilon_decay)

        self.total_reward += reward

    def print_info(self):
        '''
        Print information about the agent
        '''

        fps = (self.n_iter - self.ts_frame) / (time.time() - self.ts)

        # TODO replace with proper logger
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' %
              (self.n_iter, self.n_games, self.total_reward,
               np.mean(self.rewards[-40:]), self.epsilon, fps,
               np.mean(self.accumulated_loss)))

        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward,
                                           self.n_games)
            self.summary_writer.add_scalar('mean_reward',
                                           np.mean(self.rewards[-40:]),
                                           self.n_games)
            self.summary_writer.add_scalar('10_mean_reward',
                                           np.mean(self.rewards[-10:]),
                                           self.n_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon,
                                           self.n_games)
            self.summary_writer.add_scalar('loss',
                                           np.mean(self.accumulated_loss),
                                           self.n_games)
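A minimal, illustrative training loop for the `DQNAgent` above (the environment id, the `DQN_HYPERPARAMS` dict and the batch size of 32 are placeholders, not taken from the original source):

    env = gym.make('PongNoFrameskip-v4')
    agent = DQNAgent(env, DQN_HYPERPARAMS, device='cpu')
    obs = env.reset()
    for _ in range(100000):
        action = agent.act_eps_greedy(obs)
        new_obs, reward, done, _ = env.step(action)
        agent.add_env_feedback(obs, action, new_obs, reward, done)
        agent.sample_and_optimize(32)
        if done:
            agent.reset_stats()
            obs = env.reset()
        else:
            obs = new_obs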
Example #16
class AgentCartpole:
    def __init__(self, p):
        self.p = p
        self.target_dqn = DQN(self.p['HIDDEN_DIM'])
        self.eval_dqn = DQN(self.p['HIDDEN_DIM'])

        self.memory = ReplayMemory(self.p['MEMORY_SIZE'], [4])
        self.optimizer = torch.optim.Adam(self.eval_dqn.parameters(), self.p['LEARNING_RATE'])

        try:
            self.eval_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            self.target_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            print("Data has been loaded successfully")
        except:
            print("No data existing")

    def act(self, state):
        r = random.random()

        if r > self.p['EPSILON']:
            x = torch.FloatTensor(state)
            q_value = self.eval_dqn(x)
            action = torch.argmax(q_value).item()
            return action
        else:
            action = random.randint(0, self.p['N_ACTIONS']-1)
            return action

    def learn(self):
        if self.memory.index < self.p['BATCH_SIZE']:
            return

        # Get the state dicts of the evaluation and target networks
        eval_dict = self.eval_dqn.state_dict()
        target_dict = self.target_dqn.state_dict()

        # Updating the parameters of the target DQN
        for w in eval_dict:
            target_dict[w] = (1 - self.p['ALPHA']) * target_dict[w] + self.p['ALPHA'] * eval_dict[w]
        self.target_dqn.load_state_dict(target_dict)

        # Get a sample of size BATCH
        batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.memory.pop(self.p['BATCH_SIZE'])

        # Update the epsilon threshold used by act() every time the agent learns
        if self.p["EPSILON"] > self.p["EPSILON_MIN"]:
            self.p["EPSILON"] *= self.p["EPSILON_DECAY"]

        loss = nn.MSELoss()

        # Compute q values for the current evaluation
        q_eval = self.eval_dqn(batch_state).gather(1, batch_action.long().unsqueeze(1)).reshape([self.p["BATCH_SIZE"]])

        # Compute the next state q values
        q_next = self.target_dqn(batch_next_state).detach()

        # Compute the target q values
        q_target = batch_reward + q_next.max(1)[0].reshape([self.p["BATCH_SIZE"]]) * self.p["GAMMA"]
        self.optimizer.zero_grad()
        l = loss(q_eval, q_target)
        l.backward()
        self.optimizer.step()

    def random(self):
        env = gym.make('CartPole-v1')
        env = env.unwrapped
        env.reset()
        rewards = []
        while True:
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            rewards.append(reward)
            if done:
                break

        env.close()
        plt.ylabel("Rewards")
        plt.xlabel("Nb interactions")
        plt.plot(rewards)
        plt.grid()
        plt.show()

    def dqn_cartpole(self):
        env = gym.make('CartPole-v1')
        env = env.unwrapped
        rewards = []
        for i in range(self.p['N_EPISODE']):
            state = env.reset()
            rewards.append(0)
            for s in range(self.p['N_STEPS']):
                # env.render()
                action = self.act(state)
                n_state, reward, done, _ = env.step(action)
                if done:
                    reward = -1
                rewards[-1] += reward

                self.memory.push(state, action, n_state, reward, done)
                self.learn()
                state = n_state

            print('Episode : ', i, ', Rewards : ', rewards[-1])

            # Save the eval model after each episode
            torch.save(self.eval_dqn.state_dict(), "Model/eval_dqn.data")

        # Display result
        n = 50
        res = sum(([a]*n for a in [sum(rewards[i:i+n])//n for i in range(0,len(rewards),n)]), [])
        print(rewards)
        plt.ylabel("Rewards")
        plt.xlabel("Episode")
        plt.plot(rewards)
        plt.plot(res)
        plt.grid()
        plt.legend(['Rewards per episode', 'Last 50 runs average'])
        plt.show()
        env.close()
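The `p` dict for `AgentCartpole` needs at least the keys referenced in the class above. Illustrative values (CartPole-v1 has 2 actions and a 4-dimensional state, matching the `[4]` passed to `ReplayMemory`):

    p = {
        'HIDDEN_DIM': 64, 'MEMORY_SIZE': 10000, 'LEARNING_RATE': 1e-3,
        'EPSILON': 1.0, 'EPSILON_MIN': 0.05, 'EPSILON_DECAY': 0.995,
        'N_ACTIONS': 2, 'BATCH_SIZE': 32, 'ALPHA': 0.1, 'GAMMA': 0.99,
        'N_EPISODE': 300, 'N_STEPS': 500,
    }
    agent = AgentCartpole(p)
    agent.dqn_cartpole()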
Example #17
def test_ppo(args=get_args()):
    args.cfg_path = f"maps/{args.task}.cfg"
    args.wad_path = f"maps/{args.task}.wad"
    args.res = (args.skip_num, 84, 84)
    env = Env(args.cfg_path, args.frames_stack, args.res)
    args.state_shape = args.res
    args.action_shape = env.action_space.shape or env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    # make environments
    train_envs = ShmemVectorEnv([
        lambda: Env(args.cfg_path, args.frames_stack, args.res)
        for _ in range(args.training_num)
    ])
    test_envs = ShmemVectorEnv([
        lambda: Env(args.cfg_path, args.frames_stack, args.res, args.save_lmp)
        for _ in range(min(os.cpu_count() - 1, args.test_num))
    ])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # define model
    net = DQN(*args.state_shape,
              args.action_shape,
              device=args.device,
              features_only=True,
              output_dim=args.hidden_size)
    actor = Actor(net,
                  args.action_shape,
                  device=args.device,
                  softmax_output=False)
    critic = Critic(net, device=args.device)
    optim = torch.optim.Adam(ActorCritic(actor, critic).parameters(),
                             lr=args.lr)

    lr_scheduler = None
    if args.lr_decay:
        # decay learning rate to 0 linearly
        max_update_num = np.ceil(
            args.step_per_epoch / args.step_per_collect) * args.epoch

        lr_scheduler = LambdaLR(
            optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num)

    # define policy
    def dist(p):
        return torch.distributions.Categorical(logits=p)

    policy = PPOPolicy(actor,
                       critic,
                       optim,
                       dist,
                       discount_factor=args.gamma,
                       gae_lambda=args.gae_lambda,
                       max_grad_norm=args.max_grad_norm,
                       vf_coef=args.vf_coef,
                       ent_coef=args.ent_coef,
                       reward_normalization=args.rew_norm,
                       action_scaling=False,
                       lr_scheduler=lr_scheduler,
                       action_space=env.action_space,
                       eps_clip=args.eps_clip,
                       value_clip=args.value_clip,
                       dual_clip=args.dual_clip,
                       advantage_normalization=args.norm_adv,
                       recompute_advantage=args.recompute_adv).to(args.device)
    if args.icm_lr_scale > 0:
        feature_net = DQN(*args.state_shape,
                          args.action_shape,
                          device=args.device,
                          features_only=True,
                          output_dim=args.hidden_size)
        action_dim = np.prod(args.action_shape)
        feature_dim = feature_net.output_dim
        icm_net = IntrinsicCuriosityModule(feature_net.net,
                                           feature_dim,
                                           action_dim,
                                           device=args.device)
        icm_optim = torch.optim.Adam(icm_net.parameters(), lr=args.lr)
        policy = ICMPolicy(policy, icm_net, icm_optim, args.icm_lr_scale,
                           args.icm_reward_scale,
                           args.icm_forward_loss_weight).to(args.device)
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # replay buffer: `save_last_obs` and `stack_num` can be removed together
    # when you have enough RAM
    buffer = VectorReplayBuffer(args.buffer_size,
                                buffer_num=len(train_envs),
                                ignore_obs_next=True,
                                save_only_last_obs=True,
                                stack_num=args.frames_stack)
    # collector
    train_collector = Collector(policy,
                                train_envs,
                                buffer,
                                exploration_noise=True)
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # log
    log_name = 'ppo_icm' if args.icm_lr_scale > 0 else 'ppo'
    log_path = os.path.join(args.logdir, args.task, log_name)
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        if env.spec.reward_threshold:
            return mean_rewards >= env.spec.reward_threshold
        elif 'Pong' in args.task:
            return mean_rewards >= 20
        else:
            return False

    # watch agent's performance
    def watch():
        print("Setup test envs ...")
        policy.eval()
        test_envs.seed(args.seed)
        if args.save_buffer_name:
            print(f"Generate buffer with size {args.buffer_size}")
            buffer = VectorReplayBuffer(args.buffer_size,
                                        buffer_num=len(test_envs),
                                        ignore_obs_next=True,
                                        save_only_last_obs=True,
                                        stack_num=args.frames_stack)
            collector = Collector(policy,
                                  test_envs,
                                  buffer,
                                  exploration_noise=True)
            result = collector.collect(n_step=args.buffer_size)
            print(f"Save buffer into {args.save_buffer_name}")
            # Unfortunately, pickle will cause oom with 1M buffer size
            buffer.save_hdf5(args.save_buffer_name)
        else:
            print("Testing agent ...")
            test_collector.reset()
            result = test_collector.collect(n_episode=args.test_num,
                                            render=args.render)
        rew = result["rews"].mean()
        lens = result["lens"].mean() * args.skip_num
        print(f'Mean reward (over {result["n/ep"]} episodes): {rew}')
        print(f'Mean length (over {result["n/ep"]} episodes): {lens}')

    if args.watch:
        watch()
        exit(0)

    # test train_collector and start filling replay buffer
    train_collector.collect(n_step=args.batch_size * args.training_num)
    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              step_per_collect=args.step_per_collect,
                              stop_fn=stop_fn,
                              save_best_fn=save_best_fn,
                              logger=logger,
                              test_in_train=False)

    pprint.pprint(result)
    watch()
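The original entry point is not shown; such a script is typically invoked as follows (illustrative):

    if __name__ == '__main__':
        test_ppo(get_args())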
Example #18
class tennis:
    def __init__(self, fps=50):
        self.GeneralReward = False
        self.net = Network(150, 450, 150, 650)
        self.updateRewardA = 0
        self.updateRewardB = 0
        self.updateIter = 0
        self.lossA = 0
        self.lossB = 0
        self.restart = False
        self.iteration = 0
        self.AgentA = DQN()
        self.AgentB = DQN()

        # Testing
        self.net = Network(150, 450, 150, 650)
        self.NetworkA = self.net.network(
            300, ysource=80, Ynew=650)  # Network A
        self.NetworkB = self.net.network(
            200, ysource=650, Ynew=80)  # Network B

        pygame.init()
        self.BLACK = (0, 0, 0)

        self.myFontA = pygame.font.SysFont("Times New Roman", 25)
        self.myFontB = pygame.font.SysFont("Times New Roman", 25)
        self.myFontIter = pygame.font.SysFont('Times New Roman', 25)

        self.FPS = fps
        self.fpsClock = pygame.time.Clock()

        self.nextplayer = np.random.choice(['A', 'B'])

    def setWindow(self):

        # set up the window
        self.DISPLAYSURF = pygame.display.set_mode((600, 750), 0, 32)
        pygame.display.set_caption(
            'REINFORCEMENT LEARNING (DQN) - TABLE TENNIS')
        # set up the colors
        self.BLACK = (0, 0, 0)
        self.WHITE = (255, 255, 255)
        self.RED = (255, 0, 0)
        self.GREEN = (0, 255, 0)
        self.BLUE = (0, 0, 255)

        return

    def display(self):
        self.setWindow()
        self.DISPLAYSURF.fill(self.WHITE)
        pygame.draw.rect(self.DISPLAYSURF, self.BLACK, (50, 100, 500, 550))
        pygame.draw.rect(self.DISPLAYSURF, self.RED, (50, 365, 500, 20))
        return

    def reset(self):
        return

    def evaluate_state_from_last_coordinate(self, c):
        """
        cmax: 550
        cmin: 50

        c definitely will be between 50 and 550.
        """
        if c >= 50 and c <= 550:
            return int(c/50 - 1)
        else:
            return 0

    def evaluate_action(self, diff):

        if (int(diff) <= 50):
            return True
        else:
            return False

    def randomVal(self, action):
        "action is a probability of values between 0 and 1"
        val =  (action*500) + 50
        return val

    def play(self, action, count=0, play = 'A'):
        # play = A implies compute player A's next play.
        # play = B implies compute player B's next play.
        
        if play == 'A':
            # playerA should play
            if count == 0:
                self.NetworkA = self.net.network(
                    self.ballx, ysource=80, Ynew=650)  # Network A
                self.bally = self.NetworkA[1][count]
                self.ballx = self.NetworkA[0][count]

                if self.GeneralReward == True:
                    self.playerax = self.randomVal(action)
                else:
                    self.playerax = self.ballx


            else:
                self.ballx = self.NetworkA[0][count]
                self.bally = self.NetworkA[1][count]

            obsOne = self.evaluate_state_from_last_coordinate(
                int(self.ballx))  # last state of the ball
            obsTwo = self.evaluate_state_from_last_coordinate(
                int(self.playerbx))  # evaluate player bx
            diff = np.abs(self.ballx - self.playerbx)
            obs = obsTwo
            reward = self.evaluate_action(diff)
            done = True
            info = str(diff)

        else:
            # playerB should play
            if count == 0:
                self.NetworkB = self.net.network(
                    self.ballx, ysource=650, Ynew=80)  # Network B
                self.bally = self.NetworkB[1][count]
                self.ballx = self.NetworkB[0][count]

                if self.GeneralReward == True:
                    self.playerbx = self.randomVal(action)
                else:
                    self.playerbx = self.ballx


            else:
                self.ballx = self.NetworkB[0][count]
                self.bally = self.NetworkB[1][count]

            obsOne = self.evaluate_state_from_last_coordinate(
                int(self.ballx))  # last state of the ball
            obsTwo = self.evaluate_state_from_last_coordinate(
                int(self.playerax))  # evaluate player ax
            diff = np.abs(self.ballx - self.playerax)
            obs = obsTwo
            reward = self.evaluate_action(diff)
            done = True
            info = str(diff)

        return obs, reward, done, info

    def computeLoss(self, reward, loss = 'A'):
        # loss = A, implies compute loss of player A, otherwise, compute Player B loss.
        if loss == 'A':
            if reward == 0:
                self.lossA += 1
            else:
                self.lossA += 0
        else:
            if reward == 0:
                self.lossB += 1
            else:
                self.lossB += 0
        return

    def execute(self, state, iteration, count, player = 'A'):
        if player == 'B':
            stateB = state
            # Online DQN evaluates what to do
            
            try:
                q_valueB = self.AgentB.model.predict([stateB])
            except:
                q_valueB = 0
            actionB = self.AgentB.epsilon_greedy(q_valueB, iteration)

            # Online DQN plays
            obsB, rewardB, doneB, infoB = self.play(
                action=actionB, count=count, play = 'B')
            next_stateB = actionB

            # Let's memorize what just happened
            self.AgentB.replay_memory.append(
                (stateB, actionB, rewardB, next_stateB, 1.0 - doneB))
            stateB = next_stateB

            output = (q_valueB, actionB, obsB, rewardB, doneB, infoB, next_stateB,  actionB, stateB)

        else:
            stateA = state
            # Online DQN evaluates what to do
            # arr = np.array([stateA])
            try:
                q_valueA = self.AgentA.model.predict([stateA])
            except:
                q_valueA = 0
            actionA = self.AgentA.epsilon_greedy(q_valueA, iteration)

            # Online DQN plays
            obsA, rewardA, doneA, infoA = self.play(
                action=actionA, count=count, play = 'A')
            next_stateA = actionA

            # Let's memorize what just happened
            self.AgentA.replay_memory.append(
                (stateA, actionA, rewardA, next_stateA, 1.0 - doneA))
            stateA = next_stateA

            output = (q_valueA, actionA, obsA, rewardA, doneA, infoA, next_stateA,  actionA, stateA)

        return output

    def trainOnlineDQN(self, player = 'A'):
        if player == 'A':
            X_state_val, X_action_val, rewards, X_next_state_val, continues = (
                self.AgentA.sample_memories(self.AgentA.batch_size))
            arr = [X_next_state_val]
            next_q_values = self.AgentA.model.predict(arr)
            max_next_q_values = np.max(
                next_q_values, axis=1, keepdims=True)
            y_val = rewards + continues * self.AgentA.discount_rate * max_next_q_values

            # Train the online DQN
            self.AgentA.model.fit(X_state_val, tf.keras.utils.to_categorical(
                X_next_state_val, num_classes=10), verbose=0)
        else:
            X_state_val, X_action_val, rewards, X_next_state_val, continues = (
                self.AgentB.sample_memories(self.AgentB.batch_size))
            arr = [X_next_state_val]
            next_q_values = self.AgentB.model.predict(arr)
            max_next_q_values = np.max(
                next_q_values, axis=1, keepdims=True)
            y_val = rewards + continues * self.AgentB.discount_rate * max_next_q_values

            # Train the online DQN
            self.AgentB.model.fit(X_state_val, tf.keras.utils.to_categorical(
                X_next_state_val, num_classes=10), verbose=0)


        return True

    def show_board(self):
        self.display()
        # CHECK BALL MOVEMENT
        self.DISPLAYSURF.blit(self.PLAYERA, (self.playerax, 50))
        self.DISPLAYSURF.blit(self.PLAYERB, (self.playerbx, 650))
        self.DISPLAYSURF.blit(self.ball, (self.ballx, self.bally))
        self.DISPLAYSURF.blit(self.randNumLabelA, (20, 15))
        self.DISPLAYSURF.blit(self.randNumLabelB, (450, 15))


        pygame.display.update()
        self.fpsClock.tick(self.FPS)

        for event in pygame.event.get():

            if event.type == QUIT:
                # self.AgentA.model.save('models/AgentA.h5')
                # self.AgentB.model.save('models/AgentB.h5')
                pygame.quit()
                sys.exit()
        return 


    def step(self, action):
        # stepOutput: reward, next_state, done
        # action represents the next player to play; action can be either {playerA: 0, playerB: 1}
        # display team players
        self.PLAYERA = pygame.image.load('Images/padB.png')
        self.PLAYERA = pygame.transform.scale(self.PLAYERA, (50, 50))
        self.PLAYERB = pygame.image.load('Images/padA.png')
        self.PLAYERB = pygame.transform.scale(self.PLAYERB, (50, 50))
        self.ball = pygame.image.load('Images/ball.png')
        self.ball = pygame.transform.scale(self.ball, (15, 15))

        self.playerax = 150
        self.playerbx = 250

        self.ballx = 250
        self.bally = 300

        # player A starts by playing with state 0
        obsA, rewardA, doneA, infoA = 0, False, False, ''
        obsB, rewardB, doneB, infoB = 0, False, False, ''
        state = 0
        stateA = 0
        stateB = 0
        next_stateA = 0
        next_stateB = 0
        iteration = self.iteration
        actionA = 0
        actionB = 0
        restart = False

        
        self.display()
        self.randNumLabelA = self.myFontA.render(
            'Score A: '+str(self.updateRewardA), 1, self.BLACK)
        self.randNumLabelB = self.myFontB.render(
            'Score B: '+str(self.updateRewardB), 1, self.BLACK)

        nextplayer = self.nextplayer

        if self.nextplayer == 'A':
            for count in range(50):
                if count == 0:
                    output = self.execute(state, iteration, count, player = nextplayer)
                    q_valueA, actionA, obsA, rewardA, doneA, infoA, next_stateA,  actionA, stateA = output
                    state = next_stateA


                elif count == 49:

                    output = self.execute(state, iteration, count, player = 'A')
                    q_valueA, actionA, obsA, rewardA, doneA, infoA, next_stateA,  actionA, stateA = output
                    state = next_stateA

                    self.updateRewardA += rewardA
                    self.computeLoss(rewardA, loss = 'A')

                    # restart the game if player A fails to get the ball, and let B start the game
                    if rewardA == 0:
                        self.restart = True
                        time.sleep(0.5)
                        self.nextplayer = 'B'
                        self.GeneralReward = False
                    else:
                        self.restart = False
                        self.GeneralReward = True

                    # Sample memories and use the target DQN to produce the target Q-Value
                    self.trainOnlineDQN(player = 'A')

                    self.nextplayer = 'B'
                    self.updateIter += 1


                else:
                    output = self.execute(state, iteration, count, player = 'A')
                    q_valueA, actionA, obsA, rewardA, doneA, infoA, next_stateA,  actionA, stateA = output
                    state = next_stateA

                stepOutput = rewardA, next_stateA, doneA
                self.show_board()

        else:
            for count in range(50):
                if count == 0:
                    output = self.execute(state, iteration, count, player = 'B')
                    q_valueB, actionB, obsB, rewardB, doneB, infoB, next_stateB,  actionB, stateB = output
                    state = next_stateB

                elif count == 49:

                    output = self.execute(state, iteration, count, player = 'B')
                    q_valueB, actionB, obsB, rewardB, doneB, infoB, next_stateB,  actionB, stateB = output
                    state = next_stateB

                    self.updateRewardB += rewardB
                    self.computeLoss(rewardB, loss = 'B')

                    # restart the game if player B fails to get the ball, and let A start the game
                    if rewardB == 0:
                        self.restart = True
                        time.sleep(0.5)
                        self.GeneralReward = False
                        self.nextplayer = 'A'
                    else:
                        self.restart = False
                        self.GeneralReward = True

                    # Sample memories and use the target DQN to produce the target Q-Value
                    self.trainOnlineDQN(player = 'B')

                    self.nextplayer = 'A'
                    self.updateIter += 1
                    # evaluate B

                else:
                    output = self.execute(state, iteration, count, player = 'B')
                    q_valueB, actionB, obsB, rewardB, doneB, infoB, next_stateB,  actionB, stateB = output
                    state = next_stateB

                stepOutput = rewardB, next_stateB, doneB
                
                self.show_board() 

        self.iteration += 1 # keep track of the total number of iterations conducted
        return stepOutput
Example #19
def trainSQL0(file_name="SQL0",
              env=GridworldEnv(1),
              batch_size=128,
              gamma=0.999,
              beta=5,
              eps_start=0.9,
              eps_end=0.05,
              eps_decay=1000,
              is_plot=False,
              num_episodes=200,
              max_num_steps_per_episode=1000,
              learning_rate=0.0001,
              memory_replay_size=10000,
              n_step=10,
              target_update=10):
    """
    Soft Q-learning training routine when the observation vector is the input.
    Returns rewards and durations logs.
    """

    num_actions = env.action_space.n
    input_size = env.observation_space.shape[0]
    model = DQN(input_size, num_actions)
    target_model = DQN(input_size, num_actions)
    target_model.load_state_dict(model.state_dict())
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size, n_step, gamma)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0

    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        if i_episode != 0:
            print("Cur episode:", i_episode, "steps done:", episode_durations[-1],
                    "exploration factor:", eps_end + (eps_start - eps_end) * \
                    math.exp(-1. * steps_done / eps_decay), "reward:", env.episode_total_reward)
        # Initialize the environment and state
        state = torch.from_numpy(env.reset()).type(torch.FloatTensor).view(
            -1, input_size)

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            memory.push(model, target_model, state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, target_model, optimizer, memory, batch_size,
                           gamma, beta)  #### Difference w.r.t DQN
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(
                    env.episode_total_reward
                )  ##### Modify for OpenAI envs such as CartPole
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break
        if i_episode % target_update == 0 and i_episode != 0:
            target_model.load_state_dict(model.state_dict())

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-sql0-rewards', episode_rewards)
    np.save(file_name + '-sql0-durations', episode_durations)

    return model, episode_rewards, episode_durations
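An illustrative call of the routine above with its defaults:

    model, rewards, durations = trainSQL0(file_name="SQL0",
                                          env=GridworldEnv(1),
                                          num_episodes=200)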
Example #20
def train():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()

    epsilon = 1.0
    time_step = 0
    total_reward_list = []

    # Start the games.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            if episode > OBSERVE:
                epsilon -= 1 / 1000

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        print('Games played: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
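
# The module-level constants referenced in train() (SCREEN_WIDTH, NUM_ACTION,
# OBSERVE, ...) are defined elsewhere in the original project. The values below
# are illustrative placeholders only, shown together with the usual entry point:
MAX_EPISODE = 10000            # total training episodes (placeholder)
TARGET_UPDATE_INTERVAL = 1000  # steps between target-network syncs (placeholder)
TRAIN_INTERVAL = 4             # steps between gradient updates (placeholder)
OBSERVE = 100                  # warm-up episodes before epsilon decays (placeholder)
NUM_ACTION = 3                 # e.g. left / stay / right (placeholder)
SCREEN_WIDTH, SCREEN_HEIGHT = 6, 10  # game grid size (placeholder)

if __name__ == '__main__':
    train()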
Beispiel #21
0
    def __init__(self, epi, cfg=dcfg, validation=False):
        #cpu or cuda
        torch.cuda.empty_cache()
        self.device = cfg.device  #torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.state_dim = cfg.proc_frame_size  #State dimensionality 84x84.
        self.state_size = cfg.state_size
        #self.t_steps= tsteps
        self.t_eps = cfg.t_eps
        self.minibatch_size = cfg.minibatch_size
        # Q-learning parameters
        self.discount = cfg.discount  #Discount factor.
        self.replay_memory = cfg.replay_memory
        self.bufferSize = cfg.bufferSize
        self.target_q = cfg.target_q
        self.validation = validation
        if (validation):
            self.episode = epi
        else:
            self.episode = int(epi) - 1
        self.cfg = cfg

        modelGray = 'results/ep' + str(self.episode) + '/modelGray.net'
        modelDepth = 'results/ep' + str(self.episode) + '/modelDepth.net'
        tModelGray = 'results/ep' + str(self.episode) + '/tModelGray.net'
        tModelDepth = 'results/ep' + str(self.episode) + '/tModelDepth.net'

        if os.path.exists(modelGray) and os.path.exists(modelDepth):
            print("Loading model")
            self.gray_policy_net = torch.load(modelGray).to(self.device)
            self.gray_target_net = torch.load(tModelGray).to(self.device)
            self.depth_policy_net = torch.load(modelDepth).to(self.device)
            self.depth_target_net = torch.load(tModelDepth).to(self.device)

        else:
            print("New model")
            self.gray_policy_net = DQN(noutputs=cfg.noutputs,
                                       nfeats=cfg.nfeats,
                                       nstates=cfg.nstates,
                                       kernels=cfg.kernels,
                                       strides=cfg.strides,
                                       poolsize=cfg.poolsize).to(self.device)
            self.gray_target_net = DQN(noutputs=cfg.noutputs,
                                       nfeats=cfg.nfeats,
                                       nstates=cfg.nstates,
                                       kernels=cfg.kernels,
                                       strides=cfg.strides,
                                       poolsize=cfg.poolsize).to(self.device)
            self.depth_policy_net = DQN(noutputs=cfg.noutputs,
                                        nfeats=cfg.nfeats,
                                        nstates=cfg.nstates,
                                        kernels=cfg.kernels,
                                        strides=cfg.strides,
                                        poolsize=cfg.poolsize).to(self.device)
            self.depth_target_net = DQN(noutputs=cfg.noutputs,
                                        nfeats=cfg.nfeats,
                                        nstates=cfg.nstates,
                                        kernels=cfg.kernels,
                                        strides=cfg.strides,
                                        poolsize=cfg.poolsize).to(self.device)

        if not validation and self.target_q and self.episode % self.target_q == 0:
            print("cloning")
            self.depth_policy_net = DQN(noutputs=cfg.noutputs,
                                        nfeats=cfg.nfeats,
                                        nstates=cfg.nstates,
                                        kernels=cfg.kernels,
                                        strides=cfg.strides,
                                        poolsize=cfg.poolsize).to(self.device)
            self.depth_target_net = DQN(noutputs=cfg.noutputs,
                                        nfeats=cfg.nfeats,
                                        nstates=cfg.nstates,
                                        kernels=cfg.kernels,
                                        strides=cfg.strides,
                                        poolsize=cfg.poolsize).to(self.device)

        # copy the policy-network weights into the target networks before
        # freezing them for evaluation
        self.gray_target_net.load_state_dict(self.gray_policy_net.state_dict())
        self.gray_target_net.eval()

        self.depth_target_net.load_state_dict(
            self.depth_policy_net.state_dict())
        self.depth_target_net.eval()

        self.gray_optimizer = optim.RMSprop(self.gray_policy_net.parameters())
        self.depth_optimizer = optim.RMSprop(
            self.depth_policy_net.parameters())
        self.memory = ReplayMemory(self.replay_memory)
class Player :

    def __init__(self, name) :
        self.name = name

        self.initializeProperties()
        self.QNetwork = DQN(self.imageSize, "QN", self.miniBatchSize)
        self.TDTarget = DQN(self.imageSize, "TD", self.miniBatchSize)
        self.sess = tf.Session()
        self.QNetwork.setSess(self.sess)
        self.TDTarget.setSess(self.sess)
        self.sess.run(tf.global_variables_initializer())
        self.synchronise()

    def initializeProperties(self) :
        # Q Network Constants
        self.imageSize = 80
        self.synchronisationPeriod = 500

        # Constants
        self.explorationRate = 0.999

        # Behaviour when playing & training
        self.trainable = True
        self.exploiting = False

        # Statistics
        self.score = 0

        # Training
        self.trainingData = []
        self.maxBatchSize = 10000
        # trainingData will not have more than maxBatchSize elements
        self.miniBatchSize = 32
        self.miniBatch = []
        self.startTraining = 1000
        # the training will happen iff we have more than startTraining data in trainingData

        print("Properties initialized")

    def training(self, step) :
        if not self.trainable or len(self.trainingData) < self.startTraining:
            return
        if step % self.synchronisationPeriod == 0 :
            self.synchronise()
        self.miniBatch = random.sample(self.trainingData, self.miniBatchSize)
        states, actions, rewards, nextStates = zip(*self.miniBatch)
        output = self.TDTarget.computeTarget(nextStates, rewards)
        self.QNetwork.training(states, output, actions)

    def play(self) :
        if self.exploiting or random.random() > self.explorationRate :
            return self.QNetwork.evaluate(self.buffer)
        else :
            return int(random.random() < 0.9)

    def updateConstants(self, learningRate = None, explorationRate = None) :
        self.QNetwork.updateConstants(learningRate)
        if not isinstance(explorationRate, type(None)) :
            self.explorationRate = explorationRate

    def resetStats(self) :
        self.score = 0

    def updateStats(self, reward) :
        if reward == 1 :
            self.score += 1

    def displayStats(self) :
        # print("{} victories & {} defeats".format(self.gamesWon, self.gamesLost))
        print(self.score)

    def addStateSequence(self, action, reward, nS) :
        # nS = np.transpose(nS, [1, 2, 0])
        if self.trainable :
            self.trainingData.append([self.buffer, action, reward, nS])
            while len(self.trainingData) > self.maxBatchSize :
                del self.trainingData[0]
        self.buffer = nS

    def saveQNetwork(self, path, global_step = None) :
        self.QNetwork.saveQNetwork(path, global_step)

    def restoreQNetwork(self, path, global_step = None):
        self.QNetwork.restoreQNetwork(path, global_step)

    def setBehaviour(self, isTraining) :
        self.trainable = isTraining
        self.exploiting = not isTraining

    def synchronise(self):
        e1_params = [t for t in tf.trainable_variables() if t.name.startswith(self.QNetwork.scope)]
        e1_params = sorted(e1_params, key=lambda v: v.name)
        e2_params = [t for t in tf.trainable_variables() if t.name.startswith(self.TDTarget.scope)]
        e2_params = sorted(e2_params, key=lambda v: v.name)

        update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            update_ops.append(op)
        self.sess.run(update_ops)
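
# A hypothetical driver loop illustrating how this Player is meant to be used;
# the `env` object and its reset()/step() interface are assumptions for the
# sketch, not part of the original example.
def run(env, num_steps=100000):
    player = Player("left")
    player.buffer = env.reset()            # prime the frame buffer
    for step in range(num_steps):
        action = player.play()
        next_state, reward, done = env.step(action)
        player.addStateSequence(action, reward, next_state)
        player.updateStats(reward)
        player.training(step)
        if done:
            player.displayStats()
            player.resetStats()
            player.buffer = env.reset()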
Beispiel #23
0
class Agent:
    def __init__(
        self,
        state_size,
        action_size,
        n_agents,
        buffer_size: int = 1e5,
        batch_size: int = 256,
        gamma: float = 0.995,
        tau: float = 1e-3,
        learning_rate: float = 7e-4,
        update_every: int = 4,
    ):
        """
        Initialize DQN agent using the agent-experience buffer

        Args:
            state_size (int): Size of the state observation returned by the
                environment
            action_size (int): Action space size
            n_agents (int): Number of agents in the environment
            buffer_size (int): Desired total experience buffer size
            batch_size (int): Mini-batch size
            gamma (float): Discount factor
            tau (float): For soft update of target parameters
            learning_rate (float): Learning rate
            update_every (int): Number of steps before target network update
        """

        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents

        # Q-Networks
        self.policy_net = DQN(state_size, action_size).to(device)
        self.target_net = DQN(state_size, action_size).to(device)

        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=learning_rate)
        self.memory = AgentReplayMemory(buffer_size, n_agents, state_size,
                                        device)

        self.t_step = 0

        self.update_every = update_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

    def step(self, states, actions, rewards, next_steps, done):

        self.memory.push_agent_actions(states, actions, rewards, next_steps,
                                       done)

        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if self.memory.at_capacity():
                experience = self.memory.sample(self.batch_size)
                self.learn(experience, self.gamma)

    def act(self, states, eps=0):
        states = torch.from_numpy(states).float().to(device)
        self.policy_net.eval()

        with torch.no_grad():
            action_values = self.policy_net(states)
        self.policy_net.train()

        r = np.random.random(size=self.n_agents)

        action_values = np.argmax(action_values.cpu().data.numpy(), axis=1)
        random_choices = np.random.randint(0,
                                           self.action_size,
                                           size=self.n_agents)

        return np.where(r > eps, action_values, random_choices)

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        criterion = torch.nn.MSELoss()
        self.policy_net.train()
        self.target_net.eval()

        # policy_net output has shape (batch_size, action_size); gather() picks
        # the Q-value of each taken action
        predicted_targets = self.policy_net(states).gather(1, actions)

        with torch.no_grad():
            labels_next = self.target_net(next_states).detach().max(
                1)[0].unsqueeze(1)

        # .detach() ->  Returns a new Tensor, detached from the current graph.
        labels = rewards + (gamma * labels_next * (1 - dones))

        loss = criterion(predicted_targets, labels).to(device)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.policy_net, self.target_net, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)
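
# A minimal sketch of a driver loop for this multi-agent DQN agent. The
# vectorized `env` (reset()/step() returning per-agent arrays) and the epsilon
# handling are assumptions; none of these names come from the original example.
import numpy as np


def run_episode(env, agent, eps):
    states = env.reset()                      # shape: (n_agents, state_size)
    scores = np.zeros(agent.n_agents)
    dones = np.zeros(agent.n_agents, dtype=bool)
    while not dones.all():
        actions = agent.act(states, eps)
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        scores += rewards
        states = next_states
    return scores.mean()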
Beispiel #24
0
class TrainNQL:
    def __init__(self, epi, cfg=dcfg, validation=False):
        #cpu or cuda
        torch.cuda.empty_cache()
        self.device = cfg.device  #torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.state_dim = cfg.proc_frame_size  #State dimensionality 84x84.
        self.state_size = cfg.state_size
        #self.t_steps= tsteps
        self.t_eps = cfg.t_eps
        self.minibatch_size = cfg.minibatch_size
        # Q-learning parameters
        self.discount = cfg.discount  #Discount factor.
        self.replay_memory = cfg.replay_memory
        self.bufferSize = cfg.bufferSize
        self.target_q = cfg.target_q
        self.validation = validation
        if (validation):
            self.episode = epi
        else:
            self.episode = int(epi) - 1
        self.cfg = cfg

        modelGray = 'results/ep' + str(self.episode) + '/modelGray.net'
        modelDepth = 'results/ep' + str(self.episode) + '/modelDepth.net'
        tModelGray = 'results/ep' + str(self.episode) + '/tModelGray.net'
        tModelDepth = 'results/ep' + str(self.episode) + '/tModelDepth.net'

        if os.path.exists(modelGray) and os.path.exists(modelDepth):
            print("Loading model")
            self.gray_policy_net = torch.load(modelGray).to(self.device)
            self.gray_target_net = torch.load(tModelGray).to(self.device)
            self.depth_policy_net = torch.load(modelDepth).to(self.device)
            self.depth_target_net = torch.load(tModelDepth).to(self.device)

        else:
            print("New model")
            self.gray_policy_net = DQN(noutputs=cfg.noutputs,
                                       nfeats=cfg.nfeats,
                                       nstates=cfg.nstates,
                                       kernels=cfg.kernels,
                                       strides=cfg.strides,
                                       poolsize=cfg.poolsize).to(self.device)
            self.gray_target_net = DQN(noutputs=cfg.noutputs,
                                       nfeats=cfg.nfeats,
                                       nstates=cfg.nstates,
                                       kernels=cfg.kernels,
                                       strides=cfg.strides,
                                       poolsize=cfg.poolsize).to(self.device)
            self.depth_policy_net = DQN(noutputs=cfg.noutputs,
                                        nfeats=cfg.nfeats,
                                        nstates=cfg.nstates,
                                        kernels=cfg.kernels,
                                        strides=cfg.strides,
                                        poolsize=cfg.poolsize).to(self.device)
            self.depth_target_net = DQN(noutputs=cfg.noutputs,
                                        nfeats=cfg.nfeats,
                                        nstates=cfg.nstates,
                                        kernels=cfg.kernels,
                                        strides=cfg.strides,
                                        poolsize=cfg.poolsize).to(self.device)

        if not validation and self.target_q and self.episode % self.target_q == 0:
            print("cloning")
            self.depth_policy_net = DQN(noutputs=cfg.noutputs,
                                        nfeats=cfg.nfeats,
                                        nstates=cfg.nstates,
                                        kernels=cfg.kernels,
                                        strides=cfg.strides,
                                        poolsize=cfg.poolsize).to(self.device)
            self.depth_target_net = DQN(noutputs=cfg.noutputs,
                                        nfeats=cfg.nfeats,
                                        nstates=cfg.nstates,
                                        kernels=cfg.kernels,
                                        strides=cfg.strides,
                                        poolsize=cfg.poolsize).to(self.device)

        # copy the policy-network weights into the target networks before
        # freezing them for evaluation
        self.gray_target_net.load_state_dict(self.gray_policy_net.state_dict())
        self.gray_target_net.eval()

        self.depth_target_net.load_state_dict(
            self.depth_policy_net.state_dict())
        self.depth_target_net.eval()

        self.gray_optimizer = optim.RMSprop(self.gray_policy_net.parameters())
        self.depth_optimizer = optim.RMSprop(
            self.depth_policy_net.parameters())
        self.memory = ReplayMemory(self.replay_memory)

    def get_tensor_from_image(self, file):
        convert = T.Compose([
            T.ToPILImage(),
            T.Resize((self.state_dim, self.state_dim),
                     interpolation=Image.BILINEAR),
            T.ToTensor()
        ])
        screen = Image.open(file)
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)
        screen = convert(screen).unsqueeze(0).to(self.device)
        return screen

    def get_data(self, episode, tsteps):
        #images=torch.Tensor(tsteps,self.state_size,self.state_dim,self.state_dim).to(self.device)
        #depths=torch.Tensor(tsteps,self.state_size,self.state_dim,self.state_dim).to(self.device)
        images = []
        depths = []
        dirname_rgb = 'dataset/RGB/ep' + str(episode)
        dirname_dep = 'dataset/Depth/ep' + str(episode)
        for step in range(tsteps):
            #proc_image=torch.Tensor(self.state_size,self.state_dim,self.state_dim).to(self.device)
            #proc_depth=torch.Tensor(self.state_size,self.state_dim,self.state_dim).to(self.device)
            proc_image = []
            proc_depth = []

            dirname_rgb = 'dataset/RGB/ep' + str(episode)
            dirname_dep = 'dataset/Depth/ep' + str(episode)
            for i in range(self.state_size):
                grayfile = dirname_rgb + '/image_' + str(step + 1) + '_' + str(
                    i + 1) + '.png'
                depthfile = dirname_dep + '/depth_' + str(
                    step + 1) + '_' + str(i + 1) + '.png'
                #proc_image[i] = self.get_tensor_from_image(grayfile)
                #proc_depth[i] = self.get_tensor_from_image(depthfile)
                proc_image.append(grayfile)
                proc_depth.append(depthfile)
            #images[step]=proc_image
            #depths[step]=proc_depth
            images.append(proc_image)
            depths.append(proc_depth)
        return images, depths

    def load_data(self):

        rewards = torch.load('files/reward_history.dat')
        actions = torch.load('files/action_history.dat')
        ep_rewards = torch.load('files/ep_rewards.dat')

        print("Loading images")

        best_scores = range(len(actions))
        buffer_selection_mode = 'default'

        if (buffer_selection_mode == 'success_handshake'):
            eps_values = []
            for i in range(len(actions)):

                hspos = 0
                hsneg = 0
                for step in range(len(actions[i])):
                    if (len(actions[i]) > 0):
                        if actions[i][step] == 3:
                            if rewards[i][step] > 0:
                                hspos = hspos + 1
                            elif rewards[i][step] == -0.1:
                                hsneg = hsneg + 1
                # guard against episodes with no handshake attempts
                accuracy = hspos / (hspos + hsneg) if (hspos + hsneg) > 0 else 0.0
                eps_values.append(accuracy)

            best_scores = np.argsort(eps_values)

        for i in best_scores:
            print('Ep: ', i + 1)
            dirname_gray = 'dataset/RGB/ep' + str(i + 1)
            dirname_dep = 'dataset/Depth/ep' + str(i + 1)
            files = []
            if (os.path.exists(dirname_gray)):
                files = os.listdir(dirname_gray)

            k = 0
            for file in files:
                if re.match(r"image.*\.png", file):
                    k = k + 1
            k = int(k / 8)
            while (k % 4 != 0):
                k = k - 1
            if (k > self.bufferSize):
                k = self.bufferSize
            print(k)

            #os.system("free -h")
            #with torch.no_grad():
            images, depths = self.get_data(i + 1, k)
            print("Loading done")

            for step in range(k - 1):
                #print(len(rewards),i)
                #print(len(rewards[i]), step)
                reward = self.cfg.neutral_reward
                if rewards[i][step] >= 1:
                    reward = self.cfg.hs_success_reward
                elif rewards[i][step] < 0:
                    reward = self.cfg.hs_fail_reward
                reward = torch.tensor([reward], device=self.device)
                action = torch.tensor([[actions[i][step]]],
                                      device=self.device,
                                      dtype=torch.long)
                #image = images[step].unsqueeze(0).to(self.device)
                #depth = depths[step].unsqueeze(0).to(self.device)
                #next_image = images[step+1].unsqueeze(0).to(self.device)
                #next_depth = depths[step+1].unsqueeze(0).to(self.device)
                image = images[step]
                depth = depths[step]
                next_image = images[step + 1]
                next_depth = depths[step + 1]
                self.memory.push(image, depth, action, next_image, next_depth,
                                 reward)
                #print("Memory size: ",getsizeof(self.memory))
                #torch.cuda.empty_cache()

    def train(self):
        if len(self.memory) < self.minibatch_size:
            return
        for i in range(0, len(self.memory), self.minibatch_size):
            #transitions = self.memory.sample(self.minibatch_size)
            transitions = self.memory.pull(self.minibatch_size)

            print('Batch train: ' + str(int(i / self.minibatch_size) + 1) +
                  "/" + str(int(len(self.memory) / self.minibatch_size) + 1))

            aux_transitions = []
            for t in transitions:
                proc_sgray = torch.Tensor(self.state_size, self.state_dim,
                                          self.state_dim).to(self.device)
                proc_sdepth = torch.Tensor(self.state_size, self.state_dim,
                                           self.state_dim).to(self.device)
                proc_next_sgray = torch.Tensor(self.state_size, self.state_dim,
                                               self.state_dim).to(self.device)
                proc_next_sdepth = torch.Tensor(self.state_size,
                                                self.state_dim,
                                                self.state_dim).to(self.device)
                count = 0
                for sgray, sdepth, next_sgray, next_sdepth in zip(
                        t.sgray, t.sdepth, t.next_sgray, t.next_sdepth):
                    proc_sgray[count] = self.get_tensor_from_image(sgray)
                    proc_sdepth[count] = self.get_tensor_from_image(sdepth)
                    proc_next_sgray[count] = self.get_tensor_from_image(
                        next_sgray)
                    proc_next_sdepth[count] = self.get_tensor_from_image(
                        next_sdepth)
                    count += 1

                proc_sgray = proc_sgray.unsqueeze(0).to(self.device)
                proc_sdepth = proc_sdepth.unsqueeze(0).to(self.device)
                proc_next_sgray = proc_next_sgray.unsqueeze(0).to(self.device)
                proc_next_sdepth = proc_next_sdepth.unsqueeze(0).to(
                    self.device)
                #('sgray','sdepth','action','next_sgray','next_sdepth','reward')
                one_transition = Transition(proc_sgray, proc_sdepth, t.action,
                                            proc_next_sgray, proc_next_sdepth,
                                            t.reward)
                aux_transitions.append(one_transition)
            transitions = aux_transitions

            # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
            # detailed explanation). This converts batch-array of Transitions
            # to Transition of batch-arrays.
            batch = Transition(*zip(*transitions))
            #print(batch.sgray)

            # Compute a mask of non-final states and concatenate the batch elements
            # (a final state would've been the one after which simulation ended)
            gray_non_final_mask = torch.tensor(tuple(
                map(lambda s: s is not None, batch.next_sgray)),
                                               device=self.device,
                                               dtype=torch.bool)
            gray_non_final_next_states = torch.cat(
                [s for s in batch.next_sgray if s is not None])

            depth_non_final_mask = torch.tensor(tuple(
                map(lambda s: s is not None, batch.next_sdepth)),
                                                device=self.device,
                                                dtype=torch.bool)
            depth_non_final_next_states = torch.cat(
                [s for s in batch.next_sdepth if s is not None])
            sgray_batch = torch.cat(batch.sgray)
            sdepth_batch = torch.cat(batch.sdepth)

            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)

            # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
            # columns of actions taken. These are the actions which would've been taken
            # for each batch state according to policy_net
            sgray_action_values = self.gray_policy_net(sgray_batch).gather(
                1, action_batch)
            sdepth_action_values = self.depth_policy_net(sdepth_batch).gather(
                1, action_batch)

            # Compute V(s_{t+1}) for all next states.
            # Expected values of actions for non_final_next_states are computed based
            # on the "older" target_net; selecting their best reward with max(1)[0].
            # This is merged based on the mask, such that we'll have either the expected
            # state value or 0 in case the state was final.
            next_sgray_values = torch.zeros(self.minibatch_size,
                                            device=self.device)
            next_sgray_values[gray_non_final_mask] = self.gray_target_net(
                gray_non_final_next_states).max(1)[0].detach()

            next_sdepth_values = torch.zeros(self.minibatch_size,
                                             device=self.device)
            next_sdepth_values[depth_non_final_mask] = self.depth_target_net(
                depth_non_final_next_states).max(1)[0].detach()
            # Compute the expected Q values
            expected_sgray_action_values = (next_sgray_values *
                                            self.discount) + reward_batch
            expected_sdepth_action_values = (next_sdepth_values *
                                             self.discount) + reward_batch

            # Compute Huber loss
            gray_loss = F.smooth_l1_loss(
                sgray_action_values, expected_sgray_action_values.unsqueeze(1))
            depth_loss = F.smooth_l1_loss(
                sdepth_action_values,
                expected_sdepth_action_values.unsqueeze(1))

            # Optimize the model
            self.gray_optimizer.zero_grad()
            gray_loss.backward()
            for param in self.gray_policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.gray_optimizer.step()

            # Optimize the model
            self.depth_optimizer.zero_grad()
            depth_loss.backward()
            for param in self.depth_policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.depth_optimizer.step()
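
# The Transition record and ReplayMemory used by the class above are defined
# elsewhere in the project. A minimal compatible sketch is given below; the
# field order is taken from the comment in train(), and the sequential
# semantics of pull() are an assumption based on how it is iterated above.
import random
from collections import namedtuple

Transition = namedtuple(
    'Transition',
    ('sgray', 'sdepth', 'action', 'next_sgray', 'next_sdepth', 'reward'))


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0
        self.cursor = 0

    def push(self, *args):
        # overwrite the oldest transition once capacity is reached
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def pull(self, batch_size):
        # assumed semantics: return the next consecutive slice of the buffer
        batch = self.memory[self.cursor:self.cursor + batch_size]
        self.cursor = (self.cursor + batch_size) % max(1, len(self.memory))
        return batch

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)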
def trainD(file_name="Distral_1col",
           list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           batch_size=128,
           gamma=0.999,
           alpha=0.9,
           beta=5,
           eps_start=0.9,
           eps_end=0.05,
           eps_decay=5,
           is_plot=False,
           num_episodes=200,
           max_num_steps_per_episode=1000,
           learning_rate=0.001,
           memory_replay_size=10000,
           memory_policy_size=1000):
    """
    Soft Q-learning training routine. Retuns rewards and durations logs.
    Plot environment screen
    """
    num_actions = list_of_envs[0].action_space.n
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(num_actions)
    models = [DQN(num_actions)
              for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    memories = [
        ReplayMemory(memory_replay_size, memory_policy_size)
        for _ in range(0, num_envs)
    ]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        policy.cuda()
        for model in models:
            model.cuda()

    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode

        # Optimization is given by an alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #         "exploration factor:", eps_end + (eps_start - eps_end) * \
            #         math.exp(-1. * steps_done / eps_decay))

            # last_screen = env.current_grid_map
            current_screen = get_screen(env)
            state = current_screen  # - last_screen
            # Select and perform an action
            action = select_action(state, policy, models[i_env], num_actions,
                                   eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)
            steps_done[i_env] += 1
            current_time[i_env] += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)
            if done:
                print(
                    "ENV:", i_env, "iter:", episodes_done[i_env], "\treward:",
                    env.episode_total_reward, "\tit:", current_time[i_env],
                    "\texp_factor:", eps_end + (eps_start - eps_end) *
                    math.exp(-1. * episodes_done[i_env] / eps_decay))
                env.reset()
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
Beispiel #26
0
    return gray
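
# preprocess() is only partially visible here (its tail, `return gray`, opens
# this snippet). A plausible sketch, assuming a luminance grayscale conversion
# and an 84x84 resize, is given below; the exact geometry used by the original
# is unknown.
import numpy as np
from PIL import Image


def preprocess(frame):
    # luminance-weighted grayscale conversion of an RGB frame
    gray = np.dot(frame[..., :3], [0.299, 0.587, 0.114]).astype(np.uint8)
    # resize and rescale to [0, 1]; 84x84 is an assumption, not the original size
    gray = np.asarray(Image.fromarray(gray).resize((84, 84)),
                      dtype=np.float32) / 255.0
    return gray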

# epsilon greedy


def pick_action(observation, net):
    if random.random() < epsilon:
        return random.randint(0, num_actions - 1)

    action = torch.argmax(
        net(torch.tensor(observation).float().unsqueeze(0)))

    return action


net = DQN()
net.load_state_dict(torch.load("model.h5", map_location="cpu"))
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.01)
starttime = time.time()
buffer = collections.deque(maxlen=N)
lr = 1e-3


for i in range(num_episodes):
    observation = env.reset()
    observation = preprocess(observation)
    observation = [observation, observation, observation, observation]
    j = 0

    while(True):
Beispiel #27
0
def trainDQN(file_name="DQN",
             env=GridworldEnv(1),
             batch_size=128,
             gamma=0.999,
             eps_start=0.9,
             eps_end=0.05,
             eps_decay=1000,
             is_plot=False,
             num_episodes=500,
             max_num_steps_per_episode=1000,
             learning_rate=0.0001,
             memory_replay_size=10000):
    """
    DQN training routine. Retuns rewards and durations logs.
    Plot environment screen
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.title("")
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    model = DQN(num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []
    steps_done = 0  # total steps
    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        print("Cur episode:", i_episode, "steps done:", steps_done,
                "exploration factor:", eps_end + (eps_start - eps_end) * \
                math.exp(-1. * steps_done / eps_decay))
        # Initialize the environment and state
        env.reset()
        # last_screen = env.current_grid_map
        # (1, 1, 8, 8)
        current_screen = get_screen(env)
        state = current_screen  # - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            steps_done += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, optimizer, memory, batch_size, gamma)
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-dqn-rewards', episode_rewards)
    np.save(file_name + '-dqn-durations', episode_durations)

    return model, episode_rewards, episode_durations
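
# optimize_model() is called in the loop above but not shown here. A minimal
# sketch matching the call signature (model, optimizer, memory, batch_size,
# gamma) is given below; it assumes the classic PyTorch DQN tutorial layout,
# where Transition = ('state', 'action', 'next_state', 'reward') and the same
# network also bootstraps the targets.
from collections import namedtuple

import torch
import torch.nn.functional as F

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


def optimize_model(model, optimizer, memory, batch_size, gamma):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    # mask of transitions whose next state is not terminal
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a) for the actions that were actually taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # bootstrap V(s_{t+1}); terminal states contribute 0
    next_state_values = torch.zeros(batch_size)
    with torch.no_grad():
        next_state_values[non_final_mask] = \
            model(non_final_next_states).max(1)[0]
    expected_values = reward_batch + gamma * next_state_values

    loss = F.smooth_l1_loss(state_action_values,
                            expected_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()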
Beispiel #28
0
def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4),
            GridworldEnv(5)], batch_size=128, gamma=0.999, alpha=0.9,
            beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5,
            is_plot=False, num_episodes=200,
            max_num_steps_per_episode=1000, learning_rate=0.001,
            memory_replay_size=10000, memory_policy_size=1000):
    """
    Soft Q-learning training routine. Retuns rewards and durations logs.
    Plot environment screen
    """
    # action dimension
    num_actions = list_of_envs[0].action_space.n
    # total envs
    num_envs = len(list_of_envs)
    # pi_0
    policy = PolicyNetwork(num_actions)
    # task-specific Q networks, one per environment, used to calculate A_i
    models = [DQN(num_actions) for _ in range(0, num_envs)]   ### Add torch.nn.ModuleList (?)
    # one replay buffer per environment
    memories = [ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(0, num_envs)]

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # device = "cpu"
    print(device)
    # model
    policy = policy.to(device)
    for i in range(len(models)):
        models[i] = models[i].to(device)

    # optimizer for every Q model
    optimizers = [optim.Adam(model.parameters(), lr=learning_rate)
                    for model in models]
    # optimizer for policy
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    # info list for each environment
    episode_durations = [[] for _ in range(num_envs)]   # list of local steps
    episode_rewards = [[] for _ in range(num_envs)]     # list of list of episode reward

    episodes_done = np.zeros(num_envs)      # episode num
    steps_done = np.zeros(num_envs)         # global timesteps for each env
    current_time = np.zeros(num_envs)       # local timesteps for each env

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        policy.train()
        for model in models:
            model.train()

        # TODO: add max_num_steps_per_episode

        # Optimization is given by an alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        #   1. do the step for each env
        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #         "exploration factor:", eps_end + (eps_start - eps_end) * \
            #         math.exp(-1. * steps_done / eps_decay))
        
            # last_screen = env.current_grid_map
            # ===========update step info begin========================
            current_screen = get_screen(env)
            # state
            state = current_screen # - last_screen
            # action chosen by pi_1~pi_i
            action = select_action(state, policy, models[i_env], num_actions,
                                    eps_start, eps_end, eps_decay,
                                    episodes_done[i_env], alpha, beta, device)
            # global_steps
            steps_done[i_env] += 1
            # local steps
            current_time[i_env] += 1
            # reward
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # next state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen # - last_screen
            else:
                next_state = None

            # add to buffer
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            #   2. do one optimization step for each env using "soft-q-learning".
            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                            memories[i_env], batch_size, alpha, beta, gamma, device)
            # ===========update step info end ========================


            # ===========update episode info begin ====================
            if done:
                print("ENV:", i_env, "iter:", episodes_done[i_env],
                    "\treward:", env.episode_total_reward,
                    "\tit:", current_time[i_env], "\texp_factor:", eps_end +
                    (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay))
                # reset env
                env.reset()
                # episode steps
                episodes_done[i_env] += 1
                # append each episode local timesteps list for every env
                episode_durations[i_env].append(current_time[i_env])
                # reset local timesteps
                current_time[i_env] = 0
                # append total episode_reward to list
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)
            # ===========update episode info end ====================

        #   3. do one optimization step for the policy
        # after all envs has performed one step, optimize policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                    num_envs, gamma, device)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
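
# For reference, the objective this routine optimizes follows Distral
# (Teh et al., 2017): each task policy pi_i maximizes an entropy- and
# KL-regularized return against the shared distilled policy pi_0,
#
#     J_i = E[ sum_t gamma^t ( r_i(s_t, a_t)
#                              + (alpha / beta) * log pi_0(a_t | s_t)
#                              - (1 / beta)     * log pi_i(a_t | s_t) ) ],
#
# which is what the alpha and beta arguments passed to optimize_model() and
# optimize_policy() above control.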
Beispiel #29
0
def test_dqn(args=get_args()):
    env = make_atari_env(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    # make environments
    train_envs = SubprocVectorEnv(
        [lambda: make_atari_env(args) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: make_atari_env_watch(args) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # define model
    net = DQN(*args.state_shape, args.action_shape,
              args.device).to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    # define policy
    policy = DQNPolicy(net,
                       optim,
                       args.gamma,
                       args.n_step,
                       target_update_freq=args.target_update_freq)
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # replay buffer: `save_last_obs` and `stack_num` can be removed together
    # when you have enough RAM
    buffer = VectorReplayBuffer(args.buffer_size,
                                buffer_num=len(train_envs),
                                ignore_obs_next=True,
                                save_only_last_obs=True,
                                stack_num=args.frames_stack)
    # collector
    train_collector = Collector(policy,
                                train_envs,
                                buffer,
                                exploration_noise=True)
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # log
    log_path = os.path.join(args.logdir, args.task, 'dqn')
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = BasicLogger(writer)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        if env.spec.reward_threshold:
            return mean_rewards >= env.spec.reward_threshold
        elif 'Pong' in args.task:
            return mean_rewards >= 20
        else:
            return False

    def train_fn(epoch, env_step):
        # nature DQN setting, linear decay in the first 1M steps
        if env_step <= 1e6:
            eps = args.eps_train - env_step / 1e6 * \
                (args.eps_train - args.eps_train_final)
        else:
            eps = args.eps_train_final
        policy.set_eps(eps)
        logger.write('train/eps', env_step, eps)

    def test_fn(epoch, env_step):
        policy.set_eps(args.eps_test)

    # watch agent's performance
    def watch():
        print("Setup test envs ...")
        policy.eval()
        policy.set_eps(args.eps_test)
        test_envs.seed(args.seed)
        if args.save_buffer_name:
            print(f"Generate buffer with size {args.buffer_size}")
            buffer = VectorReplayBuffer(args.buffer_size,
                                        buffer_num=len(test_envs),
                                        ignore_obs_next=True,
                                        save_only_last_obs=True,
                                        stack_num=args.frames_stack)
            collector = Collector(policy, test_envs, buffer)
            result = collector.collect(n_step=args.buffer_size)
            print(f"Save buffer into {args.save_buffer_name}")
            # Unfortunately, pickle will cause oom with 1M buffer size
            buffer.save_hdf5(args.save_buffer_name)
        else:
            print("Testing agent ...")
            test_collector.reset()
            result = test_collector.collect(n_episode=args.test_num,
                                            render=args.render)
        pprint.pprint(result)

    if args.watch:
        watch()
        exit(0)

    # test train_collector and start filling replay buffer
    train_collector.collect(n_step=args.batch_size * args.training_num)
    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.step_per_collect,
                               args.test_num,
                               args.batch_size,
                               train_fn=train_fn,
                               test_fn=test_fn,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               logger=logger,
                               update_per_step=args.update_per_step,
                               test_in_train=False)

    pprint.pprint(result)
    watch()
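
# get_args() is not shown in this snippet. A trimmed, illustrative sketch of
# the argparse setup it implies is given below; the defaults are placeholders
# for the sketch, not the original project's values.
import argparse

import torch


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--eps-test', type=float, default=0.005)
    parser.add_argument('--eps-train', type=float, default=1.0)
    parser.add_argument('--eps-train-final', type=float, default=0.05)
    parser.add_argument('--buffer-size', type=int, default=100000)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--n-step', type=int, default=3)
    parser.add_argument('--target-update-freq', type=int, default=500)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=100000)
    parser.add_argument('--step-per-collect', type=int, default=10)
    parser.add_argument('--update-per-step', type=float, default=0.1)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--training-num', type=int, default=10)
    parser.add_argument('--test-num', type=int, default=10)
    parser.add_argument('--frames-stack', type=int, default=4)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument('--resume-path', type=str, default=None)
    parser.add_argument('--watch', action='store_true')
    parser.add_argument('--save-buffer-name', type=str, default=None)
    parser.add_argument('--device', type=str,
                        default='cuda' if torch.cuda.is_available() else 'cpu')
    return parser.parse_args()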
Beispiel #30
0
class Player:
    def __init__(self, name, isBot):
        self.name = name
        self.isBot = isBot
        if not self.isBot:
            self.chosenAction = 0
            self.defineKeyboardListener()

        self.initializeProperties()
        self.QNetwork = DQN("QN{}".format(name), self.miniBatchSize)
        self.TDTarget = DQN("TD{}".format(name), self.miniBatchSize)
        self.sess = tf.Session()
        self.QNetwork.setSess(self.sess)
        self.TDTarget.setSess(self.sess)
        self.sess.run(tf.global_variables_initializer())
        self.synchronise()

    def initializeProperties(self):
        self.synchronisationPeriod = 100
        self.explorationRate = 0.999

        # Behaviour when playing & training
        self.trainable = True
        self.exploiting = False

        # Statistics
        self.gamesWon = 0
        self.gamesLost = 0

        # Training
        self.trainingData = []
        self.maxBatchSize = 50000
        # trainingData will not have more than maxBatchSize elements
        self.miniBatchSize = 32
        self.miniBatch = []
        self.startTraining = 1000
        # the training will happen iff we have more than startTraining data in trainingData

        print("Properties initialized")

    def defineKeyboardListener(self):
        def on_press(key):
            try:
                if key == Key.up:
                    self.chosenAction = 1
                elif key == Key.down:
                    self.chosenAction = 2
                else:
                    self.chosenAction = 0
            except AttributeError:
                self.chosenAction = 0

        def on_release(key):
            self.chosenAction = 0
            if key == keyboard.Key.esc:
                # Stop listener
                return False

        self.listener = keyboard.Listener(on_press=on_press,
                                          on_release=on_release)
        self.listener.start()

    def training(self, step):
        if not self.trainable or len(self.trainingData) < self.startTraining:
            return
        if step % self.synchronisationPeriod == 0:
            self.synchronise()
        self.miniBatch = random.sample(self.trainingData, self.miniBatchSize)
        states, actions, rewards, nextStates = zip(*self.miniBatch)
        output = self.TDTarget.computeTarget(nextStates, rewards)
        self.QNetwork.training(states, output, actions)

    def play(self):
        if self.isBot:
            if self.exploiting or random.random() > self.explorationRate:
                return self.QNetwork.evaluate(self.buffer)
            else:
                return random.randint(0, 1)
        else:
            return self.chosenAction

    def updateConstants(self, learningRate=None, explorationRate=None):
        self.QNetwork.updateConstants(learningRate)
        if not isinstance(explorationRate, type(None)):
            self.explorationRate = explorationRate

    def resetStats(self):
        self.gamesWon = 0
        self.gamesLost = 0

    def updateStats(self, reward):
        if reward == 1:
            self.gamesWon += 1
        elif reward == -1:
            self.gamesLost += 1

    def displayStats(self):
        # print("{} victories & {} defeats".format(self.gamesWon, self.gamesLost))
        print(self.gamesWon, self.gamesLost)

    def addStateSequence(self, action, reward, nextState):
        if self.trainable:
            self.trainingData.append([self.buffer, action, reward, nextState])
            while len(self.trainingData) > self.maxBatchSize:
                self.trainingData.pop(0)
        self.buffer = nextState

    def saveQNetwork(self, path, global_step=None):
        self.QNetwork.saveQNetwork(path, global_step)

    def restoreQNetwork(self, path, global_step=None):
        self.QNetwork.restoreQNetwork(path, global_step)

    def setBehaviour(self, isTraining):
        self.trainable = isTraining
        self.exploiting = not isTraining

    def synchronise(self):
        e1_params = [
            t for t in tf.trainable_variables()
            if t.name.startswith(self.QNetwork.scope)
        ]
        e1_params = sorted(e1_params, key=lambda v: v.name)
        e2_params = [
            t for t in tf.trainable_variables()
            if t.name.startswith(self.TDTarget.scope)
        ]
        e2_params = sorted(e2_params, key=lambda v: v.name)

        update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            update_ops.append(op)
        self.sess.run(update_ops)