Example No. 1
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: building your model
        """
        super(Agent_Dueling_DQN,self).__init__(env)

        
        ##################
        # YOUR CODE HERE #
        ##################
        self.env = env
        self.gamma = 0.99
        self.lr = 0.0001
        self.epsilon = 1.0 
        self.memory_size = 50000
        self.batch_size = 32
        self.replace_target_iteration = 10000
        self.learn_step_counter = 0
        self._build_net()
        self.memory = ReplayMemory(self.memory_size)
        saver = tf.train.Saver()
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        with tf.variable_scope('soft_replacement'):
            self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
        self.summaries = tf.summary.merge_all()
        self.sess = tf.Session()
        if args.test_dueling_dqn:
            #you can load your model here           
            saver.restore(self.sess, "Model_deuling/model_deuling_dqn.ckpt")
            print('loading trained model')
        else:
            self.writer = tf.summary.FileWriter("logs/", self.sess.graph)
            self.sess.run(tf.global_variables_initializer())
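The ReplayMemory class constructed by the TensorFlow agents (Examples No. 1, 2, and 6) is not included in these snippets. Below is a minimal sketch of a buffer compatible with the store / sample_memory / current interface used in Example No. 6; the internal layout and the (s, a, r, terminal, s_) return order are assumptions, not the original implementation.

import numpy as np

class ReplayMemory(object):
    """Fixed-size circular buffer of (state, action, reward, terminal) entries."""
    def __init__(self, capacity, frame_shape=(84, 84, 4)):
        self.capacity = capacity
        self.current = 0                      # number of transitions written so far
        self.states = np.zeros((capacity,) + frame_shape, dtype=np.float32)
        self.actions = np.zeros(capacity, dtype=np.int32)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.terminals = np.zeros(capacity, dtype=np.float32)

    def store(self, state, action, reward, done):
        idx = self.current % self.capacity    # overwrite the oldest entry when full
        self.states[idx] = state
        self.actions[idx] = action
        self.rewards[idx] = reward
        self.terminals[idx] = float(done)
        self.current += 1

    def sample_memory(self, indices):
        # the next state is the frame stored one slot after each sampled index
        next_idx = (indices + 1) % self.capacity
        return (self.states[indices], self.actions[indices], self.rewards[indices],
                self.terminals[indices], self.states[next_idx])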
Example No. 2
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: building your model
        """
        super(Agent_DDDQN,self).__init__(env)

        
        ##################
        # YOUR CODE HERE #
        ##################
        self.env = env
        self.gamma = 0.99
        self.lr = 0.0001
        self.epsilon = 1.0 
        self.memory_size = 50000
        self.batch_size = 32
        self.replace_target_iteration = 10000
        self.learn_step_counter = 0
        # ===================== all inputs =========================
        self.state = tf.placeholder(tf.float32, [None, 84, 84, 4], name = 'state')
        self.state_ = tf.placeholder(tf.float32, [None, 84, 84, 4], name = 'state_')
        self.reward = tf.placeholder(tf.float32, [None, ], name = 'reward')
        self.action = tf.placeholder(tf.int32 , [None, ], name = 'action')
        self.q_next = tf.placeholder(tf.float32, [None, ], name = 'q_next')
        self.terminal = tf.placeholder(tf.float32, [None, ], name = 'terminal')
        self.is_training = tf.placeholder(tf.bool, name = 'is_training')
        self.keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')

        self._build_net()
        self.memory = ReplayMemory(self.memory_size)
        saver = tf.train.Saver()
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        with tf.variable_scope('soft_replacement'):
            self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
        self.summaries = tf.summary.merge_all()
        self.sess = tf.Session()
        if args.test_double_dueling_dqn:
            #you can load your model here           
            saver.restore(self.sess, "Model_double_deuling/model_double_deuling_dqn.ckpt")
            print('loading trained model')
        else:
            self.writer = tf.summary.FileWriter("logs/", self.sess.graph)
            self.sess.run(tf.global_variables_initializer())
Example No. 3
    def __init__(self, env):

        self.env = env
        self.sess = tf.Session()

        self.batch_len = 32
        self.gamma = 0.99
        self.alpha = 0.0005
        self.memory_cap = 10000

        self.epsilon_start = 0.99
        self.eps = self.epsilon_start

        self.clone_stps = 500
        self.inp_size = env.observation_space.shape[0]

        self.replay_memory = ReplayMemory(100000)

        self.min_replay_size = 10000
        self.memory = np.zeros((self.memory_cap, self.inp_size * 2 + 2))
        self.actions = env.action_space.n

        self.observation_input = tf.placeholder(tf.float32,
                                                [None, self.inp_size])
        self.observation_input_target = tf.placeholder(tf.float32,
                                                       [None, self.inp_size])
        self.q_target = tf.placeholder(tf.float32, [None, self.actions],
                                       name='Q_target')
        self.train_network = self.build_model(self.observation_input)
        self.target_network = self.build_model(self.observation_input_target,
                                               'target')

        t_params = tf.get_collection('target_params')
        e_params = tf.get_collection('train_params')
        self.replace_target_op = [
            tf.assign(t, e) for t, e in zip(t_params, e_params)
        ]

        self.loss = tf.reduce_mean(
            tf.losses.huber_loss(self.q_target, self.train_network))
        self.reducer = tf.train.AdamOptimizer(self.alpha).minimize(self.loss)

        self.num_episodes = 0
        self.num_steps = 0
        self.cost_his = []

        self.saver = tf.train.Saver(tf.trainable_variables())
        self.sess.run(tf.global_variables_initializer())
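Example No. 3 only sets up the graph; the update step is not part of the snippet. The following is a rough sketch of how a learning step could use these tensors (the learn method name, its arguments, and the minibatch handling are assumptions; numpy is assumed to be imported as np, as in the surrounding code).

    def learn(self, states, actions, rewards, next_states, dones):
        # copy train_params into target_params every self.clone_stps steps
        if self.num_steps % self.clone_stps == 0:
            self.sess.run(self.replace_target_op)

        # bootstrap the targets from the frozen target network
        q_next = self.sess.run(self.target_network,
                               {self.observation_input_target: next_states})
        q_target = self.sess.run(self.train_network,
                                 {self.observation_input: states})
        q_target[np.arange(len(actions)), actions] = \
            rewards + self.gamma * (1.0 - dones) * np.max(q_next, axis=1)

        # one Adam step on the Huber loss
        _, cost = self.sess.run([self.reducer, self.loss],
                                {self.observation_input: states,
                                 self.q_target: q_target})
        self.cost_his.append(cost)
        self.num_steps += 1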
Example No. 4
def trainDQN(file_name="DQN",
             env=GridworldEnv(1),
             batch_size=128,
             gamma=0.999,
             eps_start=0.9,
             eps_end=0.05,
             eps_decay=1000,
             is_plot=False,
             num_episodes=500,
             max_num_steps_per_episode=1000,
             learning_rate=0.0001,
             memory_replay_size=10000):
    """
    DQN training routine. Returns rewards and durations logs.
    Plots the environment screen.
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.title("")
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    model = DQN(num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []
    steps_done = 0  # total steps
    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        print("Cur episode:", i_episode, "steps done:", steps_done,
                "exploration factor:", eps_end + (eps_start - eps_end) * \
                math.exp(-1. * steps_done / eps_decay))
        # Initialize the environment and state
        env.reset()
        # last_screen = env.current_grid_map
        # (1, 1, 8, 8)
        current_screen = get_screen(env)
        state = current_screen  # - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            steps_done += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, optimizer, memory, batch_size, gamma)
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-dqn-rewards', episode_rewards)
    np.save(file_name + '-dqn-durations', episode_durations)

    return model, episode_rewards, episode_durations
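The select_action helper called above is not shown. A simple epsilon-greedy sketch that matches the call sites in Examples No. 4 and No. 5 follows; the actual project routine may differ (for instance, a softmax policy for the soft Q-learning variant), and the 1x1 tensor return shape is chosen only so that action[0, 0] works at the call site.

import math
import random
import torch

def select_action(state, model, num_actions,
                  eps_start, eps_end, eps_decay, steps_done):
    # exponentially annealed exploration rate, matching the factor printed above
    eps = eps_end + (eps_start - eps_end) * math.exp(-1.0 * steps_done / eps_decay)
    if random.random() < eps:
        # random action, returned as a 1x1 LongTensor
        return torch.LongTensor([[random.randrange(num_actions)]])
    with torch.no_grad():
        q_values = model(state)               # shape: (1, num_actions)
        return q_values.max(1)[1].view(1, 1)  # greedy action index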
Example No. 5
def trainSQL0(file_name="SQL0",
              env=GridworldEnv(1),
              batch_size=128,
              gamma=0.999,
              beta=5,
              eps_start=0.9,
              eps_end=0.05,
              eps_decay=1000,
              is_plot=False,
              num_episodes=200,
              max_num_steps_per_episode=1000,
              learning_rate=0.0001,
              memory_replay_size=10000,
              n_step=10,
              target_update=10):
    """
    Soft Q-learning training routine when the observation vector is the input.
    Returns rewards and durations logs.
    """

    num_actions = env.action_space.n
    input_size = env.observation_space.shape[0]
    model = DQN(input_size, num_actions)
    target_model = DQN(input_size, num_actions)
    target_model.load_state_dict(model.state_dict())
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size, n_step, gamma)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0

    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        if i_episode != 0:
            print("Cur episode:", i_episode, "steps done:", episode_durations[-1],
                    "exploration factor:", eps_end + (eps_start - eps_end) * \
                    math.exp(-1. * steps_done / eps_decay), "reward:", env.episode_total_reward)
        # Initialize the environment and state
        state = torch.from_numpy(env.reset()).type(torch.FloatTensor).view(
            -1, input_size)

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            memory.push(model, target_model, state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, target_model, optimizer, memory, batch_size,
                           gamma, beta)  #### Difference w.r.t DQN
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(
                    env.episode_total_reward
                )  ##### Modify for OpenAI envs such as CartPole
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break
        if i_episode % target_update == 0 and i_episode != 0:
            target_model.load_state_dict(model.state_dict())

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-sql0-rewards', episode_rewards)
    np.save(file_name + '-sql0-durations', episode_durations)

    return model, episode_rewards, episode_durations
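A minimal invocation of the routine above; the keyword values are simply the defaults written out (GridworldEnv and DQN come from the surrounding project, not from this snippet).

model, rewards, durations = trainSQL0(file_name="SQL0",
                                      env=GridworldEnv(1),
                                      beta=5,
                                      num_episodes=200,
                                      is_plot=False)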
Example No. 6
class Agent_Dueling_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: building your model
        """
        super(Agent_Dueling_DQN,self).__init__(env)

        
        ##################
        # YOUR CODE HERE #
        ##################
        self.env = env
        self.gamma = 0.99
        self.lr = 0.0001
        self.epsilon = 1.0 
        self.memory_size = 50000
        self.batch_size = 32
        self.replace_target_iteration = 10000
        self.learn_step_counter = 0
        self._build_net()
        self.memory = ReplayMemory(self.memory_size)
        saver = tf.train.Saver()
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        with tf.variable_scope('soft_replacement'):
            self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
        self.summaries = tf.summary.merge_all()
        self.sess = tf.Session()
        if args.test_dueling_dqn:
            #you can load your model here           
            saver.restore(self.sess, "Model_deuling/model_deuling_dqn.ckpt")
            print('loading trained model')
        else:
            self.writer = tf.summary.FileWriter("logs/", self.sess.graph)
            self.sess.run(tf.global_variables_initializer())

    def _weight_variables(self, shape, name):
        initializer = tf.random_normal_initializer(mean = 0., stddev = 0.02)
        return tf.get_variable(shape = shape, initializer = initializer, name = name) 
    def _bias_variables(self, shape, name):
        initializer = tf.constant_initializer(0.0)
        return tf.get_variable(shape = shape, initializer = initializer, name = name)
    def conv2d(self, x, W, stride):
        return tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME')

    def max_pool_2x2(self, x):
        return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1], padding='SAME')   
    def batch_norm(self, x, n_out, is_training, type):
        '''
        Args:
            x:           Tensor, 4D BHWD input maps
            n_out:       integer, depth of input maps
            is_training: boolean tf.Tensor, True indicates the training phase
            type:        string, 'CNN' normalizes over batch and spatial dims,
                         otherwise over the batch dimension only
        Return:
            normed:      batch-normalized maps
        '''
        if type == 'CNN':
            dim_normalization = [0, 1, 2]
        else:
            dim_normalization = [0]
        with tf.variable_scope('bn'):
            beta = tf.get_variable(shape = [n_out], initializer = tf.constant_initializer(0.0), name = 'beta')
            gamma = tf.get_variable(shape = [n_out], initializer = tf.constant_initializer(1.0), name = 'gamma')

            batch_mean, batch_var = tf.nn.moments(x, dim_normalization, name = 'moments')
            ema = tf.train.ExponentialMovingAverage(decay = 0.5)

            def mean_var_with_update():
                ema_apply_op = ema.apply([batch_mean, batch_var])
                with tf.control_dependencies([ema_apply_op]):
                    return tf.identity(batch_mean), tf.identity(batch_var)

            mean, var = tf.cond(is_training, mean_var_with_update,
                                lambda:(ema.average(batch_mean), ema.average(batch_var)))
            normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
        return normed

    def _build_net(self):
        # ===================== all inputs =========================
        self.state = tf.placeholder(tf.float32, [None, 84, 84, 4], name = 'state')
        self.state_ = tf.placeholder(tf.float32, [None, 84, 84, 4], name = 'state_')
        self.reward = tf.placeholder(tf.float32, [None, ], name = 'reward')
        self.action = tf.placeholder(tf.int32 , [None, ], name = 'action')
        self.terminal = tf.placeholder(tf.float32, [None, ], name = 'terminal')
        self.is_training = tf.placeholder(tf.bool, name = 'is_training')
        self.keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')
        # ===================== build evaluate net =========================
        with tf.variable_scope('eval_net'):
            with tf.variable_scope('conv_1'):
                W_conv1 = self._weight_variables([8, 8, 4, 16], name = 'w_conv1')
                b_conv1 = self._bias_variables([16], name = 'b_conv1')
                h_conv1 = tf.nn.relu(self.conv2d(self.state, W_conv1, 4) + b_conv1)
            #    h_conv1 = tf.nn.relu(self.batch_norm((self.conv2d(self.state, W_conv1, 4) + b_conv1), 16, self.is_training, 'CNN'))
            #    h_pool1 = self.max_pool_2x2(h_conv1)
            # 21*21*32
                print('h_conv1.shape : ', h_conv1.shape)
            with tf.variable_scope('conv_2'):
                W_conv2 = self._weight_variables([4, 4, 16, 32], name = 'w_conv2')
                b_conv2 = self._bias_variables([32], name = 'b_conv2')
                h_conv2 = tf.nn.relu(self.conv2d(h_conv1, W_conv2, 2) + b_conv2)
            #    h_conv2 = tf.nn.relu(self.batch_norm((self.conv2d(h_conv1, W_conv2, 2) + b_conv2), 32, self.is_training, 'CNN'))
            #    h_pool2 = self.max_pool_2x2(h_conv2)
            # 11*11*64
                print('h_conv2.shape : ', h_conv2.shape)
            with tf.variable_scope('conv_3'):
                W_conv3 = self._weight_variables([3, 3, 32, 64], name = 'w_conv3')
                b_conv3 = self._bias_variables([64], name = 'b_conv3')
                h_conv3 = tf.nn.relu(self.conv2d(h_conv2, W_conv3, 1) + b_conv3)
            #    h_conv3 = tf.nn.relu(self.batch_norm((self.conv2d(h_conv2, W_conv3, 1) + b_conv3), 64, self.is_training, 'CNN'))
            #    h_pool3 = self.max_pool_2x2(h_conv3)
                print('h_conv3.shape : ', h_conv3.shape)
                flatten = tf.reshape(h_conv3, [-1, 11*11*64])
            # 11*11*64
            with tf.variable_scope('fc1'):
                W_adv1 = self._weight_variables([11*11*64, 512], name = 'w_adv1')
                b_adv1 = self._bias_variables([512], name = 'b_adv1')
                h_adv1 = tf.nn.relu(tf.matmul(flatten, W_adv1) + b_adv1)

                W_val1 = self._weight_variables([11*11*64, 512], name = 'w_val1')
                b_val1 = self._bias_variables([512], name = 'b_val1')
                h_val1 = tf.nn.relu(tf.matmul(flatten, W_val1) + b_val1)

            with tf.variable_scope('output'):
                W_adv2 = self._weight_variables([512, 4], name = 'w_adv2')
                b_adv2 = self._bias_variables([4], name = 'b_adv2')
                eval_advantage = tf.matmul(h_adv1, W_adv2) + b_adv2

                W_val2 = self._weight_variables([512, 1], name = 'w_val2')
                b_val2 = self._bias_variables([1], name = 'b_val2')
                eval_value = tf.matmul(h_val1, W_val2) + b_val2
                # Average dueling: Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a))
                self.q_eval = eval_value + (eval_advantage - tf.reduce_mean(eval_advantage, axis=1, keep_dims = True))
        # ===================== build target net =========================
        with tf.variable_scope('target_net'):
            with tf.variable_scope('conv_1'):
                W_conv1 = self._weight_variables([8, 8, 4, 16], name = 'w_conv1')
                b_conv1 = self._bias_variables([16], name = 'b_conv1')
                h_conv1 = tf.nn.relu(self.conv2d(self.state_, W_conv1, 4) + b_conv1)
            #    h_conv1 = tf.nn.relu(self.batch_norm((self.conv2d(self.state_, W_conv1, 4) + b_conv1), 16, self.is_training, 'CNN'))
            #    h_pool1 = self.max_pool_2x2(h_conv1)
            # 21*21*32
                print('h_conv1.shape : ', h_conv1.shape)
            with tf.variable_scope('conv_2'):
                W_conv2 = self._weight_variables([4, 4, 16, 32], name = 'w_conv2')
                b_conv2 = self._bias_variables([32], name = 'b_conv2')
                h_conv2 = tf.nn.relu(self.conv2d(h_conv1, W_conv2, 2) + b_conv2)
            #    h_conv2 = tf.nn.relu(self.batch_norm((self.conv2d(h_conv1, W_conv2, 2) + b_conv2), 32, self.is_training, 'CNN'))
            #    h_pool2 = self.max_pool_2x2(h_conv2)
            # 11*11*64
                print('h_conv2.shape : ', h_conv2.shape)
            with tf.variable_scope('conv_3'):
                W_conv3 = self._weight_variables([3, 3, 32, 64], name = 'w_conv3')
                b_conv3 = self._bias_variables([64], name = 'b_conv3')
                h_conv3 = tf.nn.relu(self.conv2d(h_conv2, W_conv3, 1) + b_conv3)
            #    h_conv3 = tf.nn.relu(self.batch_norm((self.conv2d(h_conv2, W_conv3, 1) + b_conv3), 64, self.is_training, 'CNN'))
            #    h_pool3 = self.max_pool_2x2(h_conv3)
                print('h_conv3.shape : ', h_conv3.shape)
                flatten = tf.reshape(h_conv3, [-1, 11*11*64])
            # 11*11*64
            with tf.variable_scope('fc1'):
                W_adv1 = self._weight_variables([11*11*64, 512], name = 'w_adv1')
                b_adv1 = self._bias_variables([512], name = 'b_adv1')
                h_adv1 = tf.nn.relu(tf.matmul(flatten, W_adv1) + b_adv1)

                W_val1 = self._weight_variables([11*11*64, 512], name = 'w_val1')
                b_val1 = self._bias_variables([512], name = 'b_val1')
                h_val1 = tf.nn.relu(tf.matmul(flatten, W_val1) + b_val1)

            with tf.variable_scope('output'):
                W_adv2 = self._weight_variables([512, 4], name = 'w_adv2')
                b_adv2 = self._bias_variables([4], name = 'b_adv2')
                target_advantage = tf.matmul(h_adv1, W_adv2) + b_adv2

                W_val2 = self._weight_variables([512, 1], name = 'w_val2')
                b_val2 = self._bias_variables([1], name = 'b_val2')
                target_value = tf.matmul(h_val1, W_val2) + b_val2        
                self.q_next = target_value + (target_advantage - tf.reduce_mean(target_advantage, axis=1, keep_dims = True))

        with tf.variable_scope('q_target'):
            #q_target = tf.cond(self.terminal, lambda : self.reward + self.gamma * tf.reduce_max(self.q_next, axis = 1, name = 'Qmax_s_'), lambda: self.reward)
            q_target = self.reward + self.gamma * (1 - self.terminal) * tf.reduce_max(self.q_next, axis = 1, name = 'Qmax_s_')
            self.q_target = tf.stop_gradient(q_target)
        with tf.variable_scope('q_eval'):
            a_indices = tf.stack([tf.range(tf.shape(self.action)[0], dtype = tf.int32), self.action], axis=1)
            self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)
        with tf.variable_scope('loss'):
            #self.delta = (self.q_target - self.q_eval_wrt_a)
            #clipped_error = tf.where(tf.abs(self.delta) < 1.0,
            #                        0.5 * tf.square(self.delta),
            #                        tf.abs(self.delta) - 0.5, name='clipped_error')
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name='TD_error'))
            #self.loss = tf.reduce_mean(clipped_error, name = 'loss')
            tf.summary.scalar('loss', self.loss)
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)



    def init_game_setting(self):
        """

        The testing function will call this function at the beginning of a new game.
        Put anything you want to initialize here if necessary.

        """
        ##################
        # YOUR CODE HERE #
        ##################
        pass
    def test(self, total_episodes=100, seed = 11037):
        rewards = []
        self.env.seed(seed)
        for i in range(total_episodes):
            state = self.env.reset()
            self.init_game_setting()
            done = False
            episode_reward = 0.0

            #playing one game
            while(not done):
                action = self.make_action(state, test=True)
                state, reward, done, info = self.env.step(action)
                episode_reward += reward

            rewards.append(episode_reward)
        print('Run %d episodes'%(total_episodes))
        print('Mean:', np.mean(rewards))

    def train(self):
        """
        Implement your training algorithm here
        """
        ##################
        # YOUR CODE HERE #
        ##################

        saver = tf.train.Saver()
        rewards = []
        ep_t = 0
        for i_episode in range(MAX_EPOCH):
            observation = self.env.reset()
            ep_reward = 0 
            for t in range(MAX_STEPS):
                if i_episode % 500 == 0 and i_episode != 0:
                    print('\n============================\n')
                    self.test()
                    print('\n============================\n')
                    break
                else:
                    action = self.make_action(observation, test = False)
                    observation_, reward, done, info = self.env.step(action)
                    self.memory.store(observation, action, reward, done)
           
                    #print('shape of observation : ', observation.shape)
                    observation = observation_
                    ep_reward += reward
                    if self.epsilon > 0.05:
                        self.epsilon -= (1 - 0.05)/1000000
                    else:
                        self.epsilon = 0.05

                    ep_t += 1

                    if ep_t % self.replace_target_iteration == 0:
                        self.sess.run(self.target_replace_op)
                        print('\n target parameter replaced : ', str(ep_t / self.replace_target_iteration), '\n')
                        save_path = saver.save(self.sess, "Model_deuling_2/model_deuling_dqn_2.ckpt")
                if ep_t % 4 == 0 and ep_t > self.batch_size:
                    if self.memory.current >= self.memory_size:
                        s, a, r, ter, s_ = self.memory.sample_memory(np.random.choice(self.memory_size - 1, size=self.batch_size))
                    else:
                        s, a, r, ter, s_ = self.memory.sample_memory(np.random.choice(self.memory.current - 1, size=self.batch_size))

                    _, loss, summ, q_value = self.sess.run(
                        [self._train_op, self.loss, self.summaries, self.q_eval],
                        feed_dict={
                            self.state : s,
                            self.action : a,
                            self.reward : r,
                            self.state_ : s_,
                            self.terminal : ter,
                            self.is_training : True,
                            self.keep_prob : 1.0,
                        })
                    self.writer.add_summary(summ, global_step=i_episode)
                    self.learn_step_counter += 1
                    #print('learn_step_counter : ', self.learn_step_counter, '\t loss : ', loss)
                    #print('q_value : ', q_value[0])    
                if done:
                    break
                #total_reward = tf.constant(ep_reward, name = 'r')
                #tf.summary.scalar('reward', total_reward)
                #print('epoch : ', i_episode)
                #summ = self.sess.run(self.summaries)
                #self.writer.add_summary(summ, global_step=i_episode)
            rewards.append(ep_reward)
            np.save('dueling_dqn_reward.npy', np.array(rewards))
            if ep_t > self.batch_size :
                print('q_value : ', q_value[0])    
            print('epoch : ', i_episode, '\ttotal_reward : ', ep_reward, '\taverage_reward : ', np.mean(rewards), '\tepsilon : ', self.epsilon)
            print('ep_t : ', ep_t, "\tEpisode finished after {} timesteps".format(t+1))
                           
            
                
    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent

        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)

        Return:
            action: int
                the predicted action from trained model
        """
        ##################
        # YOUR CODE HERE #
        ##################
        norm_observation = (observation)[np.newaxis, :]
        if not test:
            if np.random.uniform(0, 1) < self.epsilon:
                action = self.env.get_random_action()
            else:
                action_value = self.sess.run(self.q_eval, feed_dict={self.state: norm_observation, self.is_training : True, self.keep_prob : 1.0})
                action = np.argmax(action_value)
        else:
            if np.random.uniform(0, 1) < 0.01:
                action = self.env.get_random_action()
            else:
                action_value = self.sess.run(self.q_eval, feed_dict={self.state: norm_observation, self.is_training : False, self.keep_prob : 1.0})
                action = np.argmax(action_value)
        return action
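Usage of the agent above, assuming an Atari-style environment wrapper that returns (84, 84, 4) frame stacks and an args namespace carrying the test_dueling_dqn flag (both come from the surrounding project, not from this snippet).

agent = Agent_Dueling_DQN(env, args)
state = env.reset()
action = agent.make_action(state, test=True)   # int index into the action space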
def trainD(file_name="Distral_1col",
           list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           batch_size=128,
           gamma=0.999,
           alpha=0.9,
           beta=5,
           eps_start=0.9,
           eps_end=0.05,
           eps_decay=5,
           is_plot=False,
           num_episodes=200,
           max_num_steps_per_episode=1000,
           learning_rate=0.001,
           memory_replay_size=10000,
           memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    Plots the environment screen.
    """
    num_actions = list_of_envs[0].action_space.n
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(num_actions)
    models = [DQN(num_actions)
              for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    memories = [
        ReplayMemory(memory_replay_size, memory_policy_size)
        for _ in range(0, num_envs)
    ]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        policy.cuda()
        for model in models:
            model.cuda()

    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode

        # Optimization is given by alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #         "exploration factor:", eps_end + (eps_start - eps_end) * \
            #         math.exp(-1. * steps_done / eps_decay))

            # last_screen = env.current_grid_map
            current_screen = get_screen(env)
            state = current_screen  # - last_screen
            # Select and perform an action
            action = select_action(state, policy, models[i_env], num_actions,
                                   eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)
            steps_done[i_env] += 1
            current_time[i_env] += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)
            if done:
                print(
                    "ENV:", i_env, "iter:", episodes_done[i_env], "\treward:",
                    env.episode_total_reward, "\tit:", current_time[i_env],
                    "\texp_factor:", eps_end + (eps_start - eps_end) *
                    math.exp(-1. * episodes_done[i_env] / eps_decay))
                env.reset()
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
def trainD(file_name="Distral_2col_SQL",
           list_of_envs=[GridworldEnv(5),
                         GridworldEnv(4),
                         GridworldEnv(6)],
           batch_size=128,
           gamma=0.999,
           alpha=0.8,
           beta=5,
           eps_start=0.9,
           eps_end=0.05,
           eps_decay=5,
           is_plot=False,
           num_episodes=200,
           max_num_steps_per_episode=1000,
           learning_rate=0.001,
           memory_replay_size=10000,
           memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    """
    num_actions = list_of_envs[0].action_space.n
    input_size = list_of_envs[0].observation_space.shape[0]
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(input_size, num_actions)
    models = [DQN(input_size, num_actions) for _ in range(0, num_envs)]
    memories = [
        ReplayMemory(memory_replay_size, memory_policy_size)
        for _ in range(0, num_envs)
    ]

    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    states = []
    for env in list_of_envs:
        states.append(
            torch.from_numpy(env.reset()).type(torch.FloatTensor).view(
                -1, input_size))

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode

        # Optimization is given by alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        for i_env, env in enumerate(list_of_envs):

            # select an action
            action = select_action(states[i_env], policy, models[i_env],
                                   num_actions, eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)

            steps_done[i_env] += 1
            current_time[i_env] += 1
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(states[i_env], action, next_state, reward,
                                 time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)

            # Update state
            states[i_env] = next_state

            # Check if agent reached target
            if done or current_time[i_env] >= max_num_steps_per_episode:
                if episodes_done[i_env] <= num_episodes:
                    print(
                        "ENV:", i_env, "iter:", episodes_done[i_env],
                        "\treward:{0:.2f}".format(env.episode_total_reward),
                        "\tit:", current_time[i_env], "\texp_factor:",
                        eps_end + (eps_start - eps_end) *
                        math.exp(-1. * episodes_done[i_env] / eps_decay))

                episode_rewards[i_env].append(env.episode_total_reward)
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0

                states[i_env] = torch.from_numpy(env.reset()).type(
                    torch.FloatTensor).view(-1, input_size)

                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        # Perform one step of the optimization on the Distilled policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()

    ## Store Results
    np.save(file_name + '-rewards', episode_rewards)
    np.save(file_name + '-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
Example No. 9
def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4),
            GridworldEnv(5)], batch_size=128, gamma=0.999, alpha=0.9,
            beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5,
            is_plot=False, num_episodes=200,
            max_num_steps_per_episode=1000, learning_rate=0.001,
            memory_replay_size=10000, memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    Plots the environment screen.
    """
    # action dimension
    num_actions = list_of_envs[0].action_space.n
    # total envs
    num_envs = len(list_of_envs)
    # pi_0
    policy = PolicyNetwork(num_actions)
    # Q-value network, one per environment, used to calculate A_i
    models = [DQN(num_actions) for _ in range(0, num_envs)]   ### Add torch.nn.ModuleList (?)
    # one replay buffer per environment
    memories = [ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(0, num_envs)]

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # device = "cpu"
    print(device)
    # model
    policy = policy.to(device)
    for i in range(len(models)):
        models[i] = models[i].to(device)

    # optimizer for every Q model
    optimizers = [optim.Adam(model.parameters(), lr=learning_rate)
                    for model in models]
    # optimizer for policy
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    # info list for each environment
    episode_durations = [[] for _ in range(num_envs)]   # list of local steps
    episode_rewards = [[] for _ in range(num_envs)]     # list of list of episode reward

    episodes_done = np.zeros(num_envs)      # episode num
    steps_done = np.zeros(num_envs)         # global timesteps for each env
    current_time = np.zeros(num_envs)       # local timesteps for each env

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        policy.train()
        for model in models:
            model.train()

        # TODO: add max_num_steps_per_episode

        # Optimization is given by alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        #   1. do the step for each env
        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #         "exploration factor:", eps_end + (eps_start - eps_end) * \
            #         math.exp(-1. * steps_done / eps_decay))
        
            # last_screen = env.current_grid_map
            # ===========update step info begin========================
            current_screen = get_screen(env)
            # state
            state = current_screen # - last_screen
            # action chosen by pi_1~pi_i
            action = select_action(state, policy, models[i_env], num_actions,
                                    eps_start, eps_end, eps_decay,
                                    episodes_done[i_env], alpha, beta, device)
            # global_steps
            steps_done[i_env] += 1
            # local steps
            current_time[i_env] += 1
            # reward
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # next state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen # - last_screen
            else:
                next_state = None

            # add to buffer
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            #   2. do one optimization step for each env using "soft-q-learning".
            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                            memories[i_env], batch_size, alpha, beta, gamma, device)
            # ===========update step info end ========================


            # ===========update episode info begin ====================
            if done:
                print("ENV:", i_env, "iter:", episodes_done[i_env],
                    "\treward:", env.episode_total_reward,
                    "\tit:", current_time[i_env], "\texp_factor:", eps_end +
                    (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay))
                # reset env
                env.reset()
                # episode steps
                episodes_done[i_env] += 1
                # append each episode local timesteps list for every env
                episode_durations[i_env].append(current_time[i_env])
                # reset local timesteps
                current_time[i_env] = 0
                # append total episode_reward to list
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)
            # ===========update episode info end ====================

        #   3. do one optimization step for the policy
        # after all envs have performed one step, optimize the policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                    num_envs, gamma, device)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(5),
            GridworldEnv(4), GridworldEnv(6)], batch_size=128, gamma=0.999, alpha=0.8,
            beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5,
            is_plot=False, num_episodes=200,
            max_num_steps_per_episode=1000, learning_rate=0.001,
            memory_replay_size=10000, memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    Plots the environment screen.
    """
    num_actions = list_of_envs[0].action_space.n
    input_size = list_of_envs[0].observation_space.shape[0]
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(input_size, num_actions)
    models = [DQN(input_size,num_actions) for _ in range(0, num_envs)]   ### Add torch.nn.ModuleList (?)
    memories = [ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(0, num_envs)]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        policy.cuda()
        for model in models:
            model.cuda()

    optimizers = [optim.Adam(model.parameters(), lr=learning_rate)
                    for model in models]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    distilled_logits_magnitude = np.zeros((num_episodes,num_envs))
    policy_logits_magnitude = np.zeros((num_episodes,num_envs))
    # keep track of num of times a random action is picked
    num_rand = np.zeros(num_envs)

    # Initialize environments
    states = []
    for env in list_of_envs:
        states.append(torch.from_numpy( env.reset() ).type(torch.FloatTensor).view(-1,input_size))

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode

        # Optimization is given by alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        for i_env, env in enumerate(list_of_envs):
        
            # select an action
            action, pi_0_norm, pi_i_norm = select_action(states[i_env], policy, models[i_env], num_actions,
                                    eps_start, eps_end, eps_decay,
                                    episodes_done[i_env], alpha, beta)

            if episodes_done[i_env] < num_episodes:
                if pi_0_norm + pi_i_norm == 0:
                    num_rand[i_env] += 1
                else:
                    distilled_logits_magnitude[int(episodes_done[i_env]), i_env] += pi_0_norm
                    policy_logits_magnitude[int(episodes_done[i_env]), i_env] += pi_i_norm

            steps_done[i_env] += 1
            current_time[i_env] += 1
            next_state_tmp, reward, done, _ = env.step(action[0,0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy( next_state_tmp ).type(torch.FloatTensor).view(-1,input_size)

            if done:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(states[i_env], action, next_state, reward, time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                            memories[i_env], batch_size, alpha, beta, gamma)

            # Update state
            states[i_env] = next_state

            # Check if agent reached target
            if done or current_time[i_env] >= max_num_steps_per_episode:
                if episodes_done[i_env] <= num_episodes:
                    print("ENV:", i_env, "iter:", episodes_done[i_env],
                        "\treward:{0:.2f}".format(env.episode_total_reward),
                        "\tit:", current_time[i_env], "\texp_factor:", eps_end +
                        (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay))


                if episodes_done[i_env] < num_episodes:
                    # average the cumulative norms
                    distilled_logits_magnitude[int(episodes_done[i_env]), i_env] /= (current_time[i_env] - num_rand[i_env])
                    policy_logits_magnitude[int(episodes_done[i_env]), i_env] /= (current_time[i_env] - num_rand[i_env])
                    num_rand[i_env] = 0

                episode_rewards[i_env].append(env.episode_total_reward)
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0

                states[i_env] = torch.from_numpy( env.reset() ).type(torch.FloatTensor).view(-1,input_size)

                if is_plot:
                    plot_rewards(episode_rewards, i_env)


        optimize_policy(policy, policy_optimizer, memories, batch_size,
                    num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)
    np.save(file_name + '-beta-distilled_logit_norms', distilled_logits_magnitude)
    np.save(file_name + '-beta-policy_logit_norms', policy_logits_magnitude)

    return models, policy, episode_rewards, episode_durations
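Finally, a sketch of how this Distral routine might be invoked on several related gridworld tasks; the environment indices and file prefix are illustrative only.

models, policy, rewards, durations = trainD(
    file_name="Distral_2col",
    list_of_envs=[GridworldEnv(5), GridworldEnv(4), GridworldEnv(6)],
    num_episodes=200,
    is_plot=False)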