Code example #1
File: agent_random.py Project: zencoding/dex
class Agent:
    def __init__(self, args, state_dim, action_dim):
        self.h = args.hyper
        self.mode = 'observe'
        self.args = args
        self.metrics = Metrics()
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.memory = Memory(self.h.memory_size)
        self.run_count = -1
        self.replay_count = -1
        self.save_iterator = -1
        self.update_iterator = -1

        if self.args.directory == 'default':
            self.args.directory = G.CUR_FOLDER

        results_location = G.RESULT_FOLDER_FULL + '/' + self.args.directory
        data_location = G.DATA_FOLDER_FULL + '/' + self.args.directory
        os.makedirs(results_location,
                    exist_ok=True)  # Generates results folder
        os.makedirs(data_location, exist_ok=True)  # Generates data folder
        self.results_location = results_location + '/'
        self.data_location = data_location + '/'

    def act(self, s):
        return random.randrange(0, self.action_dim)

    def observe(self, sample):
        self.memory.add(sample)

    def replay(self, debug=False):
        pass
Code example #2
File: tests.py Project: mauriciobarg/proj-final-prog
 def test_sampling(self):
     mem = Memory(10)
     self.assertEqual(len(mem), 0)
     for i in range(10):
         mem.add(random())
     self.assertEqual(len(mem), 10)
     sample = mem.sample(5)
     self.assertEqual(len(sample), 5)
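The test above pins down the Memory interface these examples rely on: construction with a capacity, len(), add(), and sample(). A minimal sketch that would satisfy the test (the deque-based implementation and uniform sampling are assumptions, not the project's actual class):

import random
from collections import deque


class Memory:
    """Minimal fixed-capacity replay buffer (sketch only)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest samples are evicted automatically

    def __len__(self):
        return len(self.buffer)

    def add(self, sample):
        self.buffer.append(sample)

    def sample(self, n):
        # uniform sampling without replacement
        return random.sample(list(self.buffer), n)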
Code example #3
def main():
    clock = pygame.time.Clock()

    N_EPOCHS = 1000
    GAMMA = 0.99
    N_BIRD = 64
    S_BATCH = 256

    env = FlappyBird(N_BIRD)

    main_model = Model()
    target_model = Model()

    memory = Memory()
    agent = Agent()

    for epoch in range(1, N_EPOCHS + 1):
        print('Epoch: {}'.format(epoch))

        env.reset()
        states, rewards, finished = env.random_step()
        target_model.model.set_weights(main_model.model.get_weights())

        running = True
        while running:
            clock.tick(60)

            actions = []
            for state in states:
                actions.append(agent.get_action(state, epoch, main_model))

            next_states, rewards, finished = env.step(actions)
            for state, reward, action, next_state in zip(
                    states, rewards, actions, next_states):
                memory.add((state, action, reward, next_state))

            states = next_states

            if len(memory.buffer) % S_BATCH == 0:
                main_model.replay(memory, env.n_bird, GAMMA, target_model)

            target_model.model.set_weights(main_model.model.get_weights())

            if not len(env.birds):
                running = False
                break

            env.draw()

            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    running = False

        print('\tScore: {}'.format(env.score))

    pygame.quit()

    env = FlappyBird()
Code example #4
File: randomAgent.py Project: EduBic/DQN-in-CartPole
class RandomAgent:
    def __init__(self, actionsCount, memory_capacity):
        self.memory = Memory(memory_capacity)
        self.actionsCount = actionsCount

    def act(self, state):
        return random.randint(0, self.actionsCount - 1)

    def observe(self, sample):  # Sample = (s, a, r, s')
        self.memory.add(sample)

    def replay(self):
        pass
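A random agent like this is typically used only to warm up the replay memory before a learning agent takes over (example #8 below does the same thing in its prepopulate method). A hypothetical warm-up loop; env, its classic 4-tuple Gym-style step() return, and the 10000-step budget are illustrative assumptions:

agent = RandomAgent(actionsCount=env.action_space.n, memory_capacity=10000)
s = env.reset()
for _ in range(10000):
    a = agent.act(s)
    s_, r, done, _ = env.step(a)                    # assumes a classic 4-tuple Gym API
    agent.observe((s, a, r, None if done else s_))  # (s, a, r, s') with s' = None on terminal
    s = env.reset() if done else s_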
Code example #5
class Agent:
    def __init__(self, num_states, num_actions, eps_min=0.05, eps_max=1, lam=1e-3):
        self.num_states = num_states
        self.num_actions = num_actions
        self.eps_min = eps_min
        self.eps_max = eps_max
        self.lam = lam
        self.brain = Brain(num_states, num_actions)
        self.memory = Memory(MEMORY_CAPACITY)
        self.step = 0

    def act(self, s):
        if random.random() < self.eps:
            return random.randint(0, self.num_actions - 1)
        else:
            return np.argmax(self.brain.predict_one(s))

    def observe(self, sars_):
        '''
        takes in a sample of the environment, (s, a, r, s_),
        and adds it to the memory replay
        '''
        self.step += 1
        self.memory.add(sars_)

    def replay(self):
        batch = self.memory.sample(BATCH_SIZE)

        states = batch[0]
        actions = batch[1]
        rewards = batch[2]
        states_ = batch[3]

        p = self.brain.predict(states)
        p_ = self.brain.predict(np.nan_to_num(states_))

        t = np.copy(p)
        rows = np.arange(len(actions))
        t[rows, actions] = rewards  # per-sample assignment: t[i, actions[i]] = rewards[i]
        real_state = ~np.isnan(states_).any(axis=1)  # rows whose next state is not the NaN terminal marker
        t[rows[real_state], np.asarray(actions)[real_state]] += GAMMA * np.amax(p_, axis=1)[real_state]

        self.brain.train(states, t)

    @property
    def eps(self):
        return self.eps_min + (self.eps_max - self.eps_min) * np.exp(-self.lam * self.step)
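For the defaults above (eps_min=0.05, eps_max=1, lam=1e-3) the schedule anneals epsilon from 1.0 toward 0.05 over a few thousand observed steps; a quick standalone check:

import numpy as np

eps_min, eps_max, lam = 0.05, 1.0, 1e-3
for step in (0, 1000, 5000):
    eps = eps_min + (eps_max - eps_min) * np.exp(-lam * step)
    print(step, round(float(eps), 3))   # 1.0, ~0.4, ~0.056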
Code example #6
File: profile_train.py Project: rahenri/tetris-rl
def main():
    p = psutil.Process(os.getpid())

    mem_size = 10000000
    memory = Memory(mem_size)
    agent = train.NNAgent((10, 24), 6)
    info = {
        "board": np.zeros((24, 10), dtype=np.int8),
    }
    for _ in range(mem_size):
        memory.add(info, info, 0)
    for i in range(10000000000):
        start = time.time()
        agent.train(memory, 1 << 14)
        end = time.time()
        rss = p.memory_info().rss / 1024 / 1024
        duration = end - start
        print(f"{i}: Memory: {rss:.1f}GB, Duration (sec): {duration:.1f}")
Code example #7
class Agent:
    def __init__(
        self,
        device,
        key,
        actor_model,
        n_step,
    ):

        self.DEVICE = device
        self.KEY = key

        # NEURAL MODEL
        self.actor_model = actor_model

        # MEMORY
        self.memory = Memory()

        # HYPERPARAMETERS
        self.N_STEP = n_step

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.DEVICE)

        self.actor_model.eval()
        with torch.no_grad():
            action, log_prob, _ = self.actor_model(state)
        self.actor_model.train()

        action = action.cpu().detach().numpy().item()
        log_prob = log_prob.cpu().detach().numpy().item()

        return action, log_prob

    def step(self, actor_state, critic_state, action, log_prob, reward):
        self.memory.add(actor_state, critic_state, action, log_prob, reward)
Code example #8
File: ddqn.py Project: mugoh/reinforcementLearning
class DoomDDdqN:
    """
        Dueling Double Deep Q Network (DDDQN) model for Doom.

        Parameters
        ----------
        lr: float
            Learning rate
        gamma: float
            Discounting factor for future rewards
        eps: float
            Explore-exploit tradeoff for agent actions
        min_eps: float
            Minimum value for epsilon
        max_eps: float
            Maximum value for epsilon
        name: str, default = 'DoomDDQN'
            Variable for tf namescope
        state_size: list, default = [100, 120, 4]
            Shape of input stack
        max_tau: int
            Max C step in updating the target network
    """
    lr: float = 0.0002
    gamma: float = 0.99
    eps: float = 0.00005
    min_eps: float = 0.01
    max_eps: float = 1.0
    memory_size: int = 100000
    name: str = 'DoomDDQN'
    state_size: list = field(default_factory=get_state_size)
    action_size = 7
    max_tau: int = 10000

    def __post_init__(self):
        self.build_model()
        self.memory = Memory(self.memory_size)
        self.setup_writer()

    def build_model(self):
        """
            Builds the Networks to use in training
        """
        with tf.compat.v1.variable_scope(self.name, reuse=tf.AUTO_REUSE):

            self.inputs = tf.compat.v1.placeholder(
                tf.float32,
                (None, *self.state_size),
                name='inputs')

            self.ISweights = tf.compat.v1.placeholder(
                tf.float32, (None, 1), name='ISweights')
            self.actions = tf.compat.v1.placeholder(
                tf.float32, (None, self.action_size), name='actions')
            self.target_Q = tf.compat.v1.placeholder(
                tf.float32, (None), name='target')

            self.build_conv_net()

    def build_conv_net(self):
        """
            Creates the model's layers and variables
        """

        conv_one = tf.layers.conv2d(
            inputs=self.inputs,
            filters=32,
            strides=[4, 4],
            kernel_size=(8, 8),
            padding='valid',
            kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
            name='conv_one'
        )
        conv_one_out = tf.nn.relu(features=conv_one, name='conv_one_out')

        conv_two = tf.layers.conv2d(
            inputs=conv_one_out,
            filters=64,
            kernel_size=(4, 4),
            strides=(2, 2),
            padding='valid',
            kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
            name='conv_two'
        )
        conv_two_out = tf.nn.relu(
            features=conv_two,
            name='conv_two'
        )

        conv_three = tf.layers.conv2d(
            inputs=conv_two_out,
            filters=128,
            kernel_size=(4, 4),
            strides=(2, 2),
            padding='valid',
            kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
            name='conv_three'
        )
        conv_three_out = tf.nn.relu(features=conv_three, name='conv_three_out')

        flatten = tf.layers.flatten(conv_three_out)
        self.separate_to_streams(flatten)
        self.aggregate()

    def separate_to_streams(self, flatten):
        """
            Creates the Value(s) and Advantage(s, a) layers
        """
        value_fc = tf.layers.dense(
            inputs=flatten,
            activation=tf.nn.relu,
            units=512,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='value_fc'
        )
        self.value = tf.layers.dense(
            inputs=value_fc,
            units=1,
            activation=None,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='value'
        )

        advantg_fc = tf.layers.dense(
            inputs=flatten,
            activation=tf.nn.relu,
            units=512,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='advantg_fc')
        self.advantg = tf.layers.dense(
            inputs=advantg_fc,
            activation=None,
            units=self.action_size,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='advantage')

    def _dense(self, inputs, units, activation=None, name='', **kwargs):
        """
            Returns a tf dense layer of specified args
        """

        return tf.layers.dense(
            inputs=inputs,
            units=units,
            activation=activation,
            kernel_initializer=kwargs.get('initializer') or
            tf.contrib.layers.xavier_initializer(),
            name=name
        )

    def aggregate(self):
        """
            Defines output and loss
        """

        # Q(s, a):= V(s) + A(s,a) - 1/|A| * sum[A(s,a')]
        self.output = self.value + tf.subtract(
            self.advantg,
            tf.reduce_mean(self.advantg, axis=1, keepdims=True))
        # Predicted Q
        self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions))

        self.abs_errors = tf.abs(self.target_Q - self.Q)
        self.loss = tf.reduce_mean(
            self.ISweights *
            tf.squared_difference(self.target_Q, self.Q))
        self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    def prepopulate(self, episodes=100000):
        """
            Creates random experiences to hold in memory
        """
        self.memory = Memory(self.memory_size)

        self.game, self.actions_choice = create_env()
        self.game.new_episode()
        state = self.game.get_state().screen_buffer
        state, stacked_frames = stack_frames(state, new_episode=True)
        for episode in range(episodes):
            action = np.random.choice(self.actions_choice.shape[0], size=1)[0]
            action = list(self.actions_choice[action])
            reward = self.game.make_action(action)
            done = self.game.is_episode_finished()
            print(f'Episode {episode}: {done}')

            if done:
                next_state = np.zeros(state.shape, dtype=np.int)
                self.memory + (state, action, reward, next_state, done)

                self.game.new_episode()
                state = self.game.get_state().screen_buffer
                state, stacked_frames = stack_frames(state, new_episode=True)
            else:
                next_state = self.game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(
                    next_state, stacked_frames)
                self.memory + (state, action, reward, next_state, done)
                state = next_state

    def setup_writer(self):
        """
            Sets up the tensorboard writer
        """
        self.writer = tf.compat.v1.summary.FileWriter(
            '/root/tensorboard/dddqn/1')
        tf.compat.v1.summary.scalar('Loss', self.loss)
        self.writer_op = tf.compat.v1.summary.merge_all()
        self.saver = tf.train.Saver()

    def predict_action(self, sess, state, decay_step):
        """
            Predicts the next action for the agent.

            Uses the value of epsilon to select a random value
            or action at argmax(Q[s, a])
        """
        explore_exploit_tradeoff = np.random.uniform()
        explore_prob = self.min_eps + \
            (self.max_eps - self.min_eps) * np.exp(-self.eps * decay_step)

        if explore_prob > explore_exploit_tradeoff:
            # Explore
            action = self.actions_choice[np.random.choice(
                self.actions_choice.shape[0], size=1)][0]
        else:
            # Exploit -> Estimate Q values state
            Qs = sess.run(
                self.output,
                feed_dict={
                    self.inputs: state.reshape((1, *state.shape))})
            # Best action
            choice = np.argmax(Qs)
            action = self.actions_choice[int(choice)]
        return list(action), explore_prob

    def update_target_graph(self):
        """
            Copies parameters of the DQN to the target network
        """
        from_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'DQNet')
        to_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'TargetNet')

        up_holder = [to_vars.assign(from_vars)
                     for from_vars, to_vars in zip(from_vars, to_vars)]
        return up_holder

    def train(self, episodes=5000, batch_size=64,
              max_steps=3000, training=True):
        """
            Trains the model
        """
        if training:
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                decay_step = 0
                tau = 0
                loss, acc = '', ''
                self.game.init()

                sess.run(self.update_target_graph())

                for episode in range(episodes):
                    step = 0
                    episode_rewards = []
                    self.game.new_episode()
                    state = self.game.get_state().screen_buffer
                    state, stacked_frames = stack_frames(
                        state, new_episode=True)

                    while step <= max_steps:
                        step += 1
                        tau += 1
                        decay_step += 1

                        action, explore_prob = self.predict_action(
                            sess, state, decay_step)
                        reward = self.game.make_action(action)
                        episode_rewards += [reward]
                        done = self.game.is_episode_finished()

                        if done:
                            next_state = np.zeros(
                                # (120, 140),
                                resolution,
                                dtype=np.int)
                            next_state, stacked_frames = stack_frames(
                                next_state, stacked_frames)
                            step = max_steps
                            total_reward = np.sum(episode_rewards)

                            print(f'Episode {episode}' +
                                  f'Total reward: {total_reward}' +
                                  f'loss: {loss}' +
                                  f'acc: {acc}' +
                                  f'Explore prob: {explore_prob}'
                                  )
                            exp = state, action, reward, next_state, done
                            self.memory.add(exp)
                        elif not done:
                            next_state = self.game.get_state().screen_buffer
                            next_state, stacked_frames = stack_frames(
                                next_state, stacked_frames)
                            self.memory + (state, action, reward,
                                           next_state, done)
                            state = next_state
                        loss, abs_err = self._learn(
                            sess, episode, batch_size)
                        print(f'Episode: {episode}, loss {loss}')
                        if tau > self.max_tau:
                            sess.run(self.update_target_graph())
                            tau = 0

                        self.save(sess, episode, interval=5)

    def _learn(self, sess, episode, batch_size):
        """
            Uses experiences stored in memory to get
            target Q values
        """
        mini_batches, tree_index = self.sample_experiences(batch_size)
        targets = self.get_target_Qs(sess, mini_batches)
        loss, abs_errs = self.find_loss(
            sess, targets, mini_batches)

        self.memory.update_priorities(tree_index, abs_errs)
        mini_batches.update({'targets': targets})
        # self.summarize(sess, episode, mini_batches)

        return loss, abs_errs

    def get_target_Qs(self, sess, mini_batch):
        """
            Sets the target_ Q as r for episodes ending at s + 1
            else, at r + gamma * max[Q(s',a')]
        """
        q_next_state = sess.run(self.output,
                                feed_dict={
                                    self.inputs: mini_batch.get('next_states')
                                })
        q_target_ns = sess.run(
            target_net.output,
            feed_dict={
                target_net.inputs: mini_batch.get('next_states')})

        target_Qs = []

        for i in range(mini_batch.get('batch_len')):
            terminal = mini_batch.get('dones')[i]
            action = np.argmax(q_next_state[i])
            rewards = mini_batch.get('rewards')[i]

            if terminal:
                target_Qs.append(rewards)
            else:
                target_Qs.append(rewards + self.gamma * q_target_ns[i][action])

        targets_mb = [m_b for m_b in target_Qs]

        return targets_mb

    def find_loss(self, sess, targets, mini_batches):
        """
            Finds difference between Q and targets
        """
        _, loss, err = sess.run(
            [self.optimizer, self.loss, self.abs_errors],
            feed_dict={self.inputs: mini_batches.get('states'),
                       self.target_Q: targets,
                       self.actions: mini_batches.get('actions'),
                       self.ISweights: mini_batches.get(
                'ISweights')
            })
        return loss, err

    def sample_experiences(self, batch_size):
        """
            Samples experience mini batches from memory
        """
        tree_index, batch, IS_weights = self.memory.sample(batch_size)
        states = self.__from_memory(batch, key=0, min_dims=3)
        actions = self.__from_memory(batch, 1)
        rewards = self.__from_memory(batch, 2)
        next_states = self.__from_memory(batch, 3, 3)
        dones = self.__from_memory(batch, 4)

        return {
            'states': states,
            'actions': actions,
            'rewards': rewards,
            'next_states': next_states,
            'dones': dones,
            'batch_len': len(batch),
            'ISweights': IS_weights
        }, tree_index

    def __from_memory(self, batch, key, min_dims=0):
        """
            Gives states, actions, rewards, as mini
            batches from a memory sample
        """
        f_key = 0
        m_b = np.array([m_bch[f_key][key]for m_bch in batch], ndmin=min_dims)

        return m_b

    def summarize(self, sess, episode, batches):
        """
            Writes tf summaries
        """
        summary = sess.run(
            self.writer_op,
            feed_dict={self.inputs: batches.get('states'),
                       self.target_Q: batches.get('targets'),
                       self.actions: batches.get('actions'),
                       self.ISweights: batches.get('ISweights')
                       })
        self.writer.add_summary(summary, episode)
        self.writer.flush()

    def save(self, sess, episode, interval):
        """
            Updates and saves the model
        """
        if not episode % interval:
            self.saver.save(sess, './models/dddqn.ckpt')

    def play(self, episodes=25):
        """
           Plays the trained agent
        """
        path = '/usr/local/lib/python3.7/dist-packages/vizdoom/scenarios/'

        with tf.compat.v1.Session() as sess:
            game, actions_choice = create_env(visible=True)
            game.load_config(os.path.join(path,
                                          'deadly_corridor.cfg'))
            game.set_doom_scenario_path(
                os.path.join(path, 'deadly_corridor.wad'))
            eps = .01

            self.saver.restore(sess, './models/dddqn.ckpt')
            game.init()
            total_score = []

            for i in range(episodes):
                game.new_episode()
                state = game.get_state().screen_buffer
                state, stacked_frames = stack_frames(state, new_episode=True)

                while not game.is_episode_finished():
                    tradeoff = np.random.randn()

                    if tradeoff > eps:

                        action = actions_choice[np.random.choice(
                            actions_choice.shape[0], size=1)][0]
                    else:
                        # Exploit -> Estimate Q values state
                        Qs = sess.run(
                            self.output,
                            feed_dict={
                                self.inputs: state.reshape((1, *state.shape))})
                        # Best action
                        choice = np.argmax(Qs)
                        action = self.actions_choice[int(choice)]
                    game.make_action(list(action))
                    done = game.is_episode_finished()

                    if not done:
                        next_state = game.get_state().screen_buffer
                        next_state, stacked_frames = stack_frames(
                            next_state, stacked_frames)
                        state = next_state
                    else:
                        break
                reward = game.get_total_reward()
                print(f'reward: {reward}')
                total_score.append(reward)
            print(f'\nScore: {np.sum(total_score) / episodes}')
            game.close()
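The aggregate() method above combines the two streams with the dueling-network identity Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a')), which keeps the advantages zero-mean so the value and advantage streams stay identifiable. A framework-free NumPy sketch of the same computation (shapes and numbers are illustrative):

import numpy as np

value = np.array([[1.0]])                 # V(s), shape (batch, 1)
advantage = np.array([[2.0, 0.5, -0.5]])  # A(s, a), shape (batch, n_actions)

q = value + (advantage - advantage.mean(axis=1, keepdims=True))
print(q)  # [[ 2.333  0.833 -0.167]] -- each row of q averages back to V(s)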
Code example #9
    action = possible_actions[randint(0, 3)]

    # Get rewards
    terminal, reward = game.perform_action(action)

    # Look if the episode is finished
    # done = game.is_episode_finished()

    # If episode ends
    if terminal:

        # episode finishes
        next_state = np.zeros(state.shape)

        # Add experience to memory
        memory.add((state, action, reward, next_state, terminal))

        # Start a new episode
        game.reset()

        # get a state
        state, color_frame = game.createImage()

        # Stack the frames
        state, stacked_frames = stack_frames(stacked_frames, state, True)

    else:
        # Get next state
        next_state, color_frame = game.createImage()
        next_state, stacked_frames = stack_frames(stacked_frames, next_state,
                                                  False)
Code example #10
File: agent.py Project: yhcao6/DQN
class Agent(object):
    def __init__(self, sess):
        self.sess = sess
        # some config
        self.state_size = 4
        self.n_action = 2
        self.epsilon = 0.1
        self.max_epsilon = 1
        self.min_epsilon = 0.01
        self.epsilon_decay_rate = 0.001
        self.discount = 0.99
        self.steps = 0
        self.batch_size = 64
        self.lr = 0.00025

        self.memory = Memory()

        # build network
        self._build_network()

        self.loss_summary = tf.summary.scalar('loss', self.loss)

        self.writer = tf.summary.FileWriter('logs/', sess.graph)

        sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def _build_network(self):
        self.w = {}

        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu

        with tf.variable_scope('prediction'):
            # input
            self.s = tf.placeholder(tf.float32, [None, self.state_size], name='s')

            # l1
            self.l1, self.w['l1_w'], self.w['l1_b'] = linear(self.s, 64, initializer, activation_fn, name='l1')

            # q
            self.q, self.w['q_w'], self.w['q_b'] = linear(self.l1, self.n_action, initializer, activation_fn=None, name='q')

        with tf.variable_scope('loss'):
            self.target_q = tf.placeholder(tf.float32, [None, 1], name='target_q')
            self.action = tf.placeholder('int64', [None, 1], name='action')

            action_one_hot = tf.one_hot(self.action, 2)[:, 0, :]
            q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted')
            q_acted = tf.reshape(q_acted, [-1, 1])

            self.loss = tf.losses.mean_squared_error(self.target_q, q_acted)

            self.optimizer = tf.train.RMSPropOptimizer(self.lr)

            self.train_op = self.optimizer.minimize(self.loss)

    def act(self, s):
        s = np.array(s)
        s = s[np.newaxis, ...]
        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_action)
        else:
            return np.argmax(self.q.eval({self.s: s}), axis=1)[0]

    def observe(self, s, a, r, s_, terminal):
        self.memory.add(s, a, r, s_, terminal)

        self.steps += 1
        self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.epsilon_decay_rate * self.steps)

    def replay(self):
        s, a, r, s_, terminal = self.memory.sample(self.batch_size)

        q_ = self.q.eval({self.s: s_})
        max_q_ = np.max(q_, axis=1).reshape([-1, 1])
        # TD target: r + gamma * max_a' Q(s', a'), with the bootstrap term zeroed on terminal transitions
        target_q = np.reshape(r, [-1, 1]) + (1 - np.reshape(terminal, [-1, 1])) * self.discount * max_q_

        self.write_loss, _ = self.sess.run([self.loss_summary, self.train_op], {self.s: s, self.target_q: target_q, self.action: a})

    def save(self, epsodes):
        self.saver.save(self.sess, 'save/cart_pole', global_step=epsodes)
Code example #11
class DDPG:

    def __init__(self, sess, params):
        self.sess = sess
        self.__dict__.update(params)
        # create placeholders
        self.create_input_placeholders()
        # create actor/critic models
        self.actor = Actor(self.sess, self.inputs, **self.actor_params)
        self.critic = Critic(self.sess, self.inputs, **self.critic_params)
        self.noise_params = {k: np.array(list(map(float, v.split(","))))
                             for k, v in self.noise_params.items()}
        self.noise = Noise(**self.noise_params)
        self.ou_level = np.zeros(self.dimensions["u"])
        self.memory = Memory(self.n_mem_objects,
                             self.memory_size)

    def create_input_placeholders(self):
        self.inputs = {}
        with tf.name_scope("inputs"):
            for ip_name, dim in self.dimensions.items():
                self.inputs[ip_name] = tf.placeholder(tf.float32,
                                                      shape=(None, dim),
                                                      name=ip_name)
            self.inputs["g"] = tf.placeholder(tf.float32,
                                              shape=self.inputs["u"].shape,
                                              name="a_grad")
            self.inputs["p"] = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name="pred_q")

    def step(self, x, is_u_discrete, explore=True):
        x = x.reshape(-1, self.dimensions["x"])
        u = self.actor.predict(x)
        if explore:
            self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
            u = u + self.ou_level
        q = self.critic.predict(x, u)
        if is_u_discrete:
            return [np.argmax(u), u[0], q[0]]
        return [u[0], u, q[0]]

    def remember(self, experience):
        self.memory.add(experience)

    def train(self):
        # check if the memory contains enough experiences
        if self.memory.size < 3*self.b_size:
            return
        x, g, ag, u, r, nx, ng, t = self.get_batch()
        # for her transitions
        her_idxs = np.where(np.random.random(self.b_size) < 0.80)[0]
        # print("{} of {} selected for HER transitions".
        # format(len(her_idxs), self.b_size))
        g[her_idxs] = ag[her_idxs]
        r[her_idxs] = 1
        t[her_idxs] = 1
        x = np.hstack([x, g])
        nx = np.hstack([nx, ng])
        nu = self.actor.predict_target(nx)
        tq = r + self.gamma*self.critic.predict_target(nx, nu)*(1-t)
        self.critic.train(x, u, tq)
        grad = self.critic.get_action_grads(x, u)
        # print("Grads:\n", g)
        self.actor.train(x, grad)
        self.update_targets()

    def get_batch(self):
        return self.memory.sample(self.b_size)

    def update_targets(self):
        self.critic.update_target()
        self.actor.update_target()
Code example #12
File: agent.py Project: benoit-penelle/snake-master
class Agent:
    def __init__(self, input_shape, action_count, steps=0, model_path=None, learning_rate=None):
        if learning_rate is not None:
            SET_LEARNING_RATE(learning_rate)
        self.steps = steps
        self.epsilon = MAX_EPSILON if steps == 0 else self.__calc_epsilon(steps)
        self.brain = Brain(action_count, input_shape=input_shape, model_path=model_path)
        self.memory = Memory(MEMORY_CAPACITY)
        self.input_shape = input_shape
        self.action_count = action_count

    def act(self, s):
        action = -1
        if random.random() < self.epsilon:
            action = random.randint(0, self.action_count - 1)
        else:
            predictions = np.squeeze(self.brain.predict(s.astype(np.float32)))
            action = round(np.argmax(predictions))
            weight_sqrsum = 0
            for i in range(self.action_count):
                if predictions[i] < 0 or predictions[i] * 2 < predictions[action]:
                    predictions[i] = 0
                else:
                    weight_sqrsum += math.pow(predictions[i], 2)
            if weight_sqrsum != 0:
                dice = random.random() * weight_sqrsum
                weight_begin = 0
                for i in range(self.action_count):
                    if weight_begin < dice and dice < weight_begin + math.pow(predictions[i], 2):
                        action = i
                        break
                    else:
                        weight_begin = math.pow(predictions[i], 2)                
        return action

    def observe(self, sample):  # in (s, a, r, s_) format
        self.memory.add(sample)

    def __calc_epsilon(self, steps):
        return MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * steps)

    def replay(self, batch_size=BATCH_SIZE):
        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = self.__calc_epsilon(self.steps)

        batch = self.memory.sample(batch_size)
        batch_len = len(batch)

        no_state = np.zeros(self.input_shape)

        # CNTK: explicitly setting to float32
        states = np.array([o[0] for o in batch], dtype=np.float32)
        states_ = np.array([(no_state if o[3] is None else o[3]) for o in batch], dtype=np.float32)

        p = self.brain.predict(states)
        p_ = self.brain.predict(states_)

        # CNTK: explicitly setting to float32
        x = np.zeros((batch_len, *self.input_shape)).astype(np.float32)
        y = np.zeros((batch_len, self.action_count)).astype(np.float32)

        for i in range(batch_len):
            s, a, r, s_ = batch[i]

            # CNTK: [0] because of sequence dimension
            t = p[0][i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * np.amax(p_[0][i])

            x[i] = s
            y[i] = t

        self.brain.train(x, y)
Code example #13
class Brain:
    train_queue = [ [], [], [], [], [] ]    # s, a, r, s', s' terminal mask
    lock_queue = threading.Lock()
    def __init__(self, agent, modelFunc=None):
        self.initialized = False
        self.finalized = False
        self.c = 0
        self.agent = agent
        self.state_dim = self.agent.state_dim
        self.action_dim = self.agent.action_dim
        self.gamma = self.agent.h.gamma
        self.n_step_return = self.agent.h.memory_size
        self.gamma_n = self.gamma ** self.n_step_return
        self.loss_v = self.agent.h.extra.loss_v
        self.loss_entropy = self.agent.h.extra.loss_entropy
        self.batch = self.agent.h.batch
        self.learning_rate = self.agent.h.learning_rate
        self.brain_memory_size = self.agent.args.hyper.extra.brain_memory_size
        
        
        self.env = self.agent.args.env
        self.metrics = self.agent.metrics
        
        self.brain_memory = Memory(self.brain_memory_size, self.state_dim, self.action_dim)
        if self.agent.args.data: # Load memory
            s, a, r, s_, t = loadMemory_direct('../data/' + self.agent.args.data + '/')
            self.brain_memory.add(s, a, r, s_, t)
            
            
        self.NONE_STATE = np.zeros(self.state_dim)
        
        self.visualization = agent.visualization
        self.model = self.create_model(modelFunc)
        
    def init_model(self):
        if self.initialized == True:
            return
        if self.visualization == False:
        #######################################
            self.session = tf.Session()
            K.set_session(self.session)
            K.manual_variable_initialization(True)
            self.graph = self.create_graph(self.model)
    
            self.session.run(tf.global_variables_initializer())
            self.default_graph = tf.get_default_graph()
            
        self.initialized = True
            #    # avoid modifications
        #######################################
    
    def init_vars(self):
        init_op = tf.global_variables_initializer()
        self.session.run(init_op)
        
    def finalize_model(self):
        if self.finalized == True:
            return
        self.default_graph.finalize()
        self.finalized = True
        #for layer in self.model.layers:
        #    weights = layer.get_weights()
        #    print(np.sum(np.sum(weights)))
            #c += 1
            #print(c)
            #print(np.sum(layer.get_weights()))
        
        

    def create_model(self, modelFunc=None):
        print(self.state_dim)
        print(self.action_dim)
        if not modelFunc:
            modelFunc = models.model_mid_default
        model = models.model_start(self.state_dim, self.action_dim, models.model_top_a3c, modelFunc, self.visualization)
        
        model._make_predict_function() # have to initialize before threading
        print("Finished building the model")
        print(model.summary())
        return model
        
    def create_graph(self, model):
        batch_size = None # = None
        state_dim = [batch_size] + self.state_dim
        print(state_dim)
        s_t = tf.placeholder(tf.float32, shape=(state_dim))
        a_t = tf.placeholder(tf.float32, shape=(batch_size, self.action_dim))
        r_t = tf.placeholder(tf.float32, shape=(batch_size, 1)) # Discounted Reward
        
        p, v = model(s_t)

        log_prob = tf.log( tf.reduce_sum(p * a_t, axis=1, keep_dims=True) + 1e-6) # Negative, larger when action is less likely
        advantage = r_t - v

        loss_policy = - log_prob * tf.stop_gradient(advantage) # Pos if better than expected, Neg if bad
        loss_value  = self.loss_v * tf.square(advantage) # Positive # minimize value error
        entropy = self.loss_entropy * tf.reduce_sum(p * tf.log(p + 1e-6), axis=1, keep_dims=True) # Negative Value

        loss_total = tf.reduce_mean(loss_policy + loss_value + entropy)

        optimizer = tf.train.AdamOptimizer(self.learning_rate, epsilon=1e-3)
        minimize = optimizer.minimize(loss_total)

        return s_t, a_t, r_t, minimize, loss_total, log_prob, loss_policy, loss_value, entropy
        
        
    def optimize_batch_full(self, reset=1, suppress=1): # Use for online learning
        if self.brain_memory.isFull != True:
            return
        
        idx = np.arange(0, self.brain_memory.max_size)
        
        self.optimize_batch_index(idx, 1, reset, suppress)
        

    def optimize_batch_full_multithread(self, reset=1, suppress=1): # Use for online learning
        if self.brain_memory.isFull != True:
            time.sleep(0)	# yield
            return
        
           
        idx = np.arange(0, self.brain_memory.max_size)
        
        self.optimize_batch_index_multithread(idx, 1, reset, suppress)
        

    def optimize_batch(self, batch_count=1, suppress=0): # Use for offline learning
        if self.brain_memory.isFull != True:
            time.sleep(0)	# yield
            return

        idx = self.brain_memory.sample(self.batch * batch_count)
        self.optimize_batch_index(idx, batch_count, suppress)
        
    def optimize_batch_index(self, idx, batch_count=1, reset=0, suppress=0):
        s  = self.brain_memory.s [idx, :]
        a  = self.brain_memory.a [idx, :]
        r  = np.copy(self.brain_memory.r [idx, :])
        s_ = self.brain_memory.s_[idx, :]
        t  = self.brain_memory.t [idx, :]
        
        if reset == 1:
            self.brain_memory.isFull = False
            self.brain_memory.size = 0
            
        self.optimize_batch_child(s, a, r, s_, t, batch_count, suppress)

    def optimize_batch_index_multithread(self, idx, batch_count=1, reset=1, suppress=0):
        with self.lock_queue:
            if self.brain_memory.isFull != True:
                return
             
            s  = np.copy(self.brain_memory.s [idx, :])
            a  = np.copy(self.brain_memory.a [idx, :])
            r  = np.copy(self.brain_memory.r [idx, :])
            s_ = np.copy(self.brain_memory.s_[idx, :])
            t  = np.copy(self.brain_memory.t [idx, :])
        
            if reset == 1:
                self.brain_memory.isFull = False
                self.brain_memory.size = 0
                
        self.c += 1
        self.optimize_batch_child(s, a, r, s_, t, batch_count, suppress)    
        
    def optimize_batch_child(self, s, a, r, s_, t, batch_count=1, suppress=0):
        s_t, a_t, r_t, minimize, loss_total, log_prob, loss_policy, loss_value, entropy = self.graph
        for i in range(batch_count):
            start = i * self.batch
            end = (i+1) * self.batch
            r[start:end] = r[start:end] + self.gamma_n * self.predict_v(s_[start:end]) * t[start:end] # set v to 0 where s_ is terminal state
            _, loss_current, log_current, loss_p_current, loss_v_current, entropy_current = self.session.run([minimize, loss_total, log_prob, loss_policy, loss_value, entropy], feed_dict={s_t: s[start:end], a_t: a[start:end], r_t: r[start:end]})    
            
            #self.metrics.a3c.update(loss_current, log_current, loss_p_current, loss_v_current, entropy_current)
            
            if i % 10 == 0 and suppress == 0:
                print('\r', 'Learning', '(', i, '/', batch_count, ')', end="")
        
        if suppress == 0:
            print('\r', 'Learning', '(', batch_count, '/', batch_count, ')')
        
    def train_augmented(self, s, a, r, s_):
        if self.env.problem == 'Hexagon':
            if s_ is None:
                self.train_push_all_augmented(data_aug.full_augment([[s, a, r, self.NONE_STATE, 0.]]))
            else:    
                self.train_push_all_augmented(data_aug.full_augment([[s, a, r, s_, 1.]]))
        else:
            if s_ is None:
                self.train_push_augmented([s, a, r, self.NONE_STATE, 0.])
            else:    
                self.train_push_augmented([s, a, r, s_, 1.])
        
    def train_push_all_augmented(self, frames):
        for frame in frames:
            self.train_push_augmented(frame)
    # TODO: t value is flipped for brain memory and agent memory... should be consistent. Not a bug however.
    def train_push_augmented(self, frame):
        a_cat = np.zeros(self.action_dim)
        a_cat[frame[1]] = 1

        with self.lock_queue:
            if self.brain_memory.isFull == True:
                time.sleep(0)
                return
            self.brain_memory.add_single(frame[0], a_cat, frame[2], frame[3], frame[4])      
        #self.train_queue.append([frame[0], a_cat, frame[2], frame[3], frame[4]])
        
    def predict(self, s):
        with self.default_graph.as_default():
            p, v = self.model.predict(s)
            return p, v

    def predict_p(self, s):
        with self.default_graph.as_default():
            p, _ = self.model.predict(s)
            return p

    def predict_v(self, s):
        with self.default_graph.as_default():
            _, v = self.model.predict(s)
            return v
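optimize_batch_child above forms the n-step target R_t = r_t + gamma^n * V(s_{t+n}), with the stored mask t zeroing the bootstrap term for terminal states. A standalone NumPy illustration of that line (gamma and the step count are illustrative):

import numpy as np

gamma, n_step = 0.99, 8
gamma_n = gamma ** n_step

r = np.array([[1.0], [0.5]])        # accumulated n-step discounted rewards
v_next = np.array([[2.0], [3.0]])   # V(s_{t+n}) predicted by the value head
mask = np.array([[1.0], [0.0]])     # 0 where s_{t+n} is terminal

target = r + gamma_n * v_next * mask
print(target)  # the second row keeps only its reward, since its episode ended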
Code example #14
File: model.py Project: caozixuan/RL_Learning
class MADDPG(object):
    def __init__(self, n, state_global, action_global, gamma, memory_size):
        self.n = n
        self.gamma = gamma
        self.memory = Memory(memory_size)
        self.agents = [
            DDPGAgent(index, 1600, 400, 0.5, state_global, action_global)
            for index in range(0, n)
        ]

    def update_agent(self, sample, index):
        observations, actions, rewards, next_obs, dones = sample
        curr_agent = self.agents[index]
        curr_agent.critic_train.zero_grad()
        all_target_actions = []
        # Get actions from the target actor networks based on each agent's local observation
        for i in range(0, self.n):
            action = curr_agent.Actor_target(next_obs[:, i])
            all_target_actions.append(action)
        action_target_all = torch.cat(all_target_actions,
                                      dim=0).to(device).reshape(
                                          actions.size()[0],
                                          actions.size()[1],
                                          actions.size()[2])
        target_vf_in = torch.cat((next_obs, action_target_all), dim=2)
        # Compute the Bellman target value for the current transition using the target networks
        target_value = rewards[:,
                               index] + self.gamma * curr_agent.Critic_target(
                                   target_vf_in).squeeze(dim=1)
        vf_in = torch.cat((observations, actions), dim=2)
        actual_value = curr_agent.Critic(vf_in).squeeze(dim=1)
        # Compute the loss for the critic
        vf_loss = curr_agent.loss_td(actual_value, target_value.detach())

        vf_loss.backward()
        curr_agent.critic_train.step()

        curr_agent.actor_train.zero_grad()
        curr_pol_out = curr_agent.Actor(observations[:, index])
        curr_pol_vf_in = curr_pol_out
        all_pol_acs = []
        for i in range(0, self.n):
            if i == index:
                all_pol_acs.append(curr_pol_vf_in)
            else:
                all_pol_acs.append(self.agents[i].Actor(
                    observations[:, i]).detach())
        vf_in = torch.cat(
            (observations, torch.cat(all_pol_acs, dim=0).to(device).reshape(
                actions.size()[0],
                actions.size()[1],
                actions.size()[2])),
            dim=2)
        # Actor loss, as in DDPG
        pol_loss = -torch.mean(curr_agent.Critic(vf_in))
        pol_loss.backward()
        curr_agent.actor_train.step()

    def update(self, sample):
        for index in range(0, self.n):
            self.update_agent(sample, index)

    def update_all_agents(self):
        for agent in self.agents:
            soft_update(agent.Critic_target, agent.Critic, agent.tau)
            soft_update(agent.Actor_target, agent.Actor, agent.tau)

    def add_data(self, s, a, r, s_, done):
        self.memory.add(s, a, r, s_, done)

    def save_model(self, episode):
        for i in range(0, self.n):
            model_name_c = "Critic_Agent" + str(i) + "_" + str(episode) + ".pt"
            model_name_a = "Actor_Agent" + str(i) + "_" + str(episode) + ".pt"
            torch.save(self.agents[i].Critic_target,
                       'model_tag/' + model_name_c)
            torch.save(self.agents[i].Actor_target,
                       'model_tag/' + model_name_a)

    def load_model(self, episode):
        for i in range(0, self.n):
            model_name_c = "Critic_Agent" + str(i) + "_" + str(episode) + ".pt"
            model_name_a = "Actor_Agent" + str(i) + "_" + str(episode) + ".pt"
            self.agents[i].Critic_target = torch.load("model_tag/" +
                                                      model_name_c)
            self.agents[i].Critic = torch.load("model_tag/" + model_name_c)
            self.agents[i].Actor_target = torch.load("model_tag/" +
                                                     model_name_a)
            self.agents[i].Actor = torch.load("model_tag/" + model_name_a)
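update_all_agents() relies on a soft_update helper that is not part of this snippet; a common Polyak-averaging implementation matching the call signature soft_update(target, source, tau) (an assumption about the project's helper):

import torch


def soft_update(target_net, source_net, tau):
    """theta_target <- tau * theta_source + (1 - tau) * theta_target (Polyak averaging)."""
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
            t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)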
Code example #15
File: ddpg.py Project: superjax/NNOA
        action, q = sess.run([train_actor_output, train_critic_current_action], feed_dict={k: [[v]] for k, v in zip(state_placeholders, env_state)})

        action = action[0]

        action = action if testing else eta_noise.reflected_ou(action * np.array([1, 1, 0, 1]), theta=[.15, .15, .75, .15], sigma=[.10, .10, .10, .10], min=-1, max=1)

        assert action.shape == env.action_space.sample().shape, (action.shape, env.action_space.sample().shape)

        max_xvel = 20
        max_yvel = 8
        max_yawrate = 0.2
        max_altitude = 15
        action = np.clip(action, -1, 1) * np.array([max_xvel, max_yvel, max_yawrate, max_altitude / 4.0]) - np.array([0, 0, 0, max_altitude])

        env_next_state, env_reward, env_done, env_info = env.step(action)
        replay_buffer.add(env_state, env_reward, action, env_done, priority=300)

        env_state = env_next_state

        total_reward += env_reward

        if training:
            states_batch, action_batch, reward_batch, next_states_batch, done_batch, indexes = replay_buffer.sample(BATCH_SIZE, prioritized=True)

            feed = {
                action_placeholder: action_batch,
                reward_placeholder: reward_batch,
                done_placeholder: done_batch
            }

            feed.update({k: v for k, v in zip(state_placeholders, states_batch)})
Code example #16
File: agent2.py Project: arasdar/RL
class Agent():
    """Interacts with and learns from the environment (env)."""

    def __init__(self, s_size, a_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            s_size (int): dimension of each state (s)
            a_size (int): dimension of each action (a)
            random_seed (int): random seed
        """
        self.s_size = s_size
        self.a_size = a_size
        self.random_seed = random.seed(random_seed)

        # Q-Network
        self.q = Q(s_size, a_size, random_seed).to(device)
        self.q_target = Q(s_size, a_size, random_seed).to(device)
        self.optimizer = optim.Adam(self.q.parameters(), lr=LR)

        # Replay memory
        self.memory = Memory(a_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, s, a, r, s2, done):
        # Save/add experience in/to replay memory/buffer
        self.memory.add(s, a, r, s2, done)
        
        # Exploration vs exploitation
        # # Learn every UPDATE_EVERY time steps.
        # self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # if self.t_step == 0:
        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > BATCH_SIZE:
            E = self.memory.sample()  # E: experiences, e: experience
            self.learn(E, GAMMA)

    def act(self, s, eps=0.):
        """Returns an action (a) for a given state (s) as the current policy (a).
        
        Params
        ======
            state (array_like): current state (s)
            eps (float): epsilon, for epsilon-greedy action (a) selection
        """
        s = torch.from_numpy(s).float().unsqueeze(0).to(device)
        self.q.eval()
        with torch.no_grad():
            a_values = self.q(s) # a_values: action_values
        self.q.train()

        # # Epsilon-greedy (eps) action (a) selection
        # if random.random() > eps:
        return np.argmax(a_values.cpu().data.numpy())
        # else:
        #     return random.choice(np.arange(self.a_size))

    def learn(self, E, gamma):
        """Update value parameters using given batch of experience (e) tuples.

        Params
        ======
            exprience (Tuple[torch.Tensor]): tuple of (state, action, reward, next_state, done) 
            e (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) 
            e (Tuple[torch.Tensor]): tuple of (s, a, r, s2, done) 
            gamma (float): discount factor
        """
        S, A, rewards, S2, dones = E

        # Get max predicted Q (values) for next states (S2) from target model
        Q2 = self.q_target(S2).detach().max(1)[0].unsqueeze(1)
        print(self.q_target(S2).detach().max(1)[0].unsqueeze(1))
        print(self.q_target(S2).detach().max(1)[0])
        print(self.q_target(S2).detach().max(1))
        print(self.q_target(S2).detach())
        print(self.q_target(S2))
        
        # Compute Q target for current states (S)
        Q = rewards + (gamma * Q2 * (1 - dones))

        # Get expected Q (values) from local model
        Q_ = self.q(S).gather(1, A)
        print(self.q(S).gather(1, A))
        print(self.q(S))
        
        # Compute loss
        #loss = F.mse_loss(Q_expected, Q_targets)
        loss = ((Q_ - Q)**2).mean()
        
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.q, self.q_target, GAMMA)                     

    def soft_update(self, local_model, target_model, gamma):
        """Soft update model parameters.
        θ_target = (1-γ)*θ_local + γ*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(((1-gamma)*local_param.data) + (gamma*target_param.data))
Code example #17
                es_params[i] = updated_es_params[i]

        actor_steps = 0

        # evaluate noisy actor(s)
        outs = ray.get([
            workers[i].evaluate.remote(es_params[i],
                                       n_episodes=args.n_episodes,
                                       noise=a_noise)
            for i in range(args.n_noisy)
        ])

        for i, (f, steps, transitions, last_reward) in enumerate(outs):

            for transition in transitions:
                memory.add(transition)

            actor_steps += steps
            prCyan('Noisy actor {} fitness:{}'.format(i, f))

        # evaluate all actors
        outs = ray.get([
            workers[i].evaluate.remote(params, n_episodes=args.n_episodes)
            for i, params in enumerate(es_params)
        ])

        for f, steps, transitions, last_reward in outs:

            for transition in transitions:
                memory.add(transition)
Code example #18
class A2Cagent(nn.Module):
    def __init__(self):
        super(A2Cagent, self).__init__()

        self.A, self.C = Actor(), Critic()
        if USE_CUDA:
            self.A.cuda()
            self.C.cuda()

        self.opt = torch.optim.Adam(self.parameters(), lr=LEARNING_RATE)

        self.exp_buffer = Memory(EXP_BUFFER_MAX)

    def forward(self, x):
        a_p = self.A(x)
        v = self.C(x)
        return a_p, v

    def remember(self, x):
        self.exp_buffer.add(x)

    def get_adv(self, s, s_new, r_new):  # Gets A(s_t,a_t)
        adv = r_new  # reward
        adv += GAMMA * (self.C(s_new))  # value of next state
        adv -= self.C(s)  # minus the value of the current state

        return adv

    def act(self, s, action=None):
        prob, v = self.forward(s)
        dist = Categorical(prob)
        if action is None:
            action = dist.sample()
        log_prob = dist.log_prob(action)
        entropy = dist.entropy()
        return action, log_prob, entropy, v.squeeze()

    # Replay S states, A actions, R Rewards, Adv advantages
    def replay(self, batch_size):
        if self.exp_buffer.size < EXP_BUFFER_MIN: return 0, 0, 0
        S, S_next, R, Adv, log_p_old, A, _ = self.exp_buffer.get_batch(
            batch_size)

        self.opt.zero_grad()

        a, log_p, ent, v = self.act(S, A)

        # A2C loss function for actor
        p_loss_ratio = torch.exp(log_p - log_p_old)
        p_loss_1 = p_loss_ratio * Adv
        p_loss_2 = torch.clamp(p_loss_ratio, 1 - CLIP_RANGE,
                               1 + CLIP_RANGE) * Adv
        p_loss = -torch.min(p_loss_1, p_loss_2).mean()

        # MSE for Critic Loss
        v_loss = 0.5 * (R - v).pow(2).mean()
        ent = ent.mean()

        (p_loss + v_loss - BETA * ent).backward(retain_graph=True)
        nn.utils.clip_grad_norm_(self.parameters(), 5)
        self.opt.step()

        return p_loss, v_loss, ent
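Despite the A2C name, replay() above uses the PPO-style clipped surrogate: the probability ratio is clamped to [1 - CLIP_RANGE, 1 + CLIP_RANGE] before the policy loss is taken. In isolation the clipping behaves like this (toy tensors; the CLIP_RANGE value of 0.2 is an assumption):

import torch

CLIP_RANGE = 0.2  # assumed value
log_p = torch.tensor([0.0, -1.0])
log_p_old = torch.tensor([-0.5, -0.5])
adv = torch.tensor([1.0, -1.0])

ratio = torch.exp(log_p - log_p_old)                          # pi_new / pi_old, ~[1.65, 0.61]
clipped = torch.clamp(ratio, 1 - CLIP_RANGE, 1 + CLIP_RANGE)  # [1.20, 0.80]
loss = -torch.min(ratio * adv, clipped * adv).mean()
print(loss)  # the 1.65 ratio is capped, so overly large policy steps stop improving the objective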
Code example #19
class Agent(object):
    """
        Intelligent agent responsible for the decisions involved in reinforcement learning.
        During training, it follows the epsilon-greedy protocol.

        Arguments:
            action_pool (dict): Actions the agent can take. Each action has a numeric code.

        Attributes:
            _action_pool (dict): Map of the actions that can be taken ({'id': 'action'}).
            _last_action (str): Last action taken.
            _config (Configuration): Global configuration file.
            _memory (Memory): Memory that stores the actions taken, state changes and rewards. Stored items have the format {'state': '', 'action': '', 'reward': '', 'next_state': ''}.
            _taken_actions (list(str)): List of every action taken during the run.
            _weekdays_map (dict): Map from weekday abbreviations to numbers.

        Methods:

            take_action(model, environment, training, network, current_step, actions_taken)
                : The agent performs an action in the environment following the epsilon-greedy protocol.

            reset()
                : Resets the agent.

            sample_memory()
                : Draws a random sample from the agent's memory.

    """
    def __init__(self, action_pool={}):
        self._action_pool = action_pool
        self._last_action = None
        self._config = Configuration()
        self._memory = Memory(self._config.max_memory_size)
        self._taken_actions = []
        self._weekdays_map = {
            'MON': 1,
            'TUE': 2,
            'WED': 3,
            'THU': 4,
            'FRI': 5,
            'SAT': 6,
            'SUN': 7
        }

    def take_action(self, model, environment, training, network, current_step,
                    actions_taken):
        """
            O agente executa uma ação no ambiente seguindo o protocolo epsilon-greedy.
            Durante a etapa de treino, o protocolo é seguido. Durante a execução normal a ação tomada é gulosa.

            Parâmetros:
                model (OpenDssEngine): Motor do OpenDSS utilizado para a simulação.
                environment (Environment): Ambiente onde serão executadas as ações.
                training (bool): Indica se está no processo de treinamento ou não.
                network (Network): Rede utilizada para escolher a ação.
                current_step (int): Passo atual da simulação (qual minuto do dia).
                actions_taken (int): Quantas ações foram tomadas no passo atual.

            Erros:
                None

            Retorna:
                None

        """
        alpha = environment.get_base_learning_rate()
        gamma = environment.get_discount_factor()

        initial_state = deepcopy(model.get_state())
        initial_state_voltages = deepcopy(model.get_voltages())

        p = np.random.random()
        # Explore: with probability epsilon, take a random action (training only)
        if (training and p < environment.get_epsilon()):
            a = random.choice(list(self._action_pool.keys()))
            _a = self._action_pool[a]
            if _a:
                model.take_action(_a)
            new_state = deepcopy(model.get_state())
            new_state_voltages = deepcopy(model.get_voltages())
            reward = environment.calculate_reward(initial_state_voltages,
                                                  new_state_voltages, _a,
                                                  self._last_action)
            self._last_action = _a
            self._memory.add({
                'state':
                deepcopy(
                    initial_state.state_space_repr(
                        current_step, actions_taken,
                        self._weekdays_map[model.get_weekday()])),
                'action':
                deepcopy(a),
                'reward':
                deepcopy(reward),
                'next_state':
                deepcopy(
                    new_state.state_space_repr(
                        current_step, actions_taken,
                        self._weekdays_map[model.get_weekday()]))
            })
        # Exploit: take the greedy action suggested by the network
        else:
            inputs = np.expand_dims(
                np.array(initial_state.state_space_repr(
                    current_step, actions_taken,
                    self._weekdays_map[model.get_weekday()]),
                         dtype=np.float32), 0)
            a = np.squeeze(np.argmax(network.model(inputs), axis=-1))
            _a = self._action_pool[int(a)]
            if _a:
                model.take_action(_a)

            self._taken_actions.append(_a)

            if training:
                new_state = deepcopy(model.get_state())
                new_state_voltages = deepcopy(model.get_voltages())
                reward = environment.calculate_reward(initial_state_voltages,
                                                      new_state_voltages, _a,
                                                      self._last_action)
                self._last_action = _a
                self._memory.add({
                    'state':
                    deepcopy(
                        initial_state.state_space_repr(
                            current_step, actions_taken,
                            self._weekdays_map[model.get_weekday()])),
                    'action':
                    deepcopy(a),
                    'reward':
                    deepcopy(reward),
                    'next_state':
                    deepcopy(
                        new_state.state_space_repr(
                            current_step, actions_taken,
                            self._weekdays_map[model.get_weekday()]))
                })

    def reset(self):
        """
            Apaga a memória, ações tomadas e última ação tomada do agente.

            Parâmetros:
                None

            Erros:
                None

            Retorna:
                None

        """
        self._last_action = None
        self._memory = Memory(self._config.max_memory_size)
        self._taken_actions = []

    def sample_memory(self):
        """
            Faz uma amostragem aleatória da memória do agente. 
            Função utilizada no treinamento da rede.

            Parâmetros:
                None

            Erros:
                None

            Retorna:
                Lista com itens aleatórios da memória.

        """
        return self._memory.sample(self._config.memory_batch_size)
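As the docstring notes, training-time exploration follows the epsilon-greedy protocol. A minimal standalone sketch of that protocol; the q_values array and epsilon value below are illustrative and not part of the class above:

import random

import numpy as np

def epsilon_greedy(q_values, epsilon):
    # With probability epsilon take a random action, otherwise the greedy one
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    return int(np.argmax(q_values))

action = epsilon_greedy(np.array([0.1, 0.5, 0.2]), epsilon=0.1)  # usually returns 1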
コード例 #20
0
class MADDPG:
    def __init__(self, state_size, action_size, num_agents, config):
        ''' Constructs the multi-agent ecosystem '''
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print(f'Using {self.device}')

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        self.ddpg_agents = [
            DDPGAgent(config['seed'] + idx, state_size, action_size,
                      num_agents, self.device, config)
            for idx in range(num_agents)
        ]

        self.update_every = config['update_every']
        self.update_iters = config['update_iterations']

        # Note: Could be replaced by parallel env batching
        seed = config['seed']
        self.batch_size = config['batch_size']
        self.memory = Memory(config['memory_size'], self.batch_size, seed)
        self.memory.to_device(self.device)

    def reset_noise(self):
        '''Resets the noise amplitude for each ddpg agent'''
        [agent.reset_noise() for agent in self.ddpg_agents]

    def act(self, states):
        ''' For each agent idx, select a_idx = policy_idx(o_idx) + noise '''
        actions = [
            self.ddpg_agents[idx].act(states[np.newaxis, idx]).squeeze(0)
            for idx in range(self.num_agents)
        ]
        return actions

    # Note: We need to add all the observations, otherwise we break the stationarity of the environment
    def remember(self, states, actions, rewards, next_states, dones):
        '''Populates the replay memory with new batch of data; observations of all agents'''
        self.memory.add(
            Experience(states, actions, rewards, next_states, dones))

    def step(self, timestep):
        '''Steps through each ddpg agent'''
        if len(self.memory
               ) > self.batch_size and timestep % self.update_every == 0:
            for _ in range(self.update_iters):
                for idx in range(self.num_agents):
                    states, actions, rewards, next_states, dones = self.memory.sample(
                    )

                    predicted_best_next_actions = torch.cat([
                        self.ddpg_agents[idx].target_actor(next_states[:,
                                                                       idx, :])
                        for idx in range(self.num_agents)
                    ],
                                                            dim=1)

                    predicted_best_current_actions = torch.cat([
                        self.ddpg_agents[idx].learnt_actor(states[:, idx, :])
                        for idx in range(self.num_agents)
                    ],
                                                               dim=1)

                    states = torch.cat(
                        [states[:, idx, :] for idx in range(self.num_agents)],
                        dim=1)
                    actions = torch.cat(
                        [actions[:, idx, :] for idx in range(self.num_agents)],
                        dim=1)
                    next_states = torch.cat([
                        next_states[:, idx, :]
                        for idx in range(self.num_agents)
                    ],
                                            dim=1)

                    self.ddpg_agents[idx].step(predicted_best_current_actions,
                                               predicted_best_next_actions,
                                               states, actions, rewards,
                                               next_states, dones)

    def save(self, actor_weights_path, critic_weights_path):
        [
            torch.save(self.ddpg_agents[idx].learnt_actor.state_dict(),
                       actor_weights_path + str(idx + 1) + '.pth')
            for idx in range(self.num_agents)
        ]
        [
            # Save the critic weights under the critic path; the attribute name
            # `learnt_critic` is assumed to mirror `learnt_actor` above.
            torch.save(self.ddpg_agents[idx].learnt_critic.state_dict(),
                       critic_weights_path + str(idx + 1) + '.pth')
            for idx in range(self.num_agents)
        ]
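The note above about storing every agent's observations reflects the centralized-critic setup: during training each critic sees the joint state and joint action of all agents. A toy sketch of the concatenation performed in step(); the batch and feature sizes are arbitrary assumptions:

import torch

batch, num_agents, state_size, action_size = 4, 2, 3, 2
states = torch.randn(batch, num_agents, state_size)
actions = torch.randn(batch, num_agents, action_size)

# Flatten the per-agent dimension so a centralized critic sees everything at once
joint_states = torch.cat([states[:, i, :] for i in range(num_agents)], dim=1)
joint_actions = torch.cat([actions[:, i, :] for i in range(num_agents)], dim=1)
assert joint_states.shape == (batch, num_agents * state_size)
assert joint_actions.shape == (batch, num_agents * action_size)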
コード例 #21
0
class DQN:

    def __init__(self, env, params):
        self.env = env
        params.actions = env.actions()
        self.num_actions = env.actions()
        self.episodes = params.episodes
        self.steps = params.steps
        self.train_steps = params.train_steps
        self.update_freq = params.update_freq
        self.save_weights = params.save_weights
        self.history_length = params.history_length
        self.discount = params.discount
        self.eps = params.init_eps
        self.eps_delta = (params.init_eps - params.final_eps) / params.final_eps_frame
        self.replay_start_size = params.replay_start_size
        self.eps_endt = params.final_eps_frame
        self.random_starts = params.random_starts
        self.batch_size = params.batch_size
        self.ckpt_file = params.ckpt_dir+'/'+params.game

        self.global_step = tf.Variable(0, trainable=False)
        if params.lr_anneal:
            self.lr = tf.train.exponential_decay(params.lr, self.global_step, params.lr_anneal, 0.96, staircase=True)
        else:
            self.lr = params.lr

        self.buffer = Buffer(params)
        self.memory = Memory(params.size, self.batch_size)

        with tf.variable_scope("train") as self.train_scope:
            self.train_net = ConvNet(params, trainable=True)
        with tf.variable_scope("target") as self.target_scope:
            self.target_net = ConvNet(params, trainable=False)

        self.optimizer = tf.train.RMSPropOptimizer(self.lr, params.decay_rate, 0.0, self.eps)

        self.actions = tf.placeholder(tf.float32, [None, self.num_actions])
        self.q_target = tf.placeholder(tf.float32, [None])
        self.q_train = tf.reduce_max(tf.mul(self.train_net.y, self.actions), reduction_indices=1)
        self.diff = tf.sub(self.q_target, self.q_train)

        half = tf.constant(0.5)
        if params.clip_delta > 0:
            abs_diff = tf.abs(self.diff)
            clipped_diff = tf.clip_by_value(abs_diff, 0, 1)
            linear_part = abs_diff - clipped_diff
            quadratic_part = tf.square(clipped_diff)
            self.diff_square = tf.mul(half, tf.add(quadratic_part, linear_part))
        else:
            self.diff_square = tf.mul(half, tf.square(self.diff))

        if params.accumulator == 'sum':
            self.loss = tf.reduce_sum(self.diff_square)
        else:
            self.loss = tf.reduce_mean(self.diff_square)

        # backprop with RMS loss
        self.task = self.optimizer.minimize(self.loss, global_step=self.global_step)

    def randomRestart(self):
        self.env.restart()
        for _ in range(self.random_starts):
            action = rand.randrange(self.num_actions)
            reward = self.env.act(action)
            state = self.env.getScreen()
            terminal = self.env.isTerminal()
            self.buffer.add(state)

            if terminal:
                self.env.restart()

    def trainEps(self, train_step):
        # Linear annealing: epsilon reaches its final value once train_step hits eps_endt
        if train_step < self.eps_endt:
            return self.eps - train_step * self.eps_delta
        else:
            return self.eps - self.eps_endt * self.eps_delta

    def observe(self, exploration_rate):
        if rand.random() < exploration_rate:
            a = rand.randrange(self.num_actions)
        else:
            x = self.buffer.getInput()
            action_values = self.train_net.y.eval( feed_dict={ self.train_net.x: x } )
            a = np.argmax(action_values)
        
        state = self.buffer.getState()
        action = np.zeros(self.num_actions)
        action[a] = 1.0
        reward = self.env.act(a)
        screen = self.env.getScreen()
        self.buffer.add(screen)
        next_state = self.buffer.getState()
        terminal = self.env.isTerminal()

        self.memory.add(state, action, reward, next_state, terminal)
        
        return state, action, reward, next_state, terminal

    def doMinibatch(self, sess, successes, failures):
        batch = self.memory.getSample()
        state = np.array([batch[i][0] for i in range(self.batch_size)]).astype(np.float32)
        actions = np.array([batch[i][1] for i in range(self.batch_size)]).astype(np.float32)
        rewards = np.array([batch[i][2] for i in range(self.batch_size)]).astype(np.float32)
        successes += np.sum(rewards==1)
        failures += np.sum(rewards==-1)
        next_state = np.array([batch[i][3] for i in range(self.batch_size)]).astype(np.float32)
        terminals = np.array([batch[i][4] for i in range(self.batch_size)]).astype(np.float32)

        rewards = np.clip(rewards, -1.0, 1.0)

        q_target = self.target_net.y.eval( feed_dict={ self.target_net.x: next_state } )
        q_target_max = np.max(q_target, axis=1)  # max target Q-value, not the argmax index
        q_target = rewards + ((1.0 - terminals) * (self.discount * q_target_max))

        (result, loss) = sess.run( [self.task, self.loss],
                                    feed_dict={ self.q_target: q_target,
                                                self.train_net.x: state,
                                                self.actions: actions } )

        return successes, failures, loss

    def play(self):
        self.randomRestart()
        self.env.restart()
        for i in xrange(self.episodes):
            terminal = False
            while not terminal:
                state, action, reward, next_state, terminal = self.observe(self.eps)

    def copy_weights(self, sess):
        for key in self.train_net.weights.keys():
            t_key = 'target/' + key.split('/', 1)[1]
            sess.run(self.target_net.weights[t_key].assign(self.train_net.weights[key]))

    def save(self, saver, sess, step):
        saver.save(sess, self.ckpt_file, global_step=step)
        
    def restore(self, saver, sess):
        ckpt = tf.train.get_checkpoint_state(self.ckpt_file)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
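doMinibatch() builds the standard DQN target y = r + (1 − terminal) · γ · max_a Q_target(s′, a). A small NumPy check of that computation with made-up values:

import numpy as np

def dqn_target(q_next, rewards, terminals, discount=0.99):
    q_next_max = np.max(q_next, axis=1)  # the max Q-value, not the argmax index
    return rewards + (1.0 - terminals) * discount * q_next_max

q_next = np.array([[0.2, 1.0], [0.5, 0.1]])
print(dqn_target(q_next, np.array([1.0, 0.0]), np.array([0.0, 1.0])))  # [1.99 0.  ]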
コード例 #22
0
ファイル: player.py プロジェクト: danielRM88/deepQ
class Player:
    def __init__(self, game):
        with open("config.yaml", 'r') as stream:
            try:
                config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)

        self.batch_size = config['batch_size']
        self.learning_rate = config['learning_rate']
        self.memory_size = config['memory_size']
        self.gamma = config['gamma']
        self.epsilon = config['epsilon']
        self.explore_start = config['explore_start']
        self.explore_stop = config['explore_stop']
        self.decay_rate = config['decay_rate']
        self.decay_step = config['decay_step']
        self.total_episodes = config['total_episodes']
        self.max_steps = config['max_steps']

        self.env = retro.make(game=game)
        self.memory = Memory(max_size=self.memory_size)

        self.action_size = self.env.action_space.n
        self.state_size = [38, 42, 4]
        self.possible_actions = list(
            itertools.product((0, 1), repeat=self.action_size))
        self.action_size = len(self.possible_actions)

        tf.reset_default_graph()
        self.myNN = MyNN(self.action_size, self.state_size, self.learning_rate)

    def init_memory(self):
        state = self.env.reset()
        stacked_frames = deque(
            [np.zeros((38, 42), dtype=np.int) for i in range(4)], maxlen=4)
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        for i in range(self.batch_size):
            choice = random.randint(1, len(self.possible_actions)) - 1
            # action = possible_actions[choice]
            action = np.zeros(512, dtype=np.int)
            action[choice] = 1
            next_state, reward, done, _ = self.env.step(action)
            next_state, stacked_frames = stack_frames(stacked_frames,
                                                      next_state, False)
            self.memory.add((state, action, reward, next_state, done))
            state = next_state

    def train(self, render=False):
        self.init_memory()
        state = self.env.reset()
        stacked_frames = deque(
            [np.zeros((38, 42), dtype=np.int) for i in range(4)], maxlen=4)
        state, stacked_frames = stack_frames(stacked_frames, state, True)

        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(tf.global_variables_initializer())

            total_rewards = 0
            episode = 0
            for episode in range(self.total_episodes):
                step = 0
                state = self.env.reset()
                state, stacked_frames = stack_frames(stacked_frames, state,
                                                     True)
                # episode += 1
                while step < self.max_steps:
                    a = datetime.now()
                    exp_exp_tradeoff = np.random.rand()
                    explore_probability = self.explore_stop + (
                        self.explore_start - self.explore_stop) * np.exp(
                            -self.decay_rate * self.decay_step)

                    if (explore_probability > exp_exp_tradeoff):
                        choice = random.randint(1, len(
                            self.possible_actions)) - 1
                        action = self.possible_actions[choice]
                    else:
                        Qs = session.run(self.myNN.output,
                                         feed_dict={
                                             self.myNN.input:
                                             state.reshape((1, *state.shape))
                                         })
                        choice = np.argmax(Qs)
                        action = self.possible_actions[choice]

                    batch = self.memory.sample(self.batch_size)
                    target_Qs_batch = []
                    memory_states = []
                    memory_actions = []
                    memory_rewards = []
                    memory_next_states = []
                    memory_dones = []
                    for m in batch:
                        memory_states.append(m[0])
                        memory_actions.append(m[1])
                        memory_rewards.append(m[2])
                        memory_next_states.append(m[3])
                        memory_dones.append(m[4])

                    nextQs = session.run(
                        self.myNN.output,
                        feed_dict={self.myNN.input: memory_next_states})
                    for i in range(0, self.batch_size):
                        if batch[i][4]:
                            target_Qs_batch.append(batch[i][2])
                        else:
                            target_Qs_batch.append(batch[i][2] + self.gamma *
                                                   np.max(nextQs[i]))
                    target_Qs_batch = np.array(
                        [each for each in target_Qs_batch])

                    loss, _ = session.run(
                        [self.myNN.loss, self.myNN.optimizer],
                        feed_dict={
                            self.myNN.input: memory_states,
                            self.myNN.target_Q: target_Qs_batch,
                            self.myNN.actions: memory_actions
                        })

                    next_state, reward, done, _ = self.env.step(action)
                    total_rewards += reward
                    next_state, stacked_frames = stack_frames(
                        stacked_frames, next_state, False)

                    if (render):
                        self.env.render()

                    current_action = action
                    action = np.zeros(512, dtype=np.int)
                    action[choice] = 1
                    self.memory.add((state, action, reward, next_state, done))
                    if done:
                        next_state = np.zeros((38, 42), dtype=np.int)
                        next_state, stacked_frames = stack_frames(
                            stacked_frames, next_state, False)
                        self.memory.add(
                            (state, action, reward, next_state, done))
                        break

                    self.decay_step += 1
                    step += 1
                    state = next_state
                    b = datetime.now()
                    os.system('clear')
                    print("episode: ")
                    print(episode)
                    print("step: ")
                    print(step)
                    print("action: ")
                    print(current_action)
                    print("total_rewards: ")
                    print(total_rewards)
                    print("loss: ")
                    print(loss)
                    print("decay_step: ")
                    print(self.decay_step)
                    print("explore_probability: ")
                    print(explore_probability)
                    print("step time (seconds): ")
                    print((b - a).total_seconds())

                if episode % 5 == 0:
                    save_path = saver.save(session, "./models/model.ckpt")
                    print("Model Saved")

    def play(self, model_path=None):
        with tf.Session() as sess:
            total_test_rewards = []

            saver = tf.train.Saver()
            # Load the model
            if model_path is None:
                saver.restore(sess, "./models/model.ckpt")
            else:
                saver.restore(sess, model_path)

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                stacked_frames = deque(
                    [np.zeros((38, 42), dtype=np.int) for i in range(4)],
                    maxlen=4)
                state, stacked_frames = stack_frames(stacked_frames, state,
                                                     True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    state = state.reshape((1, *self.state_size))
                    Qs = sess.run(self.myNN.output,
                                  feed_dict={self.myNN.input: state})

                    choice = np.argmax(Qs)
                    action = self.possible_actions[choice]

                    next_state, reward, done, _ = self.env.step(action)
                    self.env.render()

                    total_rewards += reward

                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break

                    next_state, stacked_frames = stack_frames(
                        stacked_frames, next_state, False)
                    state = next_state

            self.env.close()
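The 38x42x4 states above come from stacking the four most recent preprocessed frames. A tiny sketch of that idea with a fixed-length deque; the shapes mirror the ones used above, while the real stack_frames helper is defined elsewhere in the project:

from collections import deque

import numpy as np

stacked_frames = deque([np.zeros((38, 42)) for _ in range(4)], maxlen=4)
new_frame = np.ones((38, 42))
stacked_frames.append(new_frame)          # maxlen=4 drops the oldest frame automatically
state = np.stack(stacked_frames, axis=2)  # newest frame last
print(state.shape)                        # (38, 42, 4)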
コード例 #23
0
ファイル: agent.py プロジェクト: wxw0/rl
class DQN(object):
    def __init__(self, config, sess):
        self.cf = config
        self.sess = sess
        self.env = Env(self.cf)
        self.eval_env = Env(self.cf)

        self.mainQnet = mainQnet(self.cf,
                                 action_n=self.env.action_n,
                                 scope='mainQnet')
        self.targetQnet = Qnet(self.cf,
                               action_n=self.env.action_n,
                               scope='targetQnet')

        main_vars = tf.trainable_variables('mainQnet')
        target_vars = tf.trainable_variables('targetQnet')
        self.update_targetQnet_ops = []
        for v, tv in zip(main_vars, target_vars):
            self.update_targetQnet_ops.append(tv.assign(v))

        self.model_dir = self.cf.model_dir
        self.saver = tf.train.Saver(max_to_keep=1)

    def predict_a(self, state):
        net = self.mainQnet
        a, Qout = self.sess.run([net.predict, net.Qout], {net.input: state})
        # print('predict_a:', a, Qout)
        return a

    def train_mainQnet(self, step):
        pre_state, action, reward, done, post_state = self.memory.sample()
        targetQout = self.sess.run(self.targetQnet.Qout,
                                   {self.targetQnet.input: post_state})
        targetQmax = np.max(targetQout, axis=1)
        # print('targetQout:', targetQout, targetQout.shape)
        # print('targetQmax:', targetQmax, targetQmax.shape)

        # print('done: ', 1. - done)
        targetQ = (1. - done) * self.cf.discount * targetQmax + reward
        # print('targetQ: ', targetQ, targetQ.shape)

        net = self.mainQnet
        run_ops = [net.trainer, net.grad_norm, net.q_loss]
        results = self.sess.run(run_ops, {
            net.input: pre_state,
            net.action: action,
            net.targetQ: targetQ
        })

        # if results[1] > self.cf.max_grad_norm:
        # if True:
        #     print(*results[1:])

        for i in results[1:]:
            assert not np.isnan(i)
        self.mgn_avg.append(results[1])
        self.q_loss_avg.append(results[2])

    def update_targetQnet(self):
        # print('update_targetQnet...\n')
        self.sess.run(self.update_targetQnet_ops)

    def get_action(self, step):
        if step < self.cf.memory_start_size:
            return self.env.sample_action()

        if self.explore > self.cf.final_explore:
            self.explore -= self.explore_descend
        else:
            self.explore = self.cf.final_explore

        if random.random() > self.explore:
            action = self.predict_a(self.env.recent_states)[0]
            # print('predict_a:', action)
        else:
            action = self.env.sample_action()
            # print('sample_action:', action)
        return action

    def learn(self):
        self.memory = Memory(self.cf)
        self.explore = self.cf.init_explore
        self.explore_descend = (self.cf.init_explore - self.cf.final_explore
                                ) / self.cf.final_explore_step

        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        self.summary_dir = self.cf.summary_dir

        self.summary_write = tf.summary.FileWriter(self.summary_dir,
                                                   self.sess.graph)
        self.summary_write.flush()

        self.episode_r_summary = tf.Variable(0., trainable=False)
        er_op = tf.summary.scalar('r/episode_r_avg', self.episode_r_summary)
        self.eval_episode_r_summary = tf.Variable(0., trainable=False)
        eer_op = tf.summary.scalar('r/evaluate_episode_r_avg',
                                   self.eval_episode_r_summary)
        self.q_loss_summary = tf.Variable(0., trainable=False)
        ql_op = tf.summary.scalar('loss/q_loss_avg', self.q_loss_summary)
        self.mgn_summary = tf.Variable(0., trainable=False)
        mgn_op = tf.summary.scalar('loss/mgn_avg', self.mgn_summary)

        self.summary_op = tf.summary.merge([er_op, eer_op, ql_op, mgn_op])

        self.q_loss_avg = []
        self.mgn_avg = []

        print('\nLearning...\n')
        self.update_targetQnet()
        step = 0
        state = self.env.reset()
        done = False
        episode = 1
        episode_step = 0
        episode_reward = 0
        episodes_average = []

        best_score = -9999.
        while step < self.cf.total_step:
            step += 1
            episode_step += 1

            action = self.get_action(step)
            state_1, reward, done = self.env.act(action)
            # print('reward: ', reward, done)
            self.memory.add(state, action, np.sign(reward), done)

            episode_reward += reward
            state = state_1
            if done:
                if self.env.real_done:
                    episodes_average.append(episode_reward)
                    episode += 1
                    episode_step = 0
                    episode_reward = 0
                # self.memory.add(state, 0, 0, False)
                state = self.env.reset()
                done = False

            if step > self.cf.memory_start_size:
                if step % self.cf.train_frequency == 0:
                    self.train_mainQnet(step)
                if step % self.cf.update_frequency == 0:
                    self.update_targetQnet()
                if step % self.cf.evaluate_every_step == 0:

                    episode_r = np.array(episodes_average)
                    q_l_a = np.array(self.q_loss_avg)
                    mgn_a = np.array(self.mgn_avg)

                    eval_episode_r = self.evaluate()
                    summary_op = self.sess.run(
                        self.summary_op, {
                            self.episode_r_summary: episode_r.mean(),
                            self.eval_episode_r_summary: eval_episode_r.mean(),
                            self.mgn_summary: mgn_a.mean(),
                            self.q_loss_summary: q_l_a.mean()
                        })
                    self.summary_write.add_summary(summary_op,
                                                   global_step=step)
                    self.summary_write.flush()

                    episodes_average = []
                    self.q_loss_avg = []
                    self.mgn_avg = []

                    with open(self.summary_dir + 'r.csv', 'a') as f:
                        r_csv = str(time.time()) + ',' + str(step) + ',' + str(episode_r.mean()) + ',' + str(episode_r.std()) +\
                                ',' + str(eval_episode_r.mean()) + ',' + str(eval_episode_r.std()) +\
                                ',' + str(mgn_a.mean()) + ',' + str(mgn_a.std()) + \
                                ',' + str(q_l_a.mean()) + ',' + str(q_l_a.std()) + '\n'
                        print(r_csv)
                        f.write(r_csv)

                    if eval_episode_r.mean() > best_score:
                        best_score = eval_episode_r.mean()
                        self.saver.save(self.sess, self.model_dir + str(step))
        self.env.close()

    def get_action_for_evaluate(self):
        if random.random() > self.cf.evaluate_explore:
            # predict_a returns a batch of actions; take the single one
            action = self.predict_a(self.eval_env.recent_states)[0]
        else:
            action = self.eval_env.sample_action()
        return action

    def evaluate(self, load_model=False):

        if load_model:
            print('\nEvaluate...')
            print('Loading Model...' + self.model_dir + '\n')
            ckpt_state = tf.train.get_checkpoint_state(self.model_dir)
            print('ckpt_state: ', ckpt_state.model_checkpoint_path)
            self.saver.restore(self.sess, ckpt_state.model_checkpoint_path)

        self.eval_env.reset()
        episode_step = 0
        episode_reward = 0
        episodes_average = []

        while len(episodes_average) < self.cf.evaluate_episodes:
            episode_step += 1
            action = self.get_action_for_evaluate()
            state, reward, done = self.eval_env.act(action, is_training=False)
            episode_reward += reward

            if done or episode_step > self.cf.evaluate_episode_step:
                # print('evaluate episode_step: ', episode_step)
                self.eval_env.real_done = True
                episodes_average.append(episode_reward)
                episode_step = 0
                episode_reward = 0
                self.eval_env.reset()

        e_a = np.array(episodes_average)
        # print('evaluate: ', 'episodes: ', e_a.size, 'average: ', e_a.mean(), 'std: ', e_a.std())
        return e_a
コード例 #24
0
class Agent:
    def __init__(self):
        self.model, self.target = DQN(), DQN()
        if USE_CUDA:
            self.model.cuda()
            self.target.cuda()

        self.exp_buffer = Memory()
        self.exp_number = 0  # size of exp buffer so far
        self.param_updates = 0  # track how many times params updated

        self.opt = torch.optim.RMSprop(self.model.parameters(),
                                       lr=LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()

    # Make an action given a state
    def act(self, state, explore=True):
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(NUM_ACTIONS)
        else:
            # Send state to model
            a_vec = self.model(state)
            a = int(torch.argmax(torch.squeeze(a_vec)))

        return a

    # clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = Memory()
        self.exp_number = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        self.exp_buffer.add(exp)
        self.exp_number += 1

    # Replay gets batch and trains on it
    def replay(self, batch_size):
        # If the experience buffer isn't big enough yet, don't do anything
        if self.exp_number < MIN_BUFFER_SIZE: return 0.0
        # Get batch from experience_buffer
        batch = self.exp_buffer.get_batch(batch_size)

        s, a, r, s_new, _ = zip(*batch)
        s_new = s_new[:-1]  # Remove last item (it is 'None')
        # First turn batch into something we can run through model
        s = torch.cat(s)
        a = torch.LongTensor(a).unsqueeze(1)
        r = torch.FloatTensor(r).unsqueeze(1)
        s_new = torch.cat(s_new)

        #print(a.shape,r.shape, s.shape, s_new.shape)
        if USE_CUDA:
            a = a.cuda()
            r = r.cuda()

        # Get q vals for s (what model outputted) from a
        # .gather gets us q value for specific action a
        pred_q_vals = self.model(s).gather(1, a)

        # Having chosen a in s,
        # What is the highest possible reward we can get from s_new?
        # We add q of performing a in s then add best q from next state
        # cat 0 to end for the terminal state
        s_new_q_vals = self.target(s_new).max(1)[0]
        zero = torch.zeros(1)  # Q-value of the terminal state is 0
        if USE_CUDA: zero = zero.cuda()

        s_new_q_vals = torch.cat((s_new_q_vals, zero))
        exp_q_vals = r + s_new_q_vals.unsqueeze(1) * GAMMA  # match the (N, 1) shape of pred_q_vals

        myloss = self.loss(pred_q_vals, exp_q_vals)
        self.opt.zero_grad()
        myloss.backward()

        if WEIGHT_CLIPPING:
            # Clamp gradients (not weights) to [-1, 1] before the optimizer step
            # to avoid exploding gradients
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)

        self.opt.step()

        if self.param_updates % TARGET_UPDATE_INTERVAL == 0:
            self.target.load_state_dict(self.model.state_dict())

        self.param_updates += 1

        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY

        return myloss.item()
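replay() refreshes the target network with a hard copy of the online weights every TARGET_UPDATE_INTERVAL parameter updates. A stripped-down sketch of that pattern with toy networks; the interval value here is an illustrative assumption:

import torch.nn as nn

TARGET_UPDATE_INTERVAL = 1000  # assumed value for the sketch
model, target = nn.Linear(4, 2), nn.Linear(4, 2)

for param_updates in range(3000):
    # ... one gradient step on `model` would happen here ...
    if param_updates % TARGET_UPDATE_INTERVAL == 0:
        target.load_state_dict(model.state_dict())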
コード例 #25
0
ファイル: agent.py プロジェクト: selfishhari/skunkworks
class Agent:
    def __init__(self,
                 n_states,
                 n_actions,
                 n_goals,
                 action_bounds,
                 capacity,
                 env,
                 k_future,
                 batch_size,
                 action_size=1,
                 tau=0.05,
                 actor_lr=1e-3,
                 critic_lr=1e-3,
                 gamma=0.98):
        self.device = device("cpu")
        self.n_states = n_states
        self.n_actions = n_actions
        self.n_goals = n_goals
        self.k_future = k_future
        self.action_bounds = action_bounds
        self.action_size = action_size
        self.env = env

        self.actor = Actor(self.n_states,
                           n_actions=self.n_actions,
                           n_goals=self.n_goals).to(self.device)
        self.critic = Critic(self.n_states,
                             action_size=self.action_size,
                             n_goals=self.n_goals).to(self.device)
        self.sync_networks(self.actor)
        self.sync_networks(self.critic)
        self.actor_target = Actor(self.n_states,
                                  n_actions=self.n_actions,
                                  n_goals=self.n_goals).to(self.device)
        self.critic_target = Critic(self.n_states,
                                    action_size=self.action_size,
                                    n_goals=self.n_goals).to(self.device)
        self.init_target_networks()
        self.tau = tau
        self.gamma = gamma

        self.capacity = capacity
        self.memory = Memory(self.capacity, self.k_future, self.env)

        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_optim = Adam(self.actor.parameters(), self.actor_lr)
        self.critic_optim = Adam(self.critic.parameters(), self.critic_lr)

        self.state_normalizer = Normalizer(self.n_states[0],
                                           default_clip_range=5)
        self.goal_normalizer = Normalizer(self.n_goals, default_clip_range=5)

    def choose_action(self, state, goal, train_mode=True):
        #takes the state and goal, concatenates them and passes the result to the actor network
        #the actor returns an action, to which exploration noise is added before returning
        state = self.state_normalizer.normalize(state)
        goal = self.goal_normalizer.normalize(goal)
        state = np.expand_dims(state, axis=0)
        goal = np.expand_dims(goal, axis=0)

        with torch.no_grad():
            x = np.concatenate([state, goal], axis=1)
            x = from_numpy(x).float().to(self.device)
            action = self.actor(x)[0].cpu().data.numpy()

        if train_mode:
            action += 0.2 * np.random.randn(self.n_actions)
            action = np.clip(action, self.action_bounds[0],
                             self.action_bounds[1])

            random_actions = np.random.uniform(low=self.action_bounds[0],
                                               high=self.action_bounds[1],
                                               size=self.n_actions)
            action += np.random.binomial(1, 0.3,
                                         1)[0] * (random_actions - action)

        return action

    def store(self, mini_batch):
        for batch in mini_batch:
            self.memory.add(batch)
        self._update_normalizer(mini_batch)

    def init_target_networks(self):
        self.hard_update_networks(self.actor, self.actor_target)
        self.hard_update_networks(self.critic, self.critic_target)

    @staticmethod
    def hard_update_networks(local_model, target_model):
        target_model.load_state_dict(local_model.state_dict())

    @staticmethod
    def soft_update_networks(local_model, target_model, tau=0.05):
        for t_params, e_params in zip(target_model.parameters(),
                                      local_model.parameters()):
            t_params.data.copy_(tau * e_params.data +
                                (1 - tau) * t_params.data)

    def train(self):
        states, actions, rewards, next_states, goals = self.memory.sample(
            self.batch_size)

        states = self.state_normalizer.normalize(states)
        next_states = self.state_normalizer.normalize(next_states)
        goals = self.goal_normalizer.normalize(goals)
        inputs = np.concatenate([states, goals], axis=1)
        next_inputs = np.concatenate([next_states, goals], axis=1)

        inputs = torch.Tensor(inputs).to(self.device)
        rewards = torch.Tensor(rewards).to(self.device)
        next_inputs = torch.Tensor(next_inputs).to(self.device)
        actions = torch.Tensor(actions).to(self.device)

        with torch.no_grad():
            #Q-value of the target actor's action in the next state
            target_q = self.critic_target(next_inputs,
                                          self.actor_target(next_inputs))
            #Bellman target: reward + gamma * Q_target(next_state, actor_target(next_state))
            target_returns = rewards + self.gamma * target_q.detach()
            target_returns = torch.clamp(target_returns, -1 / (1 - self.gamma),
                                         0)

        #use critic to generate actual Q for (initial states and actions)
        q_eval = self.critic(inputs, actions)
        critic_loss = (target_returns - q_eval).pow(2).mean()

        a = self.actor(inputs)
        actor_loss = -self.critic(inputs, a).mean()
        actor_loss += a.pow(2).mean()

        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.sync_grads(self.actor)
        self.actor_optim.step()

        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.sync_grads(self.critic)
        self.critic_optim.step()

        return actor_loss.item(), critic_loss.item()

    def save_weights(self):
        torch.save(
            {
                "actor_state_dict": self.actor.state_dict(),
                "state_normalizer_mean": self.state_normalizer.mean,
                "state_normalizer_std": self.state_normalizer.std,
                "goal_normalizer_mean": self.goal_normalizer.mean,
                "goal_normalizer_std": self.goal_normalizer.std
            }, "NBM_FetchPickAndPlace_v2.pth")

    def load_weights(self):

        checkpoint = torch.load("NBM_FetchPickAndPlace_v2.pth")
        actor_state_dict = checkpoint["actor_state_dict"]
        self.actor.load_state_dict(actor_state_dict)
        state_normalizer_mean = checkpoint["state_normalizer_mean"]
        self.state_normalizer.mean = state_normalizer_mean
        state_normalizer_std = checkpoint["state_normalizer_std"]
        self.state_normalizer.std = state_normalizer_std
        goal_normalizer_mean = checkpoint["goal_normalizer_mean"]
        self.goal_normalizer.mean = goal_normalizer_mean
        goal_normalizer_std = checkpoint["goal_normalizer_std"]
        self.goal_normalizer.std = goal_normalizer_std

    def set_to_eval_mode(self):
        self.actor.eval()
        # self.critic.eval()

    def update_networks(self):
        self.soft_update_networks(self.actor, self.actor_target, self.tau)
        self.soft_update_networks(self.critic, self.critic_target, self.tau)

    def _update_normalizer(self, mini_batch):
        states, goals = self.memory.sample_for_normalization(mini_batch)

        self.state_normalizer.update(states)
        self.goal_normalizer.update(goals)
        self.state_normalizer.recompute_stats()
        self.goal_normalizer.recompute_stats()

    @staticmethod
    def sync_networks(network):
        comm = MPI.COMM_WORLD
        flat_params = _get_flat_params_or_grads(network, mode='params')
        comm.Bcast(flat_params, root=0)
        _set_flat_params_or_grads(network, flat_params, mode='params')

    @staticmethod
    def sync_grads(network):
        flat_grads = _get_flat_params_or_grads(network, mode='grads')
        comm = MPI.COMM_WORLD
        global_grads = np.zeros_like(flat_grads)
        comm.Allreduce(flat_grads, global_grads, op=MPI.SUM)
        _set_flat_params_or_grads(network, global_grads, mode='grads')
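The clamp of target_returns to [−1/(1 − γ), 0] in train() follows from the reward scale: if every per-step reward lies in [−1, 0], the discounted return Σ γ^t r_t cannot fall below −1/(1 − γ). A quick numeric illustration; the reward range is an assumption typical of this sparse-reward setup:

import torch

gamma = 0.98
lower_bound = -1.0 / (1.0 - gamma)                    # roughly -50.0 for gamma = 0.98
target_returns = torch.tensor([-120.0, -3.0, 5.0])
print(torch.clamp(target_returns, lower_bound, 0.0))  # roughly tensor([-50., -3., 0.])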
コード例 #26
0
ファイル: train.py プロジェクト: diglabsityler/space-invaders
    choice = random.randint(1, len(possible_actions)) - 1
    action = possible_actions[choice]
    next_state, reward, done, _ = env.step(action)

    # env.render()

    # Stack the frames
    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False, stack_size)

    # If the episode is finished (we're dead 3x)
    if done:
        # We finished the episode
        next_state = np.zeros(state.shape)

        # Add experience to memory
        memory.add((state, action, reward, next_state, done))

        # Start a new episode
        state = env.reset()

        # Stack the frames
        state, stacked_frames = stack_frames(stacked_frames, state, True, stack_size)

    else:
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))

        # Our new state is now the next_state
        state = next_state

コード例 #27
0
class Agent:
    def __init__(self,
                 input_dim,
                 n_actions,
                 lr=0.00025,
                 eps=1.0,
                 memory=150000):
        self.n_states = input_dim
        self.n_actions = n_actions
        self.eps = eps
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.MIN_EPS = 0.1
        self.model = self.build_model(input_dim, n_actions, lr)
        self.memory = Memory(memory)
        self.zeros = np.zeros(self.n_states)

    def build_model(self, input_dim, n_actions, lr):
        model = Sequential()
        # Only the first layer needs input_dim; later layers infer their input shape
        model.add(Dense(input_dim=input_dim, units=256, activation='relu'))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(units=2048, activation='relu'))
        model.add(Dropout(0.4))
        model.add(Dense(units=48, activation='relu'))
        model.add(Dense(units=n_actions, activation='linear'))
        optimizer = RMSprop(lr=lr)
        model.compile(loss='mse', optimizer=optimizer)

        return model

    def train(self, x, y, verbose=0):
        self.model.fit(x, y, verbose=verbose, batch_size=64)

    def predict(self, state):
        return self.model.predict(state)

    def predict_single(self, state):
        q_val = self.predict(state.reshape(1, self.n_states))
        if random.random() > self.eps:
            return [np.argmax(q_val.flatten()), q_val]
        else:
            return [random.randint(0, self.n_actions - 1), q_val]

    def save(self, state, next_state, action, reward):
        self.memory.add(np.array(state, dtype=np.uint8), np.array(next_state),
                        int(action), int(reward))

    def replay(self):
        batch = self.memory.sample(self.BATCH_SIZE)

        x = np.empty(0).reshape(0, self.n_states)
        y = np.empty(0).reshape(0, self.n_actions)

        none_list = [None for itr in range(0, self.n_states)]

        next_state = np.array([
            (self.zeros if np.array_equal(state, none_list) else state)
            for state in batch[:, self.n_states:2 * self.n_states]
        ])
        state = batch[:, :self.n_states]

        target_Q = self.predict(next_state)
        Q_value = self.predict(state)

        for indx, element in enumerate(batch):
            state = element[:self.n_states]
            next_state = element[self.n_states:2 * self.n_states]
            action = int(element[2 * self.n_states])
            reward = element[2 * self.n_states + 1]
            q_val = Q_value[indx]

            if np.array_equal(next_state, none_list):
                q_val[action] = reward
            else:
                q_val[action] = reward + self.GAMMA * np.amax(
                    np.array(target_Q[indx]))

            y = np.vstack([y, q_val])
            x = np.vstack([x, state])
        self.train(x, y)

    def decay(self):
        if self.eps > self.MIN_EPS:
            self.eps = self.eps * 0.99
コード例 #28
0
ファイル: agent.py プロジェクト: eiganken/Models
class Agent:
    """
	エージェントクラス

	Attributes
	----------
	brain : brain
	memory : memory
	replay_size : int
		経験再生時に取り出す経験データの数
	last_state : ndarray
	last_action : int
	episode : int
		動的に追加する. それぞれ状態,行動,エピソードの番号を保存
	"""
    def __init__(self, state_size, action_size, replay_size=32):
        """
		state_size : int
			状態空間の次元数
		action_size : int
			行動空間の次元数
		"""
        self.brain = Brain(state_size, action_size)
        self.memory = Memory()
        self.replay_size = replay_size

    def get_action(self, state, episode, optimal=False):
        """
		行動を決定する.

		Parameters
		----------
		state : list
			状態ベクトル.
		"""
        # Act according to the epsilon-greedy policy
        if np.random.rand() < 0.001 + 0.9 / (1.0 + episode):
            action = np.random.randint(self.brain.action_size)
        else:
            # Get the Q-values
            q_values = self.brain.get_q_values(state)
            action = np.argmax(q_values)
        # Store the state and the action
        self.last_state = state
        self.last_action = action
        self.episode = episode

        return action

    def learn(self, reward, next_state, done):
        """
		学習を行う.

		Parameters
		----------
		reward : たぶんint
			報酬
		state : array
			状態ベクトル
		done : bool
			終端状態かどうか
		"""
        # Store the experience in memory
        experience = (self.last_state, self.last_action, reward, next_state,
                      done)
        self.memory.add(experience)
        # Experience replay
        if self.memory.is_able_fit():
            experiences = self.memory.get_sample()
            self.brain.replay(self.episode, experiences, self.replay_size)
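get_action() explores with probability 0.001 + 0.9 / (1 + episode), so exploration starts near 0.9 on the first episode and decays toward 0.001 as episodes accumulate. A quick check of that schedule:

def exploration_rate(episode):
    return 0.001 + 0.9 / (1.0 + episode)

print([round(exploration_rate(e), 3) for e in (0, 9, 99)])  # [0.901, 0.091, 0.01]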
コード例 #29
0
ファイル: agent.py プロジェクト: stevenliang16/Montezuma
class Agent:

    def __init__(self, net, actionSet, goalSet, metaEpsilon=defaultMetaEpsilon, epsilon=defaultEpsilon,
                 controllerEpsilon=defaultControllerEpsilon, tau=defaultTau):
        self.actionSet = actionSet
        self.controllerEpsilon = controllerEpsilon
        self.goalSet = goalSet
        self.metaEpsilon = metaEpsilon
        self.nSamples = defaultNSample 
        self.metaNSamples = defaultMetaNSamples 
        self.gamma = defaultGamma
        self.targetTau = tau
        self.net = net
        self.memory = Memory(controllerMemCap)
        self.metaMemory = Memory(metaMemCap)

    def selectMove(self, state, goal):
        goalVec = utils.oneHot(goal)
        if self.controllerEpsilon[goal] < random.random():
            # predict action
            dummyYtrue = np.zeros((1, 8))
            dummyMask = np.zeros((1, 8))
            return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 4)), np.asarray([goalVec]), dummyYtrue, dummyMask], verbose=0)[1])
        return random.choice(self.actionSet)

    def setControllerEpsilon(self, epsilonArr):
        self.controllerEpsilon = epsilonArr

    def selectGoal(self, state):
        if self.metaEpsilon < random.random():
            # predict action
            pred = self.net.metaNet.predict([np.reshape(state, (1, 84, 84, 4)), np.zeros((1,3)), np.zeros((1,3))], verbose=0)[1]
            return np.argmax(pred)
        return random.choice(self.goalSet)

    def selectTrueGoal(self, goalNum):
        return trueSubgoalOrder[goalNum]

    def setMetaEpsilon(self, epsilon):
        self.metaEpsilon = epsilon

    def criticize(self, reachGoal, action, die, distanceReward, useSparseReward):
        reward = 0.0
        if reachGoal:
            reward += 50.0
        # if die:
        #     reward -= 200.0
        if not useSparseReward:
            # if action == 0:
            #     reward -= 0.1
            reward += distanceReward
        reward = np.minimum(reward, maxReward)
        reward = np.maximum(reward, minReward)
        return reward

    def store(self, experience, meta=False):
        if meta:
            self.metaMemory.add(np.abs(experience.reward), experience)
        else:
            self.memory.add(np.abs(experience.reward), experience)

    
    def _update(self, stepCount):
        batches = self.memory.sample(self.nSamples)
        stateVector = []
        goalVector = []
        for batch in batches:
            exp = batch[1]
            stateVector.append(exp.state)
            goalVector.append(utils.oneHot(exp.goal))
        stateVector = np.asarray(stateVector)
        goalVector = np.asarray(goalVector)
        nextStateVector = []
        for batch in batches:
            exp = batch[1]
            nextStateVector.append(exp.next_state)
        nextStateVector = np.asarray(nextStateVector)
        rewardVectors = self.net.controllerNet.predict([stateVector, goalVector, np.zeros((self.nSamples,8)), np.zeros((self.nSamples, 8 ))], verbose=0)[1]
        
        rewardVectorsCopy = np.copy(rewardVectors)
        rewardVectors = np.zeros((self.nSamples, 8))
        nextStateRewardVectors = self.net.targetControllerNet.predict([nextStateVector, goalVector, np.zeros((self.nSamples,8)), np.zeros((self.nSamples, 8 ))], verbose=0)[1]
        
        maskVector = np.zeros((self.nSamples, 8))
        for i, batch in enumerate(batches):
            exp = batch[1]
            idx = batch[0]
            maskVector[i, exp.action] = 1. 
            rewardVectors[i][exp.action] = exp.reward
            if not exp.done:
                rewardVectors[i][exp.action] += self.gamma * max(nextStateRewardVectors[i])
            self.memory.update(idx, np.abs(rewardVectors[i][exp.action] - rewardVectorsCopy[i][exp.action]))
        rewardVectors = np.asarray(rewardVectors)
        loss = self.net.controllerNet.train_on_batch([stateVector, goalVector, rewardVectors, maskVector], [np.zeros(self.nSamples), rewardVectors])
        #Update target network
        controllerWeights = self.net.controllerNet.get_weights()
        controllerTargetWeights = self.net.targetControllerNet.get_weights()
        for i in range(len(controllerWeights)):
            controllerTargetWeights[i] = self.targetTau * controllerWeights[i] + (1 - self.targetTau) * controllerTargetWeights[i]
        self.net.targetControllerNet.set_weights(controllerTargetWeights)
        return loss
        

    def _update_meta(self, stepCount):
        batches = self.metaMemory.sample(self.metaNSamples)
        stateVectors = np.asarray([batch[1].state for batch in batches])
        nextStateVectors = np.asarray([batch[1].next_state for batch in batches])
        
        rewardVectors = self.net.metaNet.predict([stateVectors, np.zeros((self.metaNSamples, 3)), np.zeros((self.metaNSamples, 3))], verbose=0)[1]
        rewardVectorsCopy = np.copy(rewardVectors)
        rewardVectors = np.zeros((self.metaNSamples, 3))
        nextStateRewardVectors = self.net.targetMetaNet.predict([nextStateVectors, np.zeros((self.metaNSamples, 3)), np.zeros((self.metaNSamples, 3))], verbose=0)[1]
        maskVector = np.zeros((self.metaNSamples, 3))
        
        for i, batch in enumerate(batches):
            exp = batch[1]
            idx = batch[0]
            maskVector[i, exp.goal] = 1. 
            rewardVectors[i][exp.goal] = exp.reward
            if not exp.done:
                rewardVectors[i][exp.goal] += self.gamma * max(nextStateRewardVectors[i])
            self.metaMemory.update(idx, np.abs(rewardVectors[i][exp.goal] - rewardVectorsCopy[i][exp.goal]))
        loss = self.net.metaNet.train_on_batch([stateVectors, rewardVectors, maskVector], [np.zeros(self.metaNSamples), rewardVectors])
        
        #Update target network
        metaWeights = self.net.metaNet.get_weights()
        metaTargetWeights = self.net.targetMetaNet.get_weights()
        for i in range(len(metaWeights)):
            metaTargetWeights[i] = self.targetTau * metaWeights[i] + (1 - self.targetTau) * metaTargetWeights[i]
        self.net.targetMetaNet.set_weights(metaTargetWeights)
        return loss
    def update(self, stepCount, meta=False):
        if meta:
            loss = self._update_meta(stepCount)
        else:
            loss = self._update(stepCount)
        return loss

    def annealMetaEpsilon(self, stepCount):
        self.metaEpsilon = defaultEndEpsilon + max(0, (defaultMetaEpsilon - defaultEndEpsilon) * \
            (defaultAnnealSteps - max(0, stepCount - defaultRandomPlaySteps)) / defaultAnnealSteps)

    def annealControllerEpsilon(self, stepCount, goal):
        self.controllerEpsilon[goal] = defaultEndEpsilon + max(0, (defaultControllerEpsilon[goal] - defaultEndEpsilon) * \
            (defaultAnnealSteps - max(0, stepCount - defaultRandomPlaySteps)) / defaultAnnealSteps)
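Both anneal methods interpolate linearly from the starting epsilon down to defaultEndEpsilon over defaultAnnealSteps steps, and only begin annealing once defaultRandomPlaySteps random-play steps have elapsed. A standalone sketch of that schedule; the constant values below are illustrative, not the ones defined in the original module:

def anneal_epsilon(step, start_eps=1.0, end_eps=0.1,
                   anneal_steps=50000, random_play_steps=10000):
    remaining = anneal_steps - max(0, step - random_play_steps)
    return end_eps + max(0, (start_eps - end_eps) * remaining / anneal_steps)

print(anneal_epsilon(0))        # 1.0  (still in random play)
print(anneal_epsilon(35000))    # ~0.55 (halfway through the annealing window)
print(anneal_epsilon(100000))   # 0.1  (fully annealed)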
コード例 #30
0
ファイル: main.py プロジェクト: sidkothiyal/doom_dq_learning
def main():
    global frame_size, stack_size

    state_size = list(frame_size)
    state_size.append(stack_size)

    game = Doom()
    no_actions = len(game.actions)

    learning_rate = 0.002
    no_episodes = 500
    max_steps = 100
    batch_size = 32

    explore_max = 1.
    explore_min = 0.01
    decay_rate = 0.00001

    gamma = 0.95

    pretrain_length = batch_size
    memory_size = 1000000

    training = True

    episode_render = True

    tf.reset_default_graph()

    deep_Q_network = DeepQNetwork(state_size, no_actions, learning_rate)

    memory = Memory(max_size=memory_size)
    game.start_game()
    game.restart_episode()

    for i in range(pretrain_length):
        if i == 0:
            img, game_vars = game.get_environment_state()
            state = frame_stacking(img, True)

        action = random.choice(game.actions)

        reward = game.take_action(action)

        done = game.is_episode_finished()

        if done:
            next_state = np.zeros(state.shape)
            memory.add((state, action, reward, next_state, done))

            game.restart_episode()
            img, game_vars = game.get_environment_state()
            state = frame_stacking(img, True)

        else:
            next_img, next_game_vars = game.get_environment_state()
            next_state = frame_stacking(next_img, False)

            memory.add((state, action, reward, next_state, done))

            state = next_state

    writer = tf.summary.FileWriter("./tensorboard/dqn/1")

    tf.summary.scalar("Loss", deep_Q_network.loss)

    write_op = tf.summary.merge_all()

    """Prediction """

    def predict_action(curr_decay_step, curr_state):
        exp_exp_tradeoff = np.random.rand()

        curr_explore_prob = explore_min + ((explore_max - explore_min) * np.exp(-decay_rate * curr_decay_step))

        if curr_explore_prob > exp_exp_tradeoff:
            curr_action = random.choice(game.actions)

        else:
            Qs = sess.run(deep_Q_network.output, feed_dict={deep_Q_network.inputs: curr_state.reshape((1, *curr_state.shape))})

            choice = np.argmax(Qs)
            curr_action = game.actions[choice]

        return curr_action, curr_explore_prob

    """Training Agent"""
    saver = tf.train.Saver()

    if training:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            decay_step = 0
            loss_val = 0.0  # so the episode summary can be printed before the first training step

            game.start_game()

            for episode in range(no_episodes):
                step = 0

                episode_rewards = []

                game.restart_episode()
                img, game_vars = game.get_environment_state()

                state = frame_stacking(img, True)

                while step < max_steps:
                    step += 1

                    decay_step += 1

                    action, explore_prob = predict_action(decay_step, state)

                    reward = game.take_action(action)

                    done = game.is_episode_finished()

                    episode_rewards.append(reward)

                    if done:
                        next_img = np.zeros(frame_size, dtype=np.int)
                        next_state = frame_stacking(next_img, False)

                        step = max_steps

                        total_rewards = np.sum(episode_rewards)

                        print("Episode No. {}".format(episode),
                              "Total reward: {}".format(total_rewards),
                              "Training Loss: {:.4f}".format(loss_val),
                              "Explore Prob: {:.4f}".format(explore_prob))

                        memory.add((state, action, reward, next_state, done))

                    else:
                        next_img, next_game_vars = game.get_environment_state()
                        next_state = frame_stacking(next_img, False)

                        memory.add((state, action, reward, next_state, done))

                        state = next_state

                    """Learning Part """
                    """Get mini-batches from memory and train"""
                    batch = memory.sample(batch_size)

                    states_mb = []
                    actions_mb = []
                    rewards_mb = []
                    next_states_mb = []
                    dones_mb = []

                    for each in batch:
                        states_mb.append(each[0])
                        actions_mb.append(each[1])
                        rewards_mb.append(each[2])
                        next_states_mb.append(each[3])
                        dones_mb.append(each[4])

                    states_mb = np.array(states_mb)
                    actions_mb = np.array(actions_mb)
                    rewards_mb = np.array(rewards_mb)
                    next_states_mb = np.array(next_states_mb)
                    dones_mb = np.array(dones_mb)

                    target_Qs_batch = []

                    Qs_next_state = sess.run(deep_Q_network.output, feed_dict={deep_Q_network.inputs: next_states_mb})

                    for i in range(0, len(batch)):
                        terminal = dones_mb[i]

                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])

                        else:
                            target = rewards_mb[i] + (gamma * np.max(Qs_next_state[i]))
                            target_Qs_batch.append(target)

                    targets_mb = np.array(target_Qs_batch)

                    loss_val, _ = sess.run([deep_Q_network.loss, deep_Q_network.optimizer],
                                           feed_dict={deep_Q_network.inputs: states_mb,
                                                      deep_Q_network.target_Q: targets_mb,
                                                      deep_Q_network.actions: actions_mb})

                    summary = sess.run(write_op, feed_dict={deep_Q_network.inputs: states_mb,
                                                            deep_Q_network.target_Q: targets_mb,
                                                            deep_Q_network.actions: actions_mb})

                    writer.add_summary(summary, episode)
                    writer.flush()

                if episode % 5 == 0:
                    save_path = saver.save(sess, "./models/model.ckpt")
                    print("Model Saved")
Code example #31
File: ppo.py Project: ekostadinov5/HungryGeese
def ppo_train(model_name, load_model=False, actor_filename=None, critic_filename=None, optimizer_filename=None):
    print("PPO -- Training")

    env = make('hungry_geese')
    trainer = env.train(['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = PPOAgent(rows=11, columns=11, num_actions=3)
    memory = Memory()

    if load_model:
        agent.load_model_weights(actor_filename, critic_filename)
        agent.load_optimizer_weights(optimizer_filename)

    episode = 0
    start_episode = 0
    end_episode = 50000
    reward_threshold = None
    threshold_reached = False
    epochs = 4
    batch_size = 128
    current_frame = 0

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            current_frame += 1
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_action(state, training=True)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            memory.add(state, action, reward, next_state, float(done))

            obs_dict = next_obs_dict
            prev_direction = direction

            ep_reward += reward

            if current_frame % batch_size == 0:
                for _ in range(epochs):
                    states, actions, rewards, next_states, dones = memory.get_all_samples()
                    agent.fit(states, actions, rewards, next_states, dones)
                memory.clear()
                agent.update_networks()

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) + " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if reward_threshold:
            if len(last_1000_ep_reward) == 1000:
                if np.mean(last_1000_ep_reward) >= reward_threshold:
                    print("You solved the task after" + str(episode) + "episodes")
                    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                             'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
                    threshold_reached = True
                    break

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))

            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' + str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_action(state)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                     'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(end_episode) + '.h5',
                             'models/ppo_critic_' + model_name + '_' + str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(end_episode) + '_optimizer.npy')

    if threshold_reached:
        plt.plot([i for i in range(start_episode + 1000, episode, 1000)], training_rewards)
    else:
        plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], training_rewards)
    plt.title("Reward")
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
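
The Memory used by ppo_train only needs add, get_all_samples and clear, i.e. a plain on-policy rollout buffer that is emptied after every update. A minimal sketch matching those call sites follows; the internal layout is an assumption, not the project's actual class.

import numpy as np

class Memory:
    # On-policy rollout buffer: accumulates transitions until the next PPO update, then is cleared.
    def __init__(self):
        self.states, self.actions, self.rewards, self.next_states, self.dones = [], [], [], [], []

    def add(self, state, action, reward, next_state, done):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)

    def get_all_samples(self):
        return (np.array(self.states), np.array(self.actions), np.array(self.rewards),
                np.array(self.next_states), np.array(self.dones))

    def clear(self):
        self.__init__()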
Code example #32
File: agent.py Project: blazer82/ai
class Agent:
	def __init__(self, env, model, epsilon=.9, min_epsilon=.1, epsilon_decay=1e-3):
		self.env = env
		self.model = model
		self.epsilon = epsilon
		self.min_epsilon = min_epsilon
		self.epsilon_decay = epsilon_decay
		self.episode = 0
		self.positiveMemory = Memory(model=self.model, episode_max_size=20)
		self.negativeMemory = Memory(model=self.model, episode_max_size=10)

	def play(self):
		terminal = False
		observation = self.env.reset()
		X = np.zeros((2,) + observation.shape)
		X[0] = observation
		X[1] = observation

		total_reward = 0
		while terminal == False and total_reward < 200:
			y = self.model.predict(X)
			action = np.argmax(y)

			observation, reward, terminal, info = self.env.executeAction(action)
			total_reward += reward

			X[0] = X[1]
			X[1] = observation

		return total_reward

	def learn(self, overfit=False, games=1, warmup=0, skip_frames=4):
		self.episode += 1.
		epsilon = max(self.min_epsilon, self.epsilon - self.episode * self.epsilon_decay)

		total_reward = 0
		qs = []
		predictions = None

		if warmup > 0:
			print "Adding %d warmup games"%(warmup)
			games += warmup

		for game in range(1, games + 1):
			print "Game %d/%d..."%(game, games)
			terminal = False
			observation = self.env.reset()
			framebuffer = np.zeros((skip_frames,) + observation.shape)
			framebuffer[-1] = observation
			frame = 0
			action = np.random.randint(0, 2)
			episode = []
			while terminal == False:
				frame += 1

				if frame%skip_frames != 0:
					observation, reward, terminal, info = self.env.executeAction(action)

				if frame%skip_frames == 0 or reward != 0 or terminal:
					X = framebuffer.copy()
					y = self.model.predict(X)
					qs.append(max(y))
					if predictions is None:
						predictions = np.zeros_like(y)
					predictions[np.argmax(y)] += 1

					if frame%skip_frames == 0:
						if np.random.rand() <= epsilon:
							action = np.random.randint(0, len(y))
						else:
							action = np.argmax(y)

						observation, reward, terminal, info = self.env.executeAction(action)

					total_reward += reward

					y[action] = 1. # encourage current action, for now
					episode.append((X, y, action, reward, terminal))

					if reward == 1:
						self.positiveMemory.add(episode, positive=True)
						episode = []
					if reward == -1:
						self.negativeMemory.add(episode, positive=False)
						episode = []

				framebuffer[0:skip_frames-1] = framebuffer[1:]
				framebuffer[-1] = observation

		print "Score %.1f"%(total_reward / games)

		X_pos, y_pos = self.positiveMemory.sample(nbr_positive=(games-warmup)*25)
		X_neg, y_neg = self.negativeMemory.sample(nbr_negative=(games-warmup)*100)

		if not X_pos is None:
			print "Sample %d positive and %d negative memories"%(len(y_pos), len(y_neg))
			X_t = np.concatenate((X_pos, X_neg))
			y_t = np.concatenate((y_pos, y_neg))
		else:
			print "Sample %d negative memories"%(len(y_neg))
			X_t = X_neg
			y_t = y_neg

		while overfit:
			loss = self.model.learn(X_t, y_t)
			print "Loss: %f"%(loss)

		loss = self.model.learn(X_t, y_t)

		return total_reward / games, loss, np.mean(qs), epsilon, predictions
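
The exploration schedule in learn() decays epsilon linearly with the episode count and floors it at min_epsilon. A quick worked check with the constructor defaults above (epsilon=.9, epsilon_decay=1e-3, min_epsilon=.1):

# epsilon after N episodes with the defaults above
epsilon0, decay, floor = 0.9, 1e-3, 0.1
for episode in (1, 100, 800, 1000):
    print(episode, max(floor, epsilon0 - episode * decay))
# roughly 0.899, 0.80, then the 0.10 floor from episode 800 onward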