Example no. 1
0
    def __init__(self, config, network, loss, optimizer):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayMemory(config['REPLAY'])
        self.policy_net = network.to(self.device)
        # use an independent copy (requires `import copy`) so the target network
        # is not aliased to the policy network
        self.target_net = copy.deepcopy(network).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.loss = loss
        self.optimizer = optimizer(self.policy_net.parameters(), config['lr'])
        self.steps_done = 0
        self.config = config
Example no. 2
0
    def __init__(self, sess, logger, config, env):
        super(Agent, self).__init__(config, logger)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)
Example no. 4
0
class Agent(BaseAgent):
    '''Deep Trading Agent based on Deep Q Learning'''
    '''TODO: 
        1. add `play` function to run tests in the simulated environment
    '''

    def __init__(self, sess, logger, config, env):
        super(Agent, self).__init__(config, logger)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)

    @property
    def summary_writer(self):
        return self._summary_writer

    def train(self):
        start_step = self.sess.run(self.step_op)

        num_episodes, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        trade_rem = self.env.new_random_episode(self.history, self.replay_memory)

        for self.step in tqdm(range(start_step, self.max_step), ncols=70, initial=start_step):
            if self.step == self.learn_start:
                num_episodes, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict((self.history.history, trade_rem))
            # 2. act
            screen, reward, terminal, trade_rem = self.env.act(action)
            # 3. observe
            self.observe(screen, reward, action, terminal, trade_rem)

            if terminal:
                self.env.new_random_episode(self.history, self.replay_memory)
                num_episodes += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.

            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward
            
            if self.step >= self.learn_start:
                if self.step % self.test_step == self.test_step - 1:
                    avg_reward = total_reward / self.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count

                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                    message = 'avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                        % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_episodes)
                    self.logger.info(message)

                    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                        self.sess.run(
                            fetches=self.step_assign_op,
                            feed_dict={self.step_input: self.step + 1}
                        )
                        self.save_model(self.step + 1)

                        max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                    if self.step > 180:
                        self.inject_summary({
                            'average.reward': avg_reward,
                            'average.loss': avg_loss,
                            'average.q': avg_q,
                            'episode.max reward': max_ep_reward,
                            'episode.min reward': min_ep_reward,
                            'episode.avg reward': avg_ep_reward,
                            'episode.num of episodes': num_episodes,
                            'episode.rewards': ep_rewards,
                            'episode.actions': actions,
                            'training.learning_rate': self.sess.run(
                                fetches=self.learning_rate_op,
                                feed_dict={self.learning_rate_step: self.step}
                            )
                        }, self.step)

                    num_episodes = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []
    
    def predict(self, state, test_ep=None):
        s_t = state[0]
        trade_rem_t = state[1]
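        # epsilon is annealed linearly from ep_start to ep_end over ep_end_t steps once learning starts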
        ep = test_ep or (self.ep_end +
            max(0., (self.ep_start - self.ep_end) \
            * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t))

        if random.random() < ep:
            action = random.randrange(self.config[NUM_ACTIONS])
        else:
            action = self.sess.run(
                fetches=self.q.action,
                feed_dict={
                    self.q.phase: 0,  
                    self.s_t: [s_t], 
                    self.trade_rem_t: [trade_rem_t],
                    self.q_conv_keep_prob: 1.0,
                    self.q_dense_keep_prob: 1.0,
                    self.q_gru_keep_prob: 1.0
                }
            )[0]

        return action

    def observe(self, screen, reward, action, terminal, trade_rem):
        #clip reward in the range min to max
        reward = max(self.min_reward, min(self.max_reward, reward))
        
        self.history.add(screen)
        self.replay_memory.add(screen, reward, action, terminal, trade_rem)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_network()

    def q_learning_mini_batch(self):
        if self.replay_memory.count >= self.replay_memory.history_length:
            state_t, action, reward, state_t_plus_1, terminal = self.replay_memory.sample
            s_t, trade_rem_t = state_t[0], state_t[1]
            s_t_plus_1, trade_rem_t_plus_1 = state_t_plus_1[0], state_t_plus_1[1]
            
            q_t_plus_1 = self.sess.run(
                fetches=self.t_q.values,
                feed_dict={
                    self.t_q.phase: 0, 
                    self.t_s_t: s_t_plus_1, 
                    self.t_trade_rem_t: trade_rem_t_plus_1
                }
            )

            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)

            terminal = np.array(terminal) + 0.
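            # Q-learning target: r + (1 - terminal) * max_a' Q_target(s', a');
            # the bootstrap term is zeroed for terminal transitions (no explicit discount factor in this snippet)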
            target_q = reward + (1 - terminal) * max_q_t_plus_1

            _, q_t, loss, avg_q_summary = self.sess.run([self.optimizer, self.q.values, self.loss, self.q.avg_q_summary], {
                self.q.phase: 1,
                self.target_q: target_q,
                self.action: action,
                self.s_t: s_t,
                self.trade_rem_t: trade_rem_t,
                self.q_conv_keep_prob: self.config[CONV_KEEP_PROB],
                self.q_dense_keep_prob: self.config[DENSE_KEEP_PROB],
                self.q_gru_keep_prob: self.config[GRU_KEEP_PROB],
                self.learning_rate_step: self.step
            })

            self.summary_writer.add_summary(avg_q_summary, self.step)
            self.total_loss += loss
            self.total_q += q_t.mean()
            self.update_count += 1

    def build_dqn(self, params):
        with tf.variable_scope(PREDICTION):
            self.s_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.replay_memory.history_length, 
                            self.replay_memory.num_channels],
                name=HISTORICAL_PRICES
            )
            self.trade_rem_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None,],
                name=TRADE_REM
            )
            
            with tf.variable_scope(DROPOUT_KEEP_PROBS):
                self.q_conv_keep_prob = tf.placeholder(tf.float32)
                self.q_dense_keep_prob = tf.placeholder(tf.float32)
                self.q_gru_keep_prob = tf.placeholder(tf.float32)

        params.dropoutkeepprobs = DropoutKeepProbs(
                    self.q_conv_keep_prob,
                    self.q_dense_keep_prob,
                    self.q_gru_keep_prob
                )
        self.q = DeepSense(params, self.logger, self.sess, self.config, name=Q_NETWORK)
        self.q.build_model((self.s_t, self.trade_rem_t))

        with tf.variable_scope(TARGET):
            self.t_s_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.replay_memory.history_length, 
                            self.replay_memory.num_channels],
                name=HISTORICAL_PRICES
            )
            self.t_trade_rem_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None,],
                name=TRADE_REM
            )

        params.dropoutkeepprobs = DropoutKeepProbs()
        self.t_q = DeepSense(params, self.logger, self.sess, self.config, name=T_Q_NETWORK)
        self.t_q.build_model((self.t_s_t, self.t_trade_rem_t))

        with tf.variable_scope(UPDATE_TARGET_NETWORK):
            self.q_weights_placeholders = {}
            self.t_weights_assign_ops = {}

            for name in self.q.weights.keys():
                self.q_weights_placeholders[name] = tf.placeholder(
                            tf.float32,
                            self.q.weights[name].get_shape().as_list()
                        )
            for name in self.q.weights.keys():
                self.t_weights_assign_ops[name] = self.t_q.weights[name].assign(
                    self.q_weights_placeholders[name]
                )

        with tf.variable_scope(TRAINING):
            self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
            self.action = tf.placeholder(tf.int64, [None], name=ACTION)
            
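            # build a one-hot mask over actions and use it to pick out Q(s, a) for the action actually taken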
            action_one_hot = tf.one_hot(self.action, self.config[NUM_ACTIONS], 
                                            1.0, 0.0, name=ACTION_ONE_HOT)
            q_acted = tf.reduce_sum(self.q.values * action_one_hot, 
                                        reduction_indices=1, name=Q_ACTED)
                                        
            with tf.variable_scope(LOSS):
                self.delta = self.target_q - q_acted

                self.global_step = tf.Variable(0, trainable=False)

                self.loss = tf.reduce_mean(clipped_error(self.delta), name=LOSS)

            with tf.variable_scope(OPTIMIZER):
                self.learning_rate_step = tf.placeholder(tf.int64, None, name=LEARNING_RATE_STEP)
                self.learning_rate_op = tf.maximum(self.learning_rate_minimum,
                    tf.train.exponential_decay(
                        self.learning_rate,
                        self.learning_rate_step,
                        self.learning_rate_decay_step,
                        self.learning_rate_decay,
                        staircase=True))

                self.optimizer = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)

        with tf.variable_scope(SUMMARY):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', \
                'episode.max reward', 'episode.min reward', 'episode.avg reward', \
                'episode.num of episodes', 'training.learning_rate']            

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = \
                    tf.summary.scalar(
                        name="{}-{}".format(self.env_name, tag.replace(' ', '_')),
                        tensor=self.summary_placeholders[tag]
                    )

            histogram_summary_tags = ['episode.rewards', 'episode.actions']
            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag)
                self.summary_ops[tag] = \
                    tf.summary.histogram(
                        tag,
                        self.summary_placeholders[tag]
                    )

        self.sess.run(tf.local_variables_initializer())
        self.sess.run(tf.global_variables_initializer())
        self._saver = tf.train.Saver(list(self.q.weights.values()) + [self.step_op], max_to_keep=30)
        
        self.load_model()
        self.update_target_network()

        self._summary_writer = tf.summary.FileWriter(self.config[TENSORBOARD_LOG_DIR])
        self._summary_writer.add_graph(self.sess.graph)

    def update_target_network(self):
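        # hard update: copy every Q-network weight into the target network through its placeholder/assign op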
        for name in self.q.weights.keys():
            self.sess.run(
                fetches=self.t_weights_assign_ops[name],
                feed_dict=
                {self.q_weights_placeholders[name]: self.sess.run(
                    fetches=self.q.weights[name]
                )}
            )
    
    def inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dict.keys()], {
            self.summary_placeholders[tag]: value for tag, value in tag_dict.items()
        })
        for summary_str in summary_str_lists:
            self.summary_writer.add_summary(summary_str, self.step)
Example no. 5
0
class DQN:
    def __init__(self, config, network, loss, optimizer):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayMemory(config['REPLAY'])
        self.policy_net = network.to(self.device)
        # use an independent copy (requires `import copy`) so the target network
        # is not aliased to the policy network
        self.target_net = copy.deepcopy(network).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.loss = loss
        self.optimizer = optimizer(self.policy_net.parameters(), config['lr'])
        self.steps_done = 0
        self.config = config

    def update(self):
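        # hard target update: overwrite the target network with the current policy-network weights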
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def select_action(self, state):
        EPS_START = self.config['EPS_START']
        EPS_END = self.config['EPS_END']
        EPS_DECAY = self.config['EPS_DECAY']
        n_actions = self.config['ACTION_SPACE']
        sample = random.random()
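        # epsilon decays exponentially from EPS_START towards EPS_END with time constant EPS_DECAY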
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * self.steps_done / EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                # t.max(1) will return largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(n_actions)]],
                                device=self.device,
                                dtype=torch.long)

    def optimize_model(self):
        BATCH_SIZE = self.config['BATCH_SIZE']
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        GAMMA = self.config['GAMMA']
        expected_state_action_values = (next_state_values *
                                        GAMMA) + reward_batch

        # Compute Huber loss
        loss = self.loss(state_action_values,
                         expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
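
The PyTorch DQN class above relies on a `ReplayMemory` and a `Transition` tuple that are not part of the snippet. A minimal sketch compatible with the calls made here (`memory.sample(batch_size)`, `len(memory)`, and `Transition(*zip(*transitions))` with `state`, `action`, `next_state` and `reward` fields), modelled on the standard PyTorch DQN tutorial, could look as follows; the `push` helper and the exact field order are assumptions rather than part of the original code.

import random
from collections import deque, namedtuple

# Field names assumed from the attribute accesses in optimize_model above.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity cyclic buffer of transitions."""

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        # store one Transition(state, action, next_state, reward)
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        # uniform random minibatch, as consumed by optimize_model
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)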
Example no. 6
0
class Agent(BaseAgent):
    '''Deep Trading Agent based on Deep Q Learning'''
    '''TODO: 
        1. play
    '''
    def __init__(self, sess, logger, config, env):
        super(Agent, self).__init__(config, logger)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)

    @property
    def summary_writer(self):
        return self._summary_writer

    def train(self):
        start_step = self.step_op.eval()

        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        self.env.new_random_episode(self.history)

        for self.step in tqdm(range(start_step, self.max_step),
                              ncols=70,
                              initial=start_step):
            if self.step == self.learn_start:
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict(self.history.get())
            # 2. act
            screen, reward, terminal = self.env.act(action)
            # 3. observe
            self.observe(screen, reward, action, terminal)

            if terminal:
                self.env.new_random_episode(self.history)
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.

            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

            if self.step >= self.learn_start:
                if self.step % self.test_step == self.test_step - 1:
                    avg_reward = total_reward / self.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count

                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                    message = 'avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                        % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_game)
                    print_and_log_message(message, self.logger)

                    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                        self.step_assign_op.eval(
                            {self.step_input: self.step + 1})
                        self.save_model(self.step + 1)

                        max_avg_ep_reward = max(max_avg_ep_reward,
                                                avg_ep_reward)

                    if self.step > 180:
                        self.inject_summary(
                            {
                                'average.reward':
                                avg_reward,
                                'average.loss':
                                avg_loss,
                                'average.q':
                                avg_q,
                                'episode.max reward':
                                max_ep_reward,
                                'episode.min reward':
                                min_ep_reward,
                                'episode.avg reward':
                                avg_ep_reward,
                                'episode.num of game':
                                num_game,
                                'episode.rewards':
                                ep_rewards,
                                'episode.actions':
                                actions,
                                'training.learning_rate':
                                self.learning_rate_op.eval(
                                    {self.learning_rate_step: self.step}),
                            }, self.step)

                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

    def predict(self, s_t, test_ep=None):
        ep = test_ep or (self.ep_end +
            max(0., (self.ep_start - self.ep_end) \
            * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t))

        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            action = self.q.action.eval({self.s_t: [s_t]})[0]

        return action

    def observe(self, screen, reward, action, terminal):
        #clip reward in the range min to max
        reward = max(self.min_reward, min(self.max_reward, reward))

        self.history.add(screen)
        self.replay_memory.add(screen, reward, action, terminal)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_network()

    def q_learning_mini_batch(self):
        if self.replay_memory.count >= self.replay_memory.history_length:
            s_t, action, reward, s_t_plus_1, terminal = self.replay_memory.sample(
            )

            q_t_plus_1 = self.t_q.values.eval({self.t_s_t: s_t_plus_1})
            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
            terminal = np.array(terminal) + 0.
            target_q = reward + (1 - terminal) * max_q_t_plus_1

            _, q_t, loss, avg_q_summary = self.sess.run(
                [
                    self.optimizer, self.q.values, self.loss,
                    self.q.avg_q_summary
                ], {
                    self.target_q: target_q,
                    self.action: action,
                    self.s_t: s_t,
                    self.learning_rate_step: self.step,
                })

            self.summary_writer.add_summary(avg_q_summary, self.step)
            self.total_loss += loss
            self.total_q += q_t.mean()
            self.update_count += 1

    def build_dqn(self, params):
        with tf.variable_scope(PREDICTION):
            self.s_t = tf.placeholder(dtype=tf.float32,
                                      shape=[
                                          None,
                                          self.replay_memory.history_length,
                                          self.replay_memory.num_channels
                                      ])
        self.q = DeepSense(params,
                           self.logger,
                           self.sess,
                           self.config,
                           name=Q_NETWORK)
        self.q.build_model(self.s_t)

        with tf.variable_scope(TARGET):
            self.t_s_t = tf.placeholder(dtype=tf.float32,
                                        shape=[
                                            None,
                                            self.replay_memory.history_length,
                                            self.replay_memory.num_channels
                                        ])
        self.t_q = DeepSense(params,
                             self.logger,
                             self.sess,
                             self.config,
                             name=T_Q_NETWORK)
        self.t_q.build_model(self.t_s_t, train=False)

        with tf.variable_scope(UPDATE_TARGET_NETWORK):
            self.q_weights_placeholders = {}
            self.t_weights_assign_ops = {}

            for name in self.q.weights.keys():
                self.q_weights_placeholders[name] = tf.placeholder(
                    tf.float32, self.q.weights[name].get_shape().as_list())
            for name in self.q.weights.keys():
                self.t_weights_assign_ops[name] = self.t_q.weights[
                    name].assign(self.q_weights_placeholders[name])

        with tf.variable_scope(TRAINING):
            self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
            self.action = tf.placeholder(tf.int64, [None], name=ACTION)

            action_one_hot = tf.one_hot(self.action,
                                        self.env.action_size,
                                        1.0,
                                        0.0,
                                        name=ACTION_ONE_HOT)
            q_acted = tf.reduce_sum(self.q.values * action_one_hot,
                                    reduction_indices=1,
                                    name=Q_ACTED)

            with tf.variable_scope(LOSS):
                self.delta = self.target_q - q_acted

                self.global_step = tf.Variable(0, trainable=False)

                self.loss = tf.reduce_mean(clipped_error(self.delta),
                                           name=LOSS)

            with tf.variable_scope(OPTIMIZER):
                self.learning_rate_step = tf.placeholder(
                    tf.int64, None, name=LEARNING_RATE_STEP)
                self.learning_rate_op = tf.maximum(
                    self.learning_rate_minimum,
                    tf.train.exponential_decay(self.learning_rate,
                                               self.learning_rate_step,
                                               self.learning_rate_decay_step,
                                               self.learning_rate_decay,
                                               staircase=True))

                self.optimizer = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95,
                    epsilon=0.01).minimize(self.loss)

        with tf.variable_scope(SUMMARY):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', \
                'episode.max reward', 'episode.min reward', 'episode.avg reward', \
                'episode.num of game', 'training.learning_rate']

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = \
                    tf.summary.scalar(
                        name="{}-{}".format(self.env_name, tag),
                        tensor=self.summary_placeholders[tag]
                    )

            histogram_summary_tags = ['episode.rewards', 'episode.actions']
            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = \
                    tf.summary.histogram(
                        name=tag,
                        values=self.summary_placeholders[tag]
                    )

        self._summary_writer = tf.summary.FileWriter(
            self.config[TENSORBOARD_LOG_DIR])
        self._summary_writer.add_graph(self.sess.graph)

        tf.initialize_all_variables().run()
        self._saver = tf.train.Saver(list(self.q.weights.values()) + [self.step_op],
                                     max_to_keep=30)

        self.load_model()
        self.update_target_network()

    def update_target_network(self):
        for name in self.q.weights.keys():
            self.t_weights_assign_ops[name].eval({
                self.q_weights_placeholders[name]:
                self.q.weights[name].eval()
            })

    def inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run(
            [self.summary_ops[tag] for tag in tag_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in tag_dict.items()
            })
        for summary_str in summary_str_lists:
            self.summary_writer.add_summary(summary_str, self.step)
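Example no. 7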
def run(comm, env, policy, policy_path, action_bound, optimizer):

    actor, actor_target, critic, critic_target = policy
    actor_opt, critic_opt = optimizer

    # set OU noise
    noise_c = OUNoise(action_dim=ACT_SIZE, action_bound=action_bound)
    noise_c.reset()

    buff = []
    global_update = 0
    global_step = 0

    replay_memory = ReplayMemory(REPLAY_SIZE, SEED)

    #world reset
    if env.index == 0:
        env.reset_world()

    for id in range(MAX_EPISODES):

        #reset
        env.reset_pose()

        terminal = False
        ep_reward = 0
        step = 1

        # generate goal
        env.generate_goal_point()

        # get_state
        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        # episode 1
        while not terminal and not rospy.is_shutdown():

            state_list = comm.gather(state, root=0)

            ## get_action
            #-------------------------------------------------------------------------
            # generate actions at rank==0
            mean, action = generate_action(env=env,
                                           state_list=state_list,
                                           actor=actor,
                                           action_bound=action_bound)

            ## exploration
            #--------------------------------------------------------------------------
            a = noise_c.get_action(mean, step)
            #noise = np.random.normal(0, exploration_noise, size=2) #action size check
            #a = mean + noise
            #a = a.clip(action_bound[0], action_bound[1])

            # execute actions
            #-------------------------------------------------------------------------
            real_action = comm.scatter(a, root=0)

            ## run action
            #-------------------------------------------------------------------------
            env.control_vel(real_action)

            # rate.sleep()
            rospy.sleep(0.001)

            ## get reward
            #-------------------------------------------------------------------------
            # get information
            r, terminal, result = env.get_reward_and_terminate(step)

            ep_reward += r
            global_step += 1

            #-------------------------------------------------------------------------
            # get next state
            #-------------------------------------------------------------------------

            s_next = env.get_laser_observation()
            left = obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)
            state_next_list = comm.gather(state_next, root=0)

            #-------------------------------------------------------------------------
            ## training
            #-------------------------------------------------------------------------
            if env.index == 0:

                ## save data in memory
                #-------------------------------------------------------------------------
                replay_memory.push(state[0], state[1], state[2], a, r_list,
                                   state_next[0], state_next[1], state_next[2],
                                   terminal_list)
                policy_list = [actor, actor_target, critic, critic_target]
                optimizer_list = [actor_opt, critic_opt]

                if len(replay_memory) > BATCH_SIZE:

                    ## Update step by step
                    #-------------------------------------------------------------------------
                    ddpg_update_stage(policy=policy_list,
                                      optimizer=optimizer_list,
                                      batch_size=BATCH_SIZE,
                                      memory=replay_memory,
                                      epoch=EPOCH,
                                      replay_size=REPLAY_SIZE,
                                      gamma=GAMMA,
                                      num_step=BATCH_SIZE,
                                      num_env=NUM_ENV,
                                      frames=LASER_HIST,
                                      obs_size=OBS_SIZE,
                                      act_size=ACT_SIZE,
                                      tau=TAU)
                    global_update += 1

            step += 1
            state = state_next

        ## save policy and log
        #-------------------------------------------------------------------------
        if env.index == 0:
            if global_update != 0 and global_update % 3000 == 0:
                torch.save(actor.state_dict(),
                           policy_path + '/actor_{}'.format(global_update))
                torch.save(critic.state_dict(),
                           policy_path + '/critic_{}'.format(global_update))

                logger.info(
                    '########################## model saved when update {} times#########'
                    '################'.format(global_update))
        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' % \
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
Example no. 8
0
def run(comm, env, policy, critic, critic_opt, critic_target, policy_path,
        action_bound, optimizer):

    buff = []
    global_update = 0
    global_step = 0

    # world reset
    if env.index == 0:
        env.reset_world()

    memory_position = 0
    update = 0

    # replay_memory
    replay_memory = ReplayMemory(REPLAY_SIZE, SEED)

    for id in range(MAX_EPISODES):

        # reset
        env.reset_pose()

        terminal = False
        ep_reward = 0
        step = 1

        # generate goal
        env.generate_goal_point()

        # get_state
        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        # episode 1
        while not terminal and not rospy.is_shutdown():

            state_list = comm.gather(state, root=0)

            ## get_action
            #-------------------------------------------------------------------------
            # generate actions at rank==0
            a, logprob, scaled_action = generate_action(
                env=env,
                state_list=state_list,
                policy=policy,
                action_bound=action_bound)

            # execute actions
            #-------------------------------------------------------------------------
            real_action = comm.scatter(scaled_action, root=0)

            ## run action
            #-------------------------------------------------------------------------
            env.control_vel(real_action)

            rospy.sleep(0.001)

            ## get reward
            #-------------------------------------------------------------------------
            # get information
            r, terminal, result = env.get_reward_and_terminate(step)
            ep_reward += r
            global_step += 1

            #-------------------------------------------------------------------------

            # get next state
            #-------------------------------------------------------------------------
            s_next = env.get_laser_observation()
            left = obs_stack.popleft()

            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            #-------------------------------------------------------------------------

            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)
            state_next_list = comm.gather(state_next, root=0)

            ## training
            #-------------------------------------------------------------------------
            if env.index == 0:

                # add data in replay_memory
                #-------------------------------------------------------------------------
                replay_memory.push(state[0], state[1], state[2], a, logprob,
                                   r_list, state_next[0], state_next[1],
                                   state_next[2], terminal_list)
                if len(replay_memory) > BATCH_SIZE:

                    ## update
                    #-------------------------------------------------------------------------
                    update = sac_update_stage(policy=policy,
                                              optimizer=optimizer,
                                              critic=critic,
                                              critic_opt=critic_opt,
                                              critic_target=critic_target,
                                              batch_size=BATCH_SIZE,
                                              memory=replay_memory,
                                              epoch=EPOCH,
                                              replay_size=REPLAY_SIZE,
                                              tau=TAU,
                                              alpha=ALPHA,
                                              gamma=GAMMA,
                                              updates=update,
                                              update_interval=UPDATE_INTERVAL,
                                              num_step=BATCH_SIZE,
                                              num_env=NUM_ENV,
                                              frames=LASER_HIST,
                                              obs_size=OBS_SIZE,
                                              act_size=ACT_SIZE)

                    buff = []
                    global_update += 1
                    update = update

            step += 1
            state = state_next

        ## save policy
        #--------------------------------------------------------------------------------------------------------------
        if env.index == 0:
            if global_update != 0 and global_update % 1000 == 0:
                torch.save(policy.state_dict(),
                           policy_path + '/policy_{}'.format(global_update))
                torch.save(critic.state_dict(),
                           policy_path + '/critic_{}'.format(global_update))
                logger.info(
                    '########################## model saved when update {} times#########'
                    '################'.format(global_update))
        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' % \
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
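Example no. 9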
def run(comm, env, agent, policy_path, args):

    # Training Loop
    test_interval = 10
    save_interval = 100

    total_numsteps = 0
    updates = 0

    # world reset
    if env.index == 0: # step
        env.reset_gazebo_simulation()
        # TensorBoard
        writer = SummaryWriter('test_runs/' + policy_path)
        

    # replay_memory     
    memory = ReplayMemory(args.replay_size, args.seed)

    

    for i_episode in range(args.num_steps):

        episode_reward = 0
        episode_steps = 0
        done = False 

        # Env reset
        env.set_gazebo_pose()
        # generate goal
        env.generate_goal_point_gazebo()    
        
        # Get initial state
        frame = env.get_laser_observation()
        frame_stack = deque([frame, frame, frame])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [frame_stack, goal, speed]

        # Episode start
        while not done and not rospy.is_shutdown():    
            state_list = comm.gather(state, root=0)

            if env.index == 0:
                action = agent.select_action(state_list)
            else:
                action = None
            
            if env.index == 0:
                if len(memory) > args.batch_size:
                    # Number of updates per step in environment
                    for i in range(args.updates_per_step):
                        # Update parameters of all the networks
                        critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, args.batch_size, updates)

                        writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                        writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                        writer.add_scalar('loss/policy', policy_loss, updates)
                        writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                        writer.add_scalar('entropy_temprature/alpha', alpha, updates)
                        updates += 1
                 
            # Execute actions
            #-------------------------------------------------------------------------            
            action_clip_bound = [[0, -1], [1, 1]] #### Action maximum, minimum values
            clipped_action = np.clip(action, a_min=action_clip_bound[0], a_max=action_clip_bound[1])
            real_action = comm.scatter(clipped_action, root=0)
            
            #if real_action[0] >= 0.02 and real_action[0] <= 0.2:
            #    real_action[0] = real_action[0] / 0.6
            if real_action[0] < 0.10 and real_action[0] > 0:
                real_action[0] = 0.1
                

            if real_action[1] > 0 and real_action[1] < 0.15:
                real_action[1] = 0.0

            elif real_action[1] < 0 and real_action[1] > -0.15:
                real_action[1] = 0.0
            
            env.control_vel(real_action)

            #rospy.sleep(0.001)

            ## Get reward and terminal state
            reward, done, result = env.get_reward_and_terminate(episode_steps)
            #print("Action : [{}, {}], Distance : {}, Reward : {}".format(real_action[0], real_action[1], env.distance, reward))

            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward

            # Get next state
            next_frame = env.get_laser_observation()
            left = frame_stack.popleft()
            
            frame_stack.append(next_frame)
            next_goal = np.asarray(env.get_local_goal())
            next_speed = np.asarray(env.get_self_speed())
            next_state = [frame_stack, next_goal, next_speed]

            r_list = comm.gather(reward, root=0)
            done_list = comm.gather(done, root=0)
            next_state_list = comm.gather(next_state, root=0)

            if env.index == 0:
                #meomry.list_push(state_list, action, r_list, next_state_list, done_list)
                for i in range(np.asarray(state_list).shape[0]):
                    memory.push(state_list[i][0], state_list[i][1], state_list[i][2], action[i], r_list[i], next_state_list[i][0], next_state_list[i][1], next_state_list[i][2], done_list[i]) # Append transition to memory

            state = next_state  
        

        #if total_numsteps > args.num_steps:
        #    break
        
        if env.index == 0:
            writer.add_scalar('reward/train', episode_reward, i_episode)


        if env.index == 0:
            #if global_update != 0 and global_update % 20 == 0:
            if i_episode != 0 and i_episode % save_interval == 0:
    
                torch.save(agent.policy.state_dict(), policy_path + '/policy_epi_{}'.format(i_episode))
                print('########################## policy model saved when update {} times#########'
                            '################'.format(i_episode))
                torch.save(agent.critic_1.state_dict(), policy_path + '/critic_1_epi_{}'.format(i_episode))
                print('########################## critic model saved when update {} times#########'
                            '################'.format(i_episode))
                torch.save(agent.critic_2.state_dict(), policy_path + '/critic_2_epi_{}'.format(i_episode))
                print('########################## critic model saved when update {} times#########'
                            '################'.format(i_episode))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 + (env.goal_point[1]-env.init_pose[1])**2)
        print("Env: {}, memory_size: {}, Goal: ({} , {}), Episode: {}, steps: {}, Reward: {}, Distance: {}, {}".format(env.index, len(memory), round(env.goal_point[0],2), round(env.goal_point[1],2), i_episode+1, episode_steps, round(episode_reward, 2), round(distance, 2), result))
Example no. 10
0
class Agent(BaseAgent):
    '''Deep Trading Agent based on Deep Q Learning'''
    '''TODO: 
        1. add summary ops
        2. timing and logging
        3. model saving
        4. increment self.step
    '''
    def __init__(self, sess, logger, config, env):
        super(Agent, self).__init__(config)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)

    def train(self):
        start_step = self.step_op.eval()

        num_episodes, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        self.env.new_random_episode(self.history)

        for self.step in range(start_step, self.max_step):
            if self.step == self.learn_start:
                num_episodes, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict(self.history.get())
            # 2. act
            screen, reward, terminal = self.env.act(action)
            # 3. observe
            self.observe(screen, reward, action, terminal)

            if terminal:
                self.env.new_random_episode(self.history)
                num_episodes += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.

            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

    def predict(self, s_t, test_ep=None):
        ep = test_ep or (self.ep_end +
            max(0., (self.ep_start - self.ep_end) \
            * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t))

        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            action = self.q.action.eval({self.s_t: [s_t]})[0]

        return action

    def observe(self, screen, reward, action, terminal):
        #clip reward in the range min to max
        reward = max(self.min_reward, min(self.max_reward, reward))

        self.history.add(screen)
        self.replay_memory.add(screen, reward, action, terminal)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_network()

    def q_learning_mini_batch(self):
        if self.replay_memory.count >= self.replay_memory.history_length:
            s_t, action, reward, s_t_plus_1, terminal = self.replay_memory.sample(
            )

            q_t_plus_1 = self.t_q.values.eval({self.t_s_t: s_t_plus_1})
            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
            terminal = np.array(terminal) + 0.
            target_q = reward + (1 - terminal) * max_q_t_plus_1

            _, q_t, loss = self.sess.run(
                [self.optimizer, self.q.values, self.loss], {
                    self.target_q: target_q,
                    self.action: action,
                    self.s_t: s_t,
                    self.learning_rate_step: self.step,
                })

            self.total_loss += loss
            self.total_q += q_t.mean()
            self.update_count += 1

    def build_dqn(self, params):
        with tf.variable_scope(PREDICTION):
            self.s_t = tf.placeholder(dtype=tf.float32,
                                      shape=[
                                          None,
                                          self.replay_memory.history_length,
                                          self.replay_memory.num_channels
                                      ])
        self.q = DeepSense(params,
                           self.logger,
                           self.sess,
                           self.config,
                           name=Q_NETWORK)
        self.q.build_model(self.s_t)

        with tf.variable_scope(TARGET):
            self.t_s_t = tf.placeholder(dtype=tf.float32,
                                        shape=[
                                            None,
                                            self.replay_memory.history_length,
                                            self.replay_memory.num_channels
                                        ])
        self.t_q = DeepSense(params,
                             self.logger,
                             self.sess,
                             self.config,
                             name=T_Q_NETWORK)
        self.t_q.build_model(self.t_s_t, train=False)

        with tf.variable_scope(UPDATE_TARGET_NETWORK):
            self.q_weights_placeholders = {}
            self.t_weights_assign_ops = {}

            for name in self.q.weights.keys():
                self.q_weights_placeholders[name] = tf.placeholder(
                    tf.float32, self.q.weights[name].get_shape().as_list())
            for name in self.q.weights.keys():
                self.t_weights_assign_ops[name] = self.t_q.weights[
                    name].assign(self.q_weights_placeholders[name])

        with tf.variable_scope(TRAINING):
            self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
            self.action = tf.placeholder(tf.int64, [None], name=ACTION)

            action_one_hot = tf.one_hot(self.action,
                                        self.env.action_size,
                                        1.0,
                                        0.0,
                                        name=ACTION_ONE_HOT)
            q_acted = tf.reduce_sum(self.q.values * action_one_hot,
                                    reduction_indices=1,
                                    name=Q_ACTED)

            with tf.variable_scope(LOSS):
                self.delta = self.target_q - q_acted

                self.global_step = tf.Variable(0, trainable=False)

                self.loss = tf.reduce_mean(clipped_error(self.delta),
                                           name=LOSS)
            with tf.variable_scope(OPTIMIZER):
                self.learning_rate_step = tf.placeholder(
                    tf.int64, None, name=LEARNING_RATE_STEP)
                self.learning_rate_op = tf.maximum(
                    self.learning_rate_minimum,
                    tf.train.exponential_decay(self.learning_rate,
                                               self.learning_rate_step,
                                               self.learning_rate_decay_step,
                                               self.learning_rate_decay,
                                               staircase=True))

                self.optimizer = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95,
                    epsilon=0.01).minimize(self.loss)

        # tf.initialize_all_variables().run()
        #initialize the q network and the target network with the same weights
        # self.update_target_network()

    def update_target_network(self):
        for name in self.q.weights.keys():
            self.t_weights_assign_ops[name].eval({
                self.q_weights_placeholders[name]:
                self.q.weights[name].eval()
            })