Example #1
class RDPG:
    """docstring for RDPG"""
    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N sequences from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # Construct histories
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                next_observations.append(self.pad(each.observations[1:i + 1]))
                actions.append(each.actions[0:i - 1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)
        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(observations)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, observations,
                                  [self.pad(i) for i in actions])

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(
            observations, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, observations)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "modle.ckpt", episode)

    def noise_action(self, history):
        # Select action a_t according to a sequence of observation and action
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        action = self.actor_network.action(history)
        return action

    def perceive(self, history, done):
        # 'done' is passed in explicitly so the noise process can be reset at episode end
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)

        # Store histories until the buffer reaches REPLAY_START_SIZE, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def pad(self, seq):
        # Zero-pad a variable-length history up to the fixed maximum length (1000 steps here)
        dim = len(seq[0])
        return seq + [[0] * dim] * (1000 - len(seq))
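
Every example in this listing constructs an OUNoise object for exploration, but the class itself is not shown. Below is a minimal sketch of a typical Ornstein-Uhlenbeck noise process it could correspond to; the mu, theta and sigma defaults are assumptions, not values taken from these projects.

import numpy as np

class OUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck process) - sketch."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu          # long-run mean the process is pulled toward
        self.theta = theta    # strength of the mean reversion
        self.sigma = sigma    # scale of the Gaussian perturbation
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Called at the end of an episode to restart the process from its mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state
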
Example #2
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env, results_file):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        results_file.write(ActorNetwork.get_settings())

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the replay buffer reaches REPLAY_START_SIZE, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
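
Both DDPG examples so far call update_target() on the actor and critic after every optimisation step, but the network classes are outside this listing. A sketch of the soft ("Polyak") update such a method usually implements in TF1-style code; net_vars, target_vars and the TAU value are assumptions here, not code from these projects.

import tensorflow as tf

TAU = 0.001  # assumed soft-update rate

def make_soft_update_ops(net_vars, target_vars, tau=TAU):
    # theta_target <- tau * theta + (1 - tau) * theta_target, built once at graph time
    return [target.assign(tau * net + (1.0 - tau) * target)
            for net, target in zip(net_vars, target_vars)]

# usage inside a network class (TF1 style):
#   self.update_ops = make_soft_update_ops(self.net_vars, self.target_vars)
#   def update_target(self): self.sess.run(self.update_ops)
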
Example #3
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.OU = OU()

        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(save_location)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.getBatch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def saveNetwork(self):
        self.saver.save(self.sess,
                        save_location + self.env_name + 'network' + '-ddpg',
                        global_step=self.time_step)

    def action(self, state):
        action = self.actor_network.action(state)
        action[0] = np.clip(action[0], -1, 1)
        action[1] = np.clip(action[1], 0, 1)
        action[2] = np.clip(action[2], 0, 1)
        #print "Action:", action
        return action

    def noise_action(self, state, epsilon):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        #print action.shape
        #print "Action_No_Noise:", action
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.OU.function(action[0], 0.0, 0.60, 0.80)
        noise_t[1] = epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10)
        noise_t[2] = epsilon * self.OU.function(action[2], -0.1, 1.00, 0.05)

        if random.random() <= 0.01:  # 0.1
            print("********Stochastic brake***********")
            noise_t[2] = epsilon * self.OU.function(action[2], 0.2, 1.00, 0.10)

        action = action + noise_t
        action[0] = np.clip(action[0], -1, 1)
        action[1] = np.clip(action[1], 0, 1)
        action[2] = np.clip(action[2], 0, 1)

        #print "Action_Noise:", action
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer

        if (not (math.isnan(reward))):
            self.replay_buffer.add(state, action, reward, next_state, done)

        self.time_step = self.time_step + 1
        # Store transitions until the replay buffer reaches REPLAY_START_SIZE, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
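
The per-sample loop that builds y_batch in these train() methods can also be written in vectorized NumPy. A sketch under the assumption that reward_batch and done_batch are 1-D arrays of length BATCH_SIZE and q_value_batch has shape [BATCH_SIZE, 1]:

import numpy as np

def compute_targets(reward_batch, done_batch, q_value_batch, gamma):
    # y_i = r_i                      if the episode terminated at step i
    # y_i = r_i + gamma * Q'(s', a') otherwise
    not_done = 1.0 - done_batch.astype(np.float32)   # [BATCH_SIZE]
    q_next = q_value_batch.reshape(-1)               # [BATCH_SIZE]
    y = reward_batch + gamma * q_next * not_done     # [BATCH_SIZE]
    return y.reshape(-1, 1)                          # [BATCH_SIZE, 1]
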
Example #4
class DDPG:

    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return

    def train(self):
        action_dim = self.action_dim

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)  # sample BATCH_SIZE from replay_buffer
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # if action_dim = 1, it's a number not a array
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # calculate y_batch via target network
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)

        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # print np.shape(reward_batch), np.shape(y_batch)

        # train actor network
        self.actor_network.train(state_batch)

        # train critic network
        self.critic_network.train(y_batch, state_batch, action_batch)

        # update target network
        self.actor_network.update_target()
        self.critic_network.update_target()
        return

    def noise_action(self, state):
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def _record_log(self, reward, living_time):
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)
        return

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()
        # for testing
        # self.time_step += 1
        # if self.time_step == 100:
        #     print '--------------------------------'
        #     self.replay_buffer.save_to_pickle()
        # return
        
        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print '===============reset noise========================='
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1
        return

    def load_time_step(self):
        if not os.path.exists(self.dir_path):
            return
        files = os.listdir(self.dir_path)
        step_list = []
        for filename in files:
            if ('meta' in filename) or ('-' not in filename):
                continue
            step_list.append(int(filename.split('-')[-1]))
        step_list = sorted(step_list)
        if len(step_list) == 0:
            return
        self.time_step = step_list[-1] + 1
        return

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print 'Successfully loaded:', checkpoint.model_checkpoint_path
        else:
            print 'Could not find old network weights'
        return

    def save_network(self):
        print 'save actor-critic network...', self.time_step
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)
        return
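
The ReplayBuffer these agents depend on is likewise not included in the listing. A minimal deque-based sketch that covers the interface the examples call (add, get_batch/getBatch, count/size), assuming transitions are stored as 5-tuples:

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)  # oldest transitions dropped automatically

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # Uniform random minibatch; callers check count() > REPLAY_START_SIZE first
        return random.sample(self.buffer, batch_size)

    def count(self):
        return len(self.buffer)

    # aliases used by some of the examples above
    getBatch = get_batch
    size = count
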
Example #5
class Worker:
    """docstring for Worker"""
    def __init__(self, sess, number, model_path, global_episodes, explore,
                 decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph(
            'global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph(
            'global/critic', self.name + '/critic')

    def start(self, setting=0):
        self.env = RunEnv(visualize=True)
        self.setting = setting

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(
            self.sess, next_state_batch)
        q_value_batch = self.critic_network.target_q(self.sess,
                                                     next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(
            self.sess, state_batch)
        q_gradient_batch = self.critic_network.gradients(
            self.sess, state_batch, action_batch_for_gradients)

        self.actor_network.train(self.sess, q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target(self.sess)
        self.critic_network.update_target(self.sess)

    def save_model(self, saver, episode):
        #if self.episode % 10 == 1:
        if self.name == 'worker_0':
            saver.save(self.sess,
                       self.model_path + "/model-" + str(episode) + ".ckpt")

    def noise_action(self, state, decay):
        # Select action a_t according to the current policy and exploration noise which gradually vanishes
        action = self.actor_network.action(self.sess, state)
        return action + self.exploration_noise.noise() * decay

    def action(self, state):
        action = self.actor_network.action(self.sess, state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the replay buffer reaches REPLAY_START_SIZE, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE and self.training:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def work(self, coord, saver):
        if self.training:
            episode_count = self.sess.run(self.global_episodes)
        else:
            episode_count = 0
        wining_episode_count = 0
        total_steps = 0
        print("Starting worker_" + str(self.number))

        with self.sess.as_default(), self.sess.graph.as_default():
            #not_start_training_yet = True
            while not coord.should_stop():
                returns = []
                rewards = []
                episode_reward = 0

                if np.random.rand() < 0.9:  # change Aug20: stochastically apply noise
                    noisy = True
                    self.decay -= 1. / self.explore
                else:
                    noisy = False

                self.sess.run(self.update_local_ops_actor)
                self.sess.run(self.update_local_ops_critic)

                state = self.env.reset(difficulty=self.setting)
                #print(observation)
                s = process_frame(state)

                print "episode:", episode_count
                # Train

                for step in xrange(self.env.spec.timestep_limit):
                    state = process_frame(state)
                    if noisy:
                        action = np.clip(
                            self.noise_action(state, np.maximum(self.decay,
                                                                0)), 0.0, 1.0
                        )  # change Aug20, decay noise (no noise after ep>=self.explore)
                    else:
                        action = self.action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    #print('state={}, action={}, reward={}, next_state={}, done={}'.format(state, action, reward, next_state, done))
                    next_state = process_frame(next_state)
                    self.perceive(state, action, reward * 100, next_state,
                                  done)
                    state = next_state
                    episode_reward += reward
                    if done:
                        break

                if episode_count % 5 == 0:
                    print "episode reward:", episode_reward

                # Testing:
                #if episode % 1 == 0:
                if self.name == 'worker_0' and episode_count % 50 == 0 and episode_count > 1:  # change Aug19
                    self.save_model(saver, episode_count)
                    total_return = 0
                    ave_reward = 0
                    for i in xrange(TEST):
                        state = self.env.reset()
                        reward_per_step = 0
                        for j in xrange(self.env.spec.timestep_limit):
                            action = self.action(
                                process_frame(state))  # direct action for test
                            state, reward, done, _ = self.env.step(action)
                            total_return += reward
                            # running average of the per-step reward in this test episode
                            reward_per_step += (reward -
                                                reward_per_step) / (j + 1)
                            if done:
                                break
                        ave_reward += reward_per_step

                    ave_return = total_return / TEST
                    ave_reward = ave_reward / TEST
                    returns.append(ave_return)
                    rewards.append(ave_reward)

                    print 'episode:', episode_count, 'Evaluation Average Return:', ave_return, 'Evaluation Average Reward:', ave_reward

                if self.name == 'worker_0' and self.training:
                    self.sess.run(self.increment)
                episode_count += 1

            # All done; stop trial and confirm exit
            print('Done ' + self.name)
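
The Worker above synchronises its local actor/critic with the global networks through update_target_graph('global/actor', self.name + '/actor'), a helper that is not in the listing. A sketch of the usual A3C-style implementation, assuming the variables live under tf.variable_scope names matching those strings:

import tensorflow as tf

def update_target_graph(from_scope, to_scope):
    # Assign every trainable variable in from_scope to the variable at the
    # same position in to_scope (relies on identical construction order).
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(from_var)
            for from_var, to_var in zip(from_vars, to_vars)]
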
Example #6
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env, DIRECTORY):
        self.batch_size = BATCH_SIZE
        self.replay_start_size = REPLAY_START_SIZE  # self.sub_batch_size = BATCH_SIZE / n_gpu

        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))

        self.trace_length = TRACE_LENGTH
        self.temp_abstract = TEMP_ABSTRACT
        self.actor_network = ActorNetwork(self.sess, BATCH_SIZE,
                                          self.state_dim, self.action_dim,
                                          self.temp_abstract, DIRECTORY)
        self.critic_network = CriticNetwork(self.sess, BATCH_SIZE,
                                            self.state_dim, self.action_dim,
                                            self.temp_abstract, DIRECTORY)

        # initialize replay buffer
        max_len_trajectory = self.environment.spec.timestep_limit + 1  # trace_length
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                          max_len_trajectory,
                                          self.actor_network.last_epi)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        ###
        self.diff = 0.
        self.discounting_mat_dict = {}
        ###

    def state_initialiser(self, shape, mode='g'):
        if mode == 'z':  #Zero
            initial = np.zeros(shape=shape)
        elif mode == 'g':  #Gaussian
            # initial = stats.truncnorm.rvs(a=-0.02/0.01,b=0.02/0.01,loc=0.,scale=0.01,size=shape)
            initial = np.random.normal(loc=0.,
                                       scale=1. / float(shape[1]),
                                       size=shape)
        else:  # May do some adaptive initialiser can be built in later
            raise NotImplementedError
        return initial

    def train(self, time_step):  #,time_step):
        ###1) Get-batch data for opt
        minibatch, trace_length = self.replay_buffer.get_batch(
            self.batch_size, self.trace_length,
            time_step)  #, self.trace_length)
        try:
            state_trace_batch = np.stack(minibatch[:, :, 2].ravel()).reshape(
                self.batch_size, trace_length, self.state_dim)
            action_trace_batch = np.stack(minibatch[:, :, 3].ravel()).reshape(
                self.batch_size, trace_length, self.action_dim)

            next_state_batch = np.stack(minibatch[:, -1, 6].ravel()).reshape(
                self.batch_size, 1, self.state_dim)
            next_state_trace_batch = np.concatenate(
                [state_trace_batch, next_state_batch], axis=1)

            reward_trace_batch = np.stack(minibatch[:, :, 4].ravel()).reshape(
                self.batch_size, trace_length, 1)
            done_trace_batch = np.stack(minibatch[:, :, 7].ravel()).reshape(
                self.batch_size, trace_length, 1)

        except Exception as e:
            print(str(e))
            raise

        ###2) Painfully initialise the LSTMs' initial memories: not super-efficient, but avoids errors from TF's None-type zero-state problem
        init_actor_hidden1_cORm_batch = self.state_initialiser(
            shape=(self.batch_size, self.actor_network.rnn_size), mode='z')
        actor_init_h_batch = (
            init_actor_hidden1_cORm_batch, init_actor_hidden1_cORm_batch
        )  #((init_hidden1_cORm_batch,init_hidden1_cORm_batch),(init_actor_hidden2_cORm_batch,init_actor_hidden2_cORm_batch))

        init_critic_hidden1_cORm_batch = self.state_initialiser(
            shape=(self.batch_size, self.critic_network.rnn_size), mode='z')
        critic_init_h_batch = (
            init_critic_hidden1_cORm_batch, init_critic_hidden1_cORm_batch
        )  #,(init_critic_hidden3_cORm_batch,init_critic_hidden3_cORm_batch))
        ###

        self.dt_list = np.zeros(shape=(15, ))
        self.dt_list[-1] = time.time()
        if trace_length <= OPT_LENGTH:
            target_actor_init_h_batch = actor_init_h_batch
            target_critic_init_h_batch = critic_init_h_batch
            pass
        else:
            ### memory stuff
            actor_init_h_batch = self.actor_network.action(
                state_trace_batch[:, :-OPT_LENGTH, :],
                actor_init_h_batch,
                mode=1)
            target_actor_init_h_batch = actor_init_h_batch
            critic_init_h_batch = self.critic_network.evaluation(
                state_trace_batch[:, :-OPT_LENGTH, :],
                action_trace_batch[:, :-OPT_LENGTH, :],
                critic_init_h_batch,
                mode=1)
            target_critic_init_h_batch = critic_init_h_batch

            state_trace_batch = state_trace_batch[:, -OPT_LENGTH:, :]
            next_state_trace_batch = next_state_trace_batch[:, -(OPT_LENGTH +
                                                                 1):, :]
            action_trace_batch = action_trace_batch[:, -OPT_LENGTH:, :]
            reward_trace_batch = reward_trace_batch[:, -OPT_LENGTH:, :]
            done_trace_batch = done_trace_batch[:, -OPT_LENGTH:, :]
        self.dt_list[0] = time.time() - np.sum(self.dt_list)

        ###3) Obtain target output
        next_action_batch = self.actor_network.target_action(
            next_state_trace_batch,
            init_temporal_hidden_cm_batch=target_actor_init_h_batch)
        self.dt_list[1] = time.time() - np.sum(self.dt_list)
        next_action_trace_batch = np.concatenate(
            [action_trace_batch,
             np.expand_dims(next_action_batch, axis=1)],
            axis=1)
        self.dt_list[2] = time.time() - np.sum(self.dt_list)
        target_lastQ_batch = self.critic_network.target_q_trace(
            next_state_trace_batch,
            next_action_trace_batch,
            init_temporal_hidden_cm_batch=target_critic_init_h_batch)
        self.dt_list[3] = time.time() - np.sum(self.dt_list)

        # Control the length of time-step for gradient
        if trace_length <= OPT_LENGTH:
            update_length = np.minimum(
                trace_length,
                OPT_LENGTH // 1)  #//denom: 2(opt1) #1(opt0) #OPT_LENGTH(opt2)
        else:
            update_length = OPT_LENGTH // 1  #//denom: 2(opt1) #1(opt0) #OPT_LENGTH(opt2)

        target_lastQ_batch_masked = target_lastQ_batch * (
            1. - done_trace_batch[:, -1])
        rQ = np.concatenate([
            np.squeeze(reward_trace_batch[:, -update_length:], axis=-1),
            target_lastQ_batch_masked
        ],
                            axis=1)
        self.dt_list[4] = time.time() - np.sum(self.dt_list)

        try:
            discounting_mat = self.discounting_mat_dict[update_length]
        except KeyError:
            discounting_mat = np.zeros(shape=(update_length,
                                              update_length + 1),
                                       dtype=np.float)
            for i in range(update_length):
                discounting_mat[i, :i] = 0.
                discounting_mat[i,
                                i:] = GAMMA**np.arange(0.,
                                                       -i + update_length + 1)
            discounting_mat = np.transpose(discounting_mat)
            self.discounting_mat_dict[update_length] = discounting_mat
        try:
            y_trace_batch = np.expand_dims(np.matmul(rQ, discounting_mat),
                                           axis=-1)
        except Exception as e:
            print('?')
            raise
        self.dt_list[5] = time.time() - np.sum(self.dt_list)

        ###4)Train Critic: get next_action, target_q, then optimise
        critic_grad = self.critic_network.train(
            y_trace_batch,
            update_length,
            state_trace_batch,
            action_trace_batch,
            init_temporal_hidden_cm_batch=critic_init_h_batch)
        self.dt_list[6] = time.time() - np.sum(self.dt_list)

        ###5) Train Actor: with the critic updated, compute dQ/da, then sess.run(dQ/da * da/dParam_actor) and optimise the actor
        for i in range(update_length):
            actor_init_h_batch_trace = (np.expand_dims(actor_init_h_batch[0],
                                                       axis=1),
                                        np.expand_dims(actor_init_h_batch[1],
                                                       axis=1))
            critic_init_h_batch_trace = (np.expand_dims(critic_init_h_batch[0],
                                                        axis=1),
                                         np.expand_dims(critic_init_h_batch[1],
                                                        axis=1))
            if i == 0:
                actor_init_h_batch_stack = actor_init_h_batch_trace
                critic_init_h_batch_stack = critic_init_h_batch_trace
            else:
                actor_init_h_batch_stack = (np.concatenate(
                    (actor_init_h_batch_stack[0], actor_init_h_batch_trace[0]),
                    axis=1),
                                            np.concatenate(
                                                (actor_init_h_batch_stack[1],
                                                 actor_init_h_batch_trace[1]),
                                                axis=1))
                critic_init_h_batch_stack = (
                    np.concatenate((critic_init_h_batch_stack[0],
                                    critic_init_h_batch_trace[0]),
                                   axis=1),
                    np.concatenate((critic_init_h_batch_stack[1],
                                    critic_init_h_batch_trace[1]),
                                   axis=1))
            action_trace_batch_for_gradients, actor_init_h_batch = self.actor_network.action_trace(
                np.expand_dims(state_trace_batch[:, i], 1),
                init_temporal_hidden_cm_batch=actor_init_h_batch)
            critic_init_h_batch = self.critic_network.evaluation_trace(
                np.expand_dims(state_trace_batch[:, i], 1),
                np.expand_dims(action_trace_batch[:, i], 1),
                init_temporal_hidden_cm_batch=critic_init_h_batch)
            if i == 0:
                action_trace_batch_for_gradients_stack = action_trace_batch_for_gradients
            else:
                action_trace_batch_for_gradients_stack = np.concatenate(
                    (action_trace_batch_for_gradients_stack,
                     action_trace_batch_for_gradients),
                    axis=1)

        self.dt_list[7] = time.time() - np.sum(self.dt_list)
        state_trace_batch_stack = np.reshape(
            state_trace_batch,
            (self.batch_size * update_length, 1, self.state_dim))
        action_trace_batch_stack = np.reshape(
            action_trace_batch,
            (self.batch_size * update_length, 1, self.action_dim))
        action_trace_batch_for_gradients_stack = np.reshape(
            action_trace_batch_for_gradients_stack,
            (self.batch_size * update_length, 1, self.action_dim))
        actor_init_h_batch_stack = (np.reshape(
            actor_init_h_batch_stack[0],
            (self.batch_size * update_length, self.actor_network.rnn_size)),
                                    np.reshape(
                                        actor_init_h_batch_stack[1],
                                        (self.batch_size * update_length,
                                         self.actor_network.rnn_size)))
        critic_init_h_batch_stack = (np.reshape(
            critic_init_h_batch_stack[0],
            (self.batch_size * update_length, self.critic_network.rnn_size)),
                                     np.reshape(
                                         critic_init_h_batch_stack[1],
                                         (self.batch_size * update_length,
                                          self.critic_network.rnn_size)))

        q_gradient_trace_batch = self.critic_network.gradients(
            1,
            state_trace_batch_stack,
            action_trace_batch_for_gradients_stack,
            init_temporal_hidden_cm_batch=critic_init_h_batch_stack)
        self.dt_list[8] = time.time() - np.sum(self.dt_list)

        # Update the actor policy using the sampled gradient:
        actor_grad = self.actor_network.train(
            q_gradient_trace_batch,
            1,
            state_trace_batch_stack,
            action_trace_batch_stack,
            init_temporal_hidden_cm_batch=actor_init_h_batch_stack)
        self.dt_list[9] = time.time() - np.sum(self.dt_list)

        # Update the target networks via EMA & Indicators
        # self.critic_network.update_target()
        self.dt_list[10] = time.time() - np.sum(self.dt_list)
        # self.actor_network.update_target()
        self.dt_list[11] = time.time() - np.sum(self.dt_list)

        # actor_diff = self.actor_network.get_diff()
        self.dt_list[12] = time.time() - np.sum(self.dt_list)
        # critic_diff = self.critic_network.get_diff()
        self.dt_list[13] = time.time() - np.sum(self.dt_list)

        self.dt_list = np.delete(self.dt_list, -1)
        return actor_grad, critic_grad,  # actor_diff, actor_grad, critic_diff, critic_grad

    def action(self, state_trace, init_hidden_cm, epi, noisy=True):
        # Select action a_t according to the current policy and exploration noise
        action, last_hidden_cm = self.actor_network.action([state_trace],
                                                           init_hidden_cm,
                                                           mode=2)
        if noisy:
            noise = self.exploration_noise.noise()  #epi)
            return action + noise, last_hidden_cm  #, dt#, np.linalg.norm(noise)
        else:
            return action, last_hidden_cm

    def evaluation(self, state_trace, action_trace, action_last,
                   init_hidden_cm):
        return self.critic_network.evaluation([state_trace], [action_trace],
                                              action_last,
                                              init_hidden_cm,
                                              mode=2)  #q_value, last_hidden_cm

    # def perceive(self,actor_init_hidden_cm,critic_last_hidden_cm,state,action,reward,next_state,done,time_step,epi):
    def perceive(self, state, action, reward, next_state, done, time_step,
                 epi):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        # self.replay_buffer.add(actor_init_hidden_cm,critic_last_hidden_cm,state,action,reward,next_state,done,epi)
        done = float(done)
        self.replay_buffer.add(state, action, reward, next_state, done, epi,
                               time_step)

        # Store transitions until the replay buffer reaches REPLAY_START_SIZE, then start training
        if (self.replay_buffer.num_experiences > REPLAY_START_SIZE):
            # Non-zero diff should be found
            self.actor_grad, self.critic_grad = self.train(time_step)
            # self.actor_diff, self.actor_grad, self.critic_diff, self.critic_grad = self.train(time_step)
        else:
            # Zero diff as is not trained
            # self.actor_diff = 0.
            self.actor_grad = 0.
            # self.critic_diff = 0.
            self.critic_grad = 0.

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
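
The discounting matrix cached in discounting_mat_dict turns the row vector rQ = [r_1, ..., r_L, Q_last] into n-step targets with a single matrix product, i.e. y_t = r_t + GAMMA*r_{t+1} + ... + GAMMA^(L-t)*Q_last. A small worked sketch for update_length = 3 (the GAMMA value is assumed) that reproduces the construction used in train():

import numpy as np

GAMMA = 0.99  # assumed value of the constant used above
L = 3         # update_length

# Same construction as in train(): row i holds gamma^0..gamma^(L-i) from column i on
mat = np.zeros((L, L + 1))
for i in range(L):
    mat[i, i:] = GAMMA ** np.arange(0.0, L + 1 - i)
mat = mat.T                                      # shape (L+1, L)

rewards = np.array([[1.0, 2.0, 3.0]])            # r_1..r_L for one trajectory
last_q = np.array([[10.0]])                      # bootstrap Q at the end of the trace
rQ = np.concatenate([rewards, last_q], axis=1)   # shape (1, L+1)

y = rQ @ mat                                     # shape (1, L), one target per trace step
# y[0, 0] == 1 + GAMMA*2 + GAMMA**2*3 + GAMMA**3*10
# y[0, 1] ==     2 + GAMMA*3 + GAMMA**2*10
# y[0, 2] ==         3 + GAMMA*10
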
Example #7
class DDPG:
    """docstring for DDPG"""
    def __init__(self):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 12
        self.action_dim = 10
        self.has_kicked = False
        self.laststep_haskicked = False
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        self.saver = tf.train.Saver(max_to_keep=1)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # print(minibatch)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        # print(q_value_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        with open('/home/ruizhao/Desktop/a.txt', 'a') as f:
            print("action_batch[0]", file=f)
            print(action_batch[0], file=f)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        with open('/home/ruizhao/Desktop/a.txt', 'a') as f:
            print("q_gradient_batch[0]", file=f)
            print(q_gradient_batch[0], file=f)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action2(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def noise_action(self, state):
        action = self.actor_network.action(state)
        random_action = np.zeros(10, float)
        random_action[random.randint(0, 3)] = 1
        random_action[4] = random.uniform(-100, 100)  #DASH POWER
        random_action[5] = random.uniform(-180, 180)  #DASH DEGREES
        random_action[6] = random.uniform(-180, 180)  #TURN DEGREES
        random_action[7] = random.uniform(-180, 180)  #TACKLE DEGREES
        random_action[8] = random.uniform(0, 100)  #KICK POWER
        random_action[9] = random.uniform(-180, 180)  #KICK DEGREES
        if np.random.uniform() < EPSILON:
            return action
        else:
            return random_action

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the replay buffer reaches REPLAY_START_SIZE, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
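
For context, a minimal sketch of the outer loop that typically drives these agent classes through noise_action()/perceive(). It assumes a Gym-style environment with a 4-tuple step() return, as the examples do; the environment name and the episode/step counts are placeholders, and the constructor call matches the env-taking variants (e.g. Example #2), not Example #7.

import gym

env = gym.make('Pendulum-v0')   # placeholder environment
agent = DDPG(env)

for episode in range(1000):
    state = env.reset()
    for step in range(200):
        action = agent.noise_action(state)                        # exploratory action
        next_state, reward, done, _ = env.step(action)
        agent.perceive(state, action, reward, next_state, done)   # store + maybe train
        state = next_state
        if done:
            break
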
Example #8
class DDPG(object):
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.epsilon_expert_range = (1.0, 0.1)
        self.epsilon_expert = self.epsilon_expert_range[0]
        self.epsilon_random_range = (0.1, 0.01)
        self.epsilon_random = self.epsilon_random_range[0]
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        self.state_dim = 16
        # self.action_dim = env.action_space.shape[0]
        self.action_dim = 3
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        # self.exploration_noise = OUNoise()
        self.OU = OU()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            path = checkpoint.model_checkpoint_path
            self.saver.restore(self.sess, path)
            self.time_step = int(path[path.rindex('-') + 1:])
            self.epsilon_expert -= (
                self.epsilon_expert_range[0] -
                self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_expert = max(self.epsilon_expert,
                                      self.epsilon_expert_range[1])
            self.epsilon_random -= (
                self.epsilon_random_range[0] -
                self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_random = max(self.epsilon_random,
                                      self.epsilon_random_range[1])
            logger.warn(
                "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
                % (path, self.time_step, self.epsilon_expert,
                   self.epsilon_random))
        else:
            logger.warn("Could not find old network weights")

        self.critic_cost = 0

    def train(self):
        self.time_step = self.time_step + 1
        self.epsilon_expert -= (self.epsilon_expert_range[0] -
                                self.epsilon_expert_range[1]) / EXPLORE_COUNT
        self.epsilon_expert = max(self.epsilon_expert,
                                  self.epsilon_expert_range[1])
        self.epsilon_random -= (self.epsilon_random_range[0] -
                                self.epsilon_random_range[1]) / EXPLORE_COUNT
        self.epsilon_random = max(self.epsilon_random,
                                  self.epsilon_random_range[1])
        logger.debug(
            "step: %d, epsilon_expert: %s, epsilon_random: %s" %
            (self.time_step, self.epsilon_expert, self.epsilon_random))
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
            # if done_batch[i]:
            #     y_batch.append(reward_batch[i])
            # else :
            #     y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_cost = self.critic_network.train(y_batch, state_batch,
                                                     action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    # def noise_action(self,state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = self.exploration_noise.noise(action)
    #     noise_action = action + noise
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    # def noise_action(self,state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = np.zeros(self.action_dim)
    #     noise[0] = self.epsilon * self.OU.function(action[0], 0.5, 1.00, 0.10)
    #     noise[1] = self.epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10)
    #     noise[2] = self.epsilon * self.OU.function(action[2], 0.5, 1.00, 0.10)
    #     noise_action = action + noise
    #     logger.debug("action: %s, noise: %s" % (action, noise))
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        return action

    def opposite_action(self, state):
        logger.debug("state: %s" % (state))
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        action[0] = 1 - action[0]
        logger.debug("opposite action: %s" % (action))
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # self.time_step = self.time_step + 1

        # Store transitions until the replay buffer reaches REPLAY_START_SIZE, then start training
        if self.replay_buffer.count() >= REPLAY_START_SIZE:
            # logger.debug("train...")
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'DDPG')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
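
Example #8 recovers time_step from the checkpoint filename and then linearly anneals epsilon_expert and epsilon_random toward their minimums, both at load time and once per train() call. The same schedule can be expressed as a small helper; the EXPLORE_COUNT value in the usage comments is an assumption.

def annealed_epsilon(eps_range, time_step, explore_count):
    # Linear decay from eps_range[0] down to eps_range[1] over explore_count steps
    start, end = eps_range
    eps = start - (start - end) * time_step / explore_count
    return max(eps, end)

# e.g. with epsilon_expert_range = (1.0, 0.1) and EXPLORE_COUNT = 100000:
#   annealed_epsilon((1.0, 0.1), 0,      100000) -> 1.0
#   annealed_epsilon((1.0, 0.1), 50000,  100000) -> 0.55
#   annealed_epsilon((1.0, 0.1), 200000, 100000) -> 0.1
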
Example #9
File: ddpg.py Project: ivychill/ltr
class DDPG:
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        # self.state_dim = env.observation_space.shape[0] * 2
        self.action_dim = env.action_space.shape[0]

        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        self.exploration_noise = OUNoise()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" %
                                  (checkpoint.model_checkpoint_path))
        else:
            my_config.logger.error("Could not find old network weights")

    def train(self):
        # my_config.logger.debug("......enter train......")
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise(action)
        # if random.random() <= 0.5:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75, 0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5])
        # else:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75])
        noise_action = action + noise
        clipped_noise_action = np.clip(noise_action, 0, 1)
        # if (self.time_step < 5):
        #     my_config.logger.debug("action: %s, noise: %s, clip: %s" % (action, noise, clipped_noise_action))
        return clipped_noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        self.time_step = self.time_step + 1

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        # my_config.logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'ltr')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
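
The ReplayBuffer used by these DDPG examples is not shown. Below is a minimal deque-backed sketch that matches the interface exercised above, i.e. add(state, action, reward, next_state, done), get_batch(n) and count(); the actual project implementations may differ:

import random
from collections import deque

class ReplayBuffer:
    """Minimal sketch of the buffer interface used above (not the project's own code)."""
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        # Store one transition; the deque drops the oldest entry when full.
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # Uniformly sample a minibatch of transitions.
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def count(self):
        return len(self.buffer)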
Example #10
0
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
        self.angular_noise = OUNoise(1, 0, 0.6, 0.8)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state, epsilon):
        action = self.actor_network.action(state)
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.linear_noise.noise()
        noise_t[1] = epsilon * self.angular_noise.noise()
        action = action + noise_t
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)
        #print(a_linear, a_angular)

        return [a_linear, a_angular]

    def action(self, state):
        action = self.actor_network.action(state)
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)

        return [a_linear, a_angular]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        if done:
            self.linear_noise.reset()
            self.angular_noise.reset()

        return self.time_step
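
The update_target() calls in these examples are implemented inside ActorNetwork and CriticNetwork (not shown) as the standard DDPG soft update, target <- tau * source + (1 - tau) * target. A minimal TF1-style sketch; the variable lists, function name and TAU value are illustrative assumptions rather than the projects' actual code:

import tensorflow as tf

TAU = 0.001  # soft-update rate (assumed value)

def make_soft_update_op(source_vars, target_vars, tau=TAU):
    # Blend each target variable toward its online counterpart:
    # target <- tau * source + (1 - tau) * target
    updates = [tf.assign(t, tau * s + (1.0 - tau) * t)
               for s, t in zip(source_vars, target_vars)]
    return tf.group(*updates)

# Typical use inside a network class (sketch):
# self.target_update = make_soft_update_op(self.net_vars, self.target_net_vars)
# def update_target(self):
#     self.sess.run(self.target_update)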
Example #11
0
class ddpg:
    def __init__(self, env_name, sess, state_dim, action_dim, models_dir,
                 img_dim):
        self.name = 'DDPG'
        self.env_name = env_name
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.img_dim = img_dim
        self.models_dir = models_dir

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = sess

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim, self.img_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim, self.img_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        self.saver = tf.train.Saver()

    def train(self):
        minibatch = self.replay_buffer.getBatch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        img_batch = np.asarray([data[5] for data in minibatch])
        next_img_batch = np.asarray([data[6] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(
            next_state_batch, next_img_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch,
                                                     next_img_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        critic_cost = self.critic_network.train(y_batch, state_batch,
                                                action_batch, img_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(
            state_batch, img_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients, img_batch)

        self.actor_network.train(q_gradient_batch, state_batch, img_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
        return critic_cost

    def save_network(self, step):
        self.saver.save(self.sess,
                        self.models_dir + self.env_name + '-network-ddpg.ckpt',
                        global_step=step)

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.models_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

    '''
    def action(self,state):

        action = self.actor_network.action(state)

        action[0][0] = np.clip( action[0][0], -1 , 1 )
        action[0][1] = np.clip( action[0][1], 0 , 1 )
        action[0][2] = np.clip( action[0][2], 0 , 1 )
        #print "Action:", action
        return action[0]

    def noise_action(self,state,epsilon):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        print action.shape
        print "Action_No_Noise:", action
        noise_t = np.zeros([1,self.action_dim])
        noise_t[0][0] = epsilon * self.OU.function(action[0][0],  0.0 , 0.60, 0.80)
        noise_t[0][1] = epsilon * self.OU.function(action[0][1],  0.5 , 1.00, 0.10)
        noise_t[0][2] = epsilon * self.OU.function(action[0][2], -0.1 , 1.00, 0.05)
        
        action = action+noise_t
        action[0][0] = np.clip( action[0][0], -1 , 1 )
        action[0][1] = np.clip( action[0][1], 0 , 1 )
        action[0][2] = np.clip( action[0][2], 0 , 1 )
        
        print "Action_Noise:", action
        return action[0]
    '''

    def action(self, state, img):
        action = self.actor_network.action(state, img)

        action[0] = np.clip(action[0], -1, 1)
        # action[1] = np.clip( action[1], 0 , 1 )
        # action[2] = np.clip( action[2], 0 , 1 )

        return action

    def noise_action(self, state, epsilon, img):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state, img)
        noise_t = np.zeros(self.action_dim)

        if self.time_step < 100000:
            noise_t[0] = epsilon * ornstein_uhlenbeck_process(
                action[0], 0.0, 0.60, 0.80)
            # noise_t[1] = epsilon * ornstein_uhlenbeck_process(action[1],  0.5 , 1.00, 0.10)
            # noise_t[2] = epsilon * ornstein_uhlenbeck_process(action[2], -0.1 , 1.00, 0.05)
        elif self.time_step < 200000:
            if np.random.random() < 0.1:
                noise_t[0] = 0.1 * ornstein_uhlenbeck_process(
                    action[0], 0.0, 0.60, 0.80)

        action = action + noise_t
        action[0] = np.clip(action[0], -1, 1)
        # action[1] = np.clip( action[1], 0 , 1)
        # action[2] = np.clip( action[2], 0 , 1)

        return action

    def perceive(self, state, action, reward, next_state, done, img, next_img):
        if not (math.isnan(reward)):
            self.replay_buffer.add(state, action, reward, next_state, done,
                                   img, next_img)
        self.time_step = self.time_step + 1

        # Return critic cost
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            return self.train()
        else:
            return 0
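
The ornstein_uhlenbeck_process helper called in noise_action above is not shown. A common stateless form takes the current action component x, long-run mean mu, mean-reversion rate theta and noise scale sigma; this is only a sketch, and the project's own helper may differ:

import numpy as np

def ornstein_uhlenbeck_process(x, mu, theta, sigma):
    # One OU-style perturbation: pull x toward mu and add Gaussian noise.
    return theta * (mu - x) + sigma * np.random.randn()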
Example #12
0
class DDPG_TF:
    """docstring for DDPG"""
    def __init__(self, env, loadfilename=None, printVars=False):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer and exploration noise
        # (both are used by train/perceive/noise_action below but were missing
        # from the original __init__; constructor signatures follow the other
        # examples above)
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        # print('init complete')
        self.all_vars = tf.global_variables()
        if printVars:
            for v in self.all_vars:
                print(v.name.ljust(30), v.shape)

        self.saver = tf.train.Saver(self.all_vars)
        if loadfilename is not None:
            self.saver.restore(self.sess, loadfilename)
            # print('restore complete')
        
    def train(self):
        # print("train step", self.time_step)
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def actions(self, states):
        actions = self.actor_network.actions_no_training(states)
        return actions

    def target_actions(self, states):
        actions = self.actor_network.target_actions(states)
        return actions

    def value(self, states):
        actions = self.actor_network.actions_no_training(states)
        values = self.critic_network.q_value(states, actions)
        return values

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
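
DDPG_TF restores a trained policy from a checkpoint and is mainly queried for actions and values. A minimal evaluation sketch, assuming a Gym-style env; the checkpoint path is illustrative, not taken from the original project:

agent = DDPG_TF(env, loadfilename='checkpoints/ddpg.ckpt', printVars=False)
state = env.reset()
done = False
total_reward = 0.0
while not done:
    action = agent.action(state)              # greedy action, no exploration noise
    state, reward, done, _ = env.step(action)
    total_reward += reward
print('episode return:', total_reward)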
Example #13
0
class DDPG:
    def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma,
                 tau, batch_size, n_train, n_episode):
        # Gym environment
        self.env = env

        env_flattened = gym.wrappers.FlattenDictWrapper(
            env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

        # Get space sizes
        self.state_dim = env_flattened.observation_space.shape[0]
        #self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Get replay buffer and function get a batch from it
        self.replay_buffer = replay_buffer
        self.sample_batch = sample_batch

        self.sess = tf.InteractiveSession()

        # Hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_train = n_train
        self.n_episode = n_episode

        # Initialize networks
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)

        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        batch = self.sample_batch(self.batch_size)

        state_batch = np.asarray([data[0] for data in batch])
        action_batch = np.asarray([data[1] for data in batch])
        reward_batch = np.asarray([data[2] for data in batch])
        next_state_batch = np.asarray([data[3] for data in batch])
        done_batch = np.asarray([data[4] for data in batch])

        next_action_batch = self.actor.target_actions(next_state_batch)
        q_value_batch = self.critic.target_q(next_state_batch,
                                             next_action_batch)
        y_batch = []
        for i in range(len(batch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + self.gamma * q_value_batch[i])
        y_batch = np.resize(y_batch, [self.batch_size, 1])
        # Update critic by minimizing the loss L
        self.critic.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor.actions(state_batch)
        q_gradient_batch = self.critic.gradients(state_batch,
                                                 action_batch_for_gradients)

        self.actor.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor.update_target()
        self.critic.update_target()

    def noise_action(self, state):
        action = self.actor.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        return self.actor.action(state)

    def reset_noise(self):
        self.exploration_noise.reset()

    def save_policy(self, save_path):
        self.actor.save_network(save_path)
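
Unlike the earlier examples, this variant receives the replay buffer and a sampling function as constructor arguments. A minimal wiring sketch, assuming the deque-backed buffer interface sketched earlier; the hyper-parameter values are illustrative assumptions:

replay_buffer = ReplayBuffer(1000000)          # buffer size is an assumed value
agent = DDPG(env,
             replay_buffer=replay_buffer,
             sample_batch=replay_buffer.get_batch,
             train_iter=50, gamma=0.98, tau=0.001,
             batch_size=128, n_train=40, n_episode=2000)

# During interaction (sketch):
# replay_buffer.add(state, action, reward, next_state, done)
# agent.train()  # one gradient step on actor and critic, sampled via sample_batch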