Example #1
def main(args):
    env = gym.make(args['env_name'])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]

    ddpg = DDPG(args, action_dim, max_action, state_dim, device)
    # Load the trained actor weights for evaluation
    trained_actor = torch.load(args['model_directory'], map_location=device)
    ddpg.actor.load_state_dict(trained_actor)

    timestep = 0
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()

        while True:
            action = ddpg.get_action(state)
            next_state, reward, done, info = env.step(action)
            env.render()
            episode_reward += reward
            state = next_state
            timestep += 1

            if done:
                print('episode: ', episode,
                      '   reward : %.3f' % (episode_reward), '    timestep :',
                      timestep)
                break
Example #2
def main():

    # Initialize the ANNs
    agent = DDPG()

    rospy.init_node("neuro_deep_planner", anonymous=False)

    ros_handler = ROSHandler()
    ros_handler.on_policy = False

    while not rospy.is_shutdown():

        # If we have a new msg we might have to execute an action and need to put the new experience in the buffer
        if ros_handler.new_msg():
            if not ros_handler.is_episode_finished:
                # Send back the action to execute
                ros_handler.publish_action(agent.get_action(ros_handler.state))

            # Save the past state and action plus the reward and new state in the replay buffer
            agent.set_experience(ros_handler.state, ros_handler.reward, ros_handler.is_episode_finished)

        elif ros_handler.new_setting():

            agent.noise_flag = ros_handler.noise_flag

        else:
            # Train the network!
            agent.train()
Example #3
def main():
    with tf.Session() as sess:
        while True:
            try:
                env = CarlaEnv()
                break
            except Exception as e:
                print(e)

        agent = Agent(sess=sess,
                      state_size=env.observation_space.shape[0],
                      action_size=env.action_space.shape[0])
        max_episodes = 1000
        max_steps = 1800

        for i in range(int(max_episodes)):

            state = env.reset()
            print(state.shape)
            ep_reward = 0
            ep_ave_max_q = 0
            # plt.clf()
            # if i:
            #     with open("ddpg_memory.pkl","wb") as hand:
            #         pickle.dump(replay_buffer,hand)
            #     actor.save_model()
            #     critic.save_model()
            #     print("Agent saved")

            for j in range(int(max_steps)):

                print("epoch: {}, step: {}".format(i, j))
                # env.render()

                # Added exploration noise
                # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
                action = agent.get_action(state)
                # a = controller(s[0],s[1],s[3])
                # a = [a]
                next_state, reward, done, info = env.step(action)
                print("reward: {}".format(reward))

                agent.remember(state, action, reward, done, next_state)

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                agent.train()
Example #4
def main():

    agent = DDPG()

    print "========================== finish ddpg init"
    rospy.init_node("robot_follower", anonymous=True)

    ros_handler = RosHandler()

    print "========================== start ros_handler"
    while not rospy.is_shutdown():
        if ros_handler.new_msg():
            if not ros_handler.end_of_episode:
                action = agent.get_action(ros_handler.state,
                                          ros_handler.old_action)
                ros_handler.publish_action(action)
            agent.set_experience(ros_handler.state, ros_handler.reward,
                                 ros_handler.end_of_episode)
        else:
            agent.train()
Example #5
def main():

    # Initialize the ANNs
    agent = DDPG()

    rospy.init_node("neuro_deep_planner", anonymous=False)

    ros_handler = ROSHandler()
    ros_handler.on_policy = False

    # For plotting
    currently_plotting = False
    goal_count = 0
    crash_count = 0
    start_time = 0

    # Make sure the directory for the plotting exists
    if not tf.gfile.Exists(PLOT_PATH):
        tf.gfile.MakeDirs(PLOT_PATH)
    f = open(PLOT_PATH + '/results', 'w')

    while not rospy.is_shutdown():

        # If we are plotting results we don't want to train and need to turn off noise!
        if PLOTTING and not currently_plotting and agent.training_step > 0 and \
                                agent.training_step % PLOT_INTERVALL == 0:
            currently_plotting = True
            agent.noise_flag = False
            start_time = rospy.get_time()

        if currently_plotting and rospy.get_time() - start_time > PLOT_TIME:
            # Plot the results
            string = str(agent.training_step) + ', ' + str(
                goal_count) + ', ' + str(crash_count) + '\n'
            f.write(string)

            # Reset all parameters
            currently_plotting = False
            agent.noise_flag = True
            goal_count = 0
            crash_count = 0

        # If we are plotting results we need to count reached goals and crashes
        if currently_plotting:
            # Count the positive and negative rewards
            if ros_handler.new_msg():
                if not ros_handler.is_episode_finished:
                    # Send back the action to execute
                    ros_handler.publish_action(
                        agent.get_action(ros_handler.state))
                elif ros_handler.reward == 1:
                    goal_count += 1
                elif ros_handler.reward == -1:
                    crash_count += 1

        # If we're not plotting results
        else:
            # If we have a new msg we might have to execute an action and need to put the new experience in the buffer
            if ros_handler.new_msg():
                if not ros_handler.is_episode_finished:
                    # Send back the action to execute
                    ros_handler.publish_action(
                        agent.get_action(ros_handler.state))

                # Save the past state and action plus the reward and new state in the replay buffer
                agent.set_experience(ros_handler.state, ros_handler.reward,
                                     ros_handler.is_episode_finished)

            elif ros_handler.new_setting():

                agent.noise_flag = ros_handler.noise_flag

            else:
                # Train the network!
                agent.train()
Example #6
with tf.device('/{}:0'.format('CPU')):
    agent = DDPG(actor=actor, critic=critic, exprep=exprep, noise=noise, action_bound=ACTION_RANGE)
sess.run(tf.global_variables_initializer())

for i in range(NUM_EPISODES):
    cur_state = env.reset()
    cum_reward = 0
    # tensorboard summary
    summary_writer = tf.summary.FileWriter('/tmp/pendulum-log-0' + '/train', graph=tf.get_default_graph())

    if (i % EVALUATE_EVERY) == 0:
        print('====evaluation====')
    for t in range(MAX_STEPS):
        if (i % EVALUATE_EVERY) == 0:
            # Evaluation episode: render and act without exploration noise
            env.render()
            action = agent.get_action(cur_state, sess)[0]
        else:
            # Training episode: exploration noise decays over the course of the run
            action = agent.get_action_noise(cur_state, sess, rate=(NUM_EPISODES - i) / NUM_EPISODES)[0]
        next_state, reward, done, info = env.step(action)
        if (i % EVALUATE_EVERY) == 0:
            print('cur_state: ', end='')
            print(cur_state)
            print('action: ', end='')
            print(action)
            print('reward: ' + str(reward))
            print('------------------------------------------------------')
        if done:
            cum_reward += reward
            agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done))
            print("Done! Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward))
            break
        cur_state = next_state
Example #7
class Environment:
    """ Train & simulate wrapper for Atari-DQN
    Args:
        params: dictionary of parameters
            memory_size : size of replay memory. 100000 needs almost 25GB memory, recommend reduce it if you need
            exploration_step : pure exploration step
            gamma : discount rate
            tau: parameter for soft update
            lr_actor: learning rate for actor network
            lr_critic: learning rate for critic network
        device_name : name of device(normally cpu:0 or gpu:0)
    """
    def __init__(self, params, device_name):
        self.env = gym.make('Pendulum-v0')
        self.ddpg = DDPG(input_dim=self.env.observation_space.shape[0],
                         action_dim=self.env.action_space.shape[0],
                         action_scale=(self.env.action_space.low[0],
                                       self.env.action_space.high[0]),
                         memory_size=params["memory_size"],
                         gamma=params["gamma"],
                         tau=params["tau"],
                         learning_rate_actor=params["lr_actor"],
                         learning_rate_critic=params["lr_critic"],
                         device_name=device_name)
        self.ddpg.build()
        self.ddpg.summary()

        self.random_process = OUNoise(size=self.env.action_space.shape[0])

        # total step operated
        self.i_step = 0

    def load(self, global_step="latest"):
        """ Load saved weights for ddpg
        Args:
            global_step : load specific step, if "latest" load latest one
        """
        self.ddpg.load(global_step)

    def save(self):
        """ Save current weight of ddpg layers
        """
        self.ddpg.save()

    def train(self,
              episode,
              max_step,
              minibatch_size,
              render=False,
              verbose=1,
              val_epi=5,
              saving=False):
        """run the game with training network
        Args:
            episode : number of train episodes
            max_step : maximum step for each episode
            minibatch_size : minibatch size for replay memory training
            render : whether to show game simulating graphic
            verbose : for which step it will print the loss and accuracy (and saving)
            val_epi : number of episode for validation
            saving: whether to save checkpoint or not
        """
        losses = []
        episode_return = []
        verbose_return = []
        episode_return_val = []

        tr = trange(episode, desc="")
        for i_episode in tr:
            return_episode = 0
            observation = self.env.reset()
            self.random_process.reset()

            for t in range(max_step):
                self.i_step += 1
                if render:
                    self.env.render()

                X = observation.astype(np.float32)
                action_policy = self.ddpg.get_action(tf.convert_to_tensor(X))
                action_policy += self.random_process.sample()
                action_policy = np.clip(action_policy,
                                        self.env.action_space.low[0],
                                        self.env.action_space.high[0])
                observation, reward, done, info = self.env.step(action_policy)
                return_episode += reward

                X_next = observation.astype(np.float32)
                self.ddpg.replay_memory.append(
                    (X, action_policy, reward, X_next, done))
                # training step
                if len(self.ddpg.replay_memory) > minibatch_size:
                    X_batch, action_batch, reward_batch, X_next_batch, done_batch = self.ddpg.replay_memory.get_batch(
                        minibatch_size)
                    loss_critic, loss_actor = self.ddpg.train(
                        X_batch, action_batch, reward_batch, X_next_batch,
                        done_batch)
                    losses.append((loss_critic, loss_actor))

                if done:
                    break

            episode_return.append(return_episode)
            verbose_return.append(return_episode)
            tr.set_description("%.4f" %
                               (sum(episode_return) / len(episode_return)))

            if i_episode == 0 or ((i_episode + 1) % verbose == 0):
                if len(self.ddpg.replay_memory) <= minibatch_size:
                    stage_tooltip = "EXPLORATION"
                    print(Fore.RED + "[EPISODE %3d / STEP %5d] - %s" %
                          (i_episode + 1, self.i_step, stage_tooltip))
                    print(Fore.GREEN + "Learned Step : %4d" %
                          (self.ddpg.global_step))
                    print(Fore.BLUE + "AVG   Return         : %.4f" %
                          (sum(verbose_return) / len(verbose_return)))
                    print(Fore.BLUE + "MAX   Return         : %.4f" %
                          (max(verbose_return)))
                    continue
                else:
                    stage_tooltip = "TRAINING"
                losses_critic = [l[0] for l in losses]
                losses_actor = [l[1] for l in losses]

                # validation
                returns = []
                for epi_val in range(val_epi):
                    return_episode_val = 0
                    observation = self.env.reset()

                    for t in range(max_step):
                        if render:
                            self.env.render()

                        action_policy = self.ddpg.get_action(
                            tf.convert_to_tensor(observation.astype(
                                np.float32)))
                        observation, reward, done, info = self.env.step(
                            action_policy)
                        return_episode_val += reward

                        if done:
                            # print(Fore.GREEN + "EPISODE %3d: REWARD: %s" % (i_episode, return_episode))
                            returns.append(return_episode_val)
                            break

                print(Fore.RED + "[EPISODE %3d / STEP %5d] - %s" %
                      (i_episode + 1, self.i_step, stage_tooltip))
                print(Fore.GREEN + "Learned Step : %4d" %
                      (self.ddpg.global_step))
                print(Fore.BLUE + "AVG   Return         : %.4f" %
                      (sum(verbose_return) / len(verbose_return)))
                print(Fore.BLUE + "MAX   Return         : %.4f" %
                      (max(verbose_return)))
                print(Fore.LIGHTYELLOW_EX + "AVG   LOSS Actor     :  %.4f" %
                      (sum(losses_actor) / len(losses_actor)))
                print(Fore.LIGHTYELLOW_EX + "AVG   LOSS Critic    :  %.4f" %
                      (sum(losses_critic) / len(losses_critic)))
                print(Fore.LIGHTRED_EX + "AVG VAL[%2d]   Return : %.4f" %
                      (val_epi, sum(returns) / len(returns)))
                print(Fore.LIGHTRED_EX + "MAX VAL[%2d]   Return : %.4f" %
                      (val_epi, max(returns)))
                verbose_return = []
                losses = []
                episode_return_val.append(sum(returns) / len(returns))

                if saving:
                    self.save()

                time.sleep(1)

        return episode_return

    def simulate(self, episode, max_step=1000, render=False):
        """Run the game with existing dqn network
        Args:
            episode : number of train episodes
            max_step : maximum step for each episode
            render : whether to show game simulating graphic
        """
        returns = []
        for i_episode in range(episode):
            return_episode = 0
            observation = self.env.reset()

            for t in range(max_step):
                if render:
                    self.env.render()

                action_policy = self.ddpg.get_action(
                    tf.convert_to_tensor(observation.astype(np.float32)))
                observation, reward, done, info = self.env.step(action_policy)
                return_episode += reward

                if done:
                    print(Fore.GREEN + "EPISODE %3d: REWARD: %s" %
                          (i_episode, return_episode))
                    returns.append(return_episode)
                    break

        print(Fore.RED + "AVG REWARD : %s" % (sum(returns) / len(returns)))
        print(Fore.BLUE + "MAX REWARD : %s" % (max(returns)))
Example #9
import tensorflow as tf
import gym
from ddpg import DDPG as Agent

env = gym.make("MountainCarContinuous-v0")
keel = Agent(env, 100, 10000, 0.99, (100, 100, 0.9), (100, 100, 0.9))

while True:
    starting_state = env.reset()
    action = keel.get_action(starting_state)
    # Take one environment step with the chosen action
    next_state, reward, done, info = env.step(action)
    keel.experience(starting_state, action, reward, done, next_state)
    keel.train()