Code Example #1
File: ppo.py Project: wwchung91/nao_rl
 def create_workers(self):
     """
     Initialize environments
     """
     self.workers = []
     for i in range(self.n_workers):
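         # Each worker gets its own simulation port (the second argument to nao_rl.make)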
         env = nao_rl.make(self.env_name, 19998 - i, headless=True)
         worker = Worker(env, self, i)
         worker.env.agent.connect(worker.env,
                                  worker.env.active_joints)  ### IMPROVE
         self.workers.append(worker)
     self.time = time.time()
Code Example #2
 def create_workers(self):
     """
     Initialize environments
     """
     for i in range(self.n_workers):
         print "\nCreating worker #{}...".format(i + 1)
         try:
             env = nao_rl.make(self.env_name, headless=self.render[i])
         except:
             env = gym.make(self.env_name)
         worker = Worker(env, 'Worker_{}'.format(i + 1), self)
         self.workers.append(worker)
Code Example #3
        """
        Run the test simulation without any learning algorithm for debugging purposes
        """

        t = 0
        while t < 30:
            self.done = False
            self.reset()
            fps = 30.
            while not self.done:
                # raw_input("Press Enter to continue...")
                action = self.action_space.sample()
                print(action)
                state, reward, self.done, _ = self.step(action)
                print('Current state:\n angles: {}'.format(state))
                print('Reward: {}'.format(reward))
                time.sleep(1 / fps)

            t += 1


if __name__ == "__main__":
    """
    If called as a script this will initialize the scene in an open vrep instance
    """

    # Environment and objects
    import nao_rl
    env = nao_rl.make('NaoBalancing', headless=False)
    env.run()
    nao_rl.destroy_instances()
Code Example #4
File: test.py Project: wwchung91/nao_rl
        log.load_from_file(nao_rl.settings.DATA + filename.split('.')[0] +
                           '.log')
        log.summary()
    except:
        print "Could not load the log file from '/data' directory"

    # Balancing
    # name = 'NaoBalancing_a3c_2019-01-11_10:52:46.cpkt'
    # name = 'NaoBalancing_a3c_2019-01-11_11:50:59.cpkt'
    # # Walking
    # name = 'walking.cpkt'
    # Tracking
    # name = 'NaoTracking_a3c_2019-01-11_12:01:41.cpkt'

    # Create environment
    env = nao_rl.make(env_name, headless=False)
    fps = 30.
    # Test Loop
    n = 0
    while n < n_attempts:
        total_reward = 0
        steps = 0
        done = False
        state = env.reset()
        # Test loop
        while not done:
            raw_input('ENTER TO CONTINUE...')
            action = np.clip(model.action(state), env.action_space.low,
                             env.action_space.high)
            # action = env.f()
            state, reward, done, _ = env.step(np.array(action))
Code Example #5
    def __init__(self,
                 env_name,
                 render,
                 plot,
                 n_workers=1,
                 max_episodes=10000,
                 episode_length=500,
                 update_every=10,
                 entropy_beta=.005,
                 gamma=.99,
                 actor_layers=[500, 300],
                 critic_layers=[500, 300],
                 actor_lr=.00005,
                 critic_lr=.0001):

        # Training parameters
        self.gamma = gamma
        self.beta = entropy_beta
        self.max_episodes = max_episodes
        self.episode_length = episode_length
        self.update_every = update_every
        self.n_workers = n_workers
        self.actor_layers = actor_layers
        self.critic_layers = critic_layers
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Synchronization
        self.algorithm = 'a3c'
        self.env_name = env_name
        self.stop = False
        self.total_steps = 0
        self.update_counter = 0
        self.current_episode = 0
        self.running_reward = []
        self.episode_reward = []
        self.time = None
        self.verbose = True
        self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

        # Rendering
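        # self.render[i] is later passed as the 'headless' flag when the worker
        # environments are created (see Code Example #2):
        # 0 = all headless, 1 = display only the first worker, 2 = display all workers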
        if render == 0:
            self.render = [True for _ in range(self.n_workers)]
        if render == 1:
            self.render = [True for _ in range(self.n_workers)]
            self.render[0] = False
        if render == 2:
            self.render = [False for _ in range(self.n_workers)]

        # Plotting
        self.plot = plot
        if self.plot:
            plt.ion()
            plt.figure(1)
            plt.plot()
            plt.xlabel('Episode')
            plt.ylabel('Running reward')
            plt.title('{} episode reward'.format(self.env_name))

        # Session and coordinator
        self.sess = tf.Session()
        self.tf_coordinator = tf.train.Coordinator()
        self.optimizer_actor = tf.train.RMSPropOptimizer(self.actor_lr,
                                                         name='RMSPropA')
        self.optimizer_critic = tf.train.RMSPropOptimizer(self.critic_lr,
                                                          name='RMSPropC')
        self.workers = []

        # Environment parameters
        print "Creating dummy environment to obtain the parameters..."
        try:
            env = nao_rl.make(env_name, headless=True)
        except:
            env = gym.make(env_name)
        self.n_states = env.observation_space.shape[0]
        self.n_actions = env.action_space.shape[0]
        self.action_bounds = [env.action_space.low, env.action_space.high]
        nao_rl.destroy_instances()
        del env

        self.initialize()
Code Example #6
 def _init():
     try:
         env = nao_rl.make(env_name)
     except:
         # Fall back to a standard gym environment, as in the other examples
         env = gym.make(env_name)
     return env
Code Example #7
                while not self.done and steps < 200:
                    action = self.action_space.sample()
                    # action = np.zeros(4)
                    state, reward, self.done, _ = self.step(action)
                    time.sleep(1 / fps)
                    steps += 1
                    ep_reward += reward
                    # print(reward)
                    for i in range(2, 6):
                        data[i - 2].append(state[i])
                print('Steps: {}'.format(steps))
                t += 1
                print('Episode reward: {}'.format(ep_reward))
        except KeyboardInterrupt:
            import matplotlib.pyplot as plt
            for i in range(4):
                plt.plot(data[i])

            plt.legend(self.active_joints)
            plt.show()


if __name__ == "__main__":
    """
    If called as a script this will initialize the scene in an open vrep instance 
    """
    import nao_rl
    env = nao_rl.make('NaoReaching', headless=False, show_display=True)
    env.run()
    nao_rl.destroy_instances()
Code Example #8
File: run.py Project: hojunroks/nao_rl
#!/usr/bin/python2
import os, sys
sys.path.append(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), 'nao_rl'))

import nao_rl
env = nao_rl.make('NaoTracking')
env.run(timeout=10)
env.close()
nao_rl.destroy_instances()
Code Example #9
File: ppo.py Project: wwchung91/nao_rl
    def __init__(self,
                 env_name='',
                 n_workers=4,
                 max_episodes=5000,
                 episode_length=500,
                 batch_size=128,
                 epochs=10,
                 epsilon=.2,
                 gamma=.99,
                 actor_layers=[500, 500],
                 critic_layers=[500],
                 actor_lr=.00001,
                 critic_lr=.00002):

        # Training parameters
        self.gamma = gamma
        self.max_episodes = max_episodes
        self.episode_length = episode_length
        self.batch_size = batch_size
        self.epochs = epochs
        self.n_workers = n_workers
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Synchronization
        self.env_name = env_name
        self.total_steps = 0
        self.update_counter = 0
        self.current_episode = 0
        self.running_reward = []
        self.time = None
        self.verbose = False

        # Threading and events
        self.update_event = threading.Event()
        self.rolling_event = threading.Event()
        self.tf_coordinator = tf.train.Coordinator()
        self.queue = queue.Queue()
        self.sess = tf.Session()

        # Environment parameters
        print "Creating dummy environment to obtain the parameters..."
        env = nao_rl.make(self.env_name, 19998)
        self.action_space = env.action_space.shape[0]
        self.state_space = env.observation_space.shape[0]
        self.action_bounds = [
            env.action_space.low[0], env.action_space.high[0]
        ]
        nao_rl.destroy_instances()
        del env

        ##############
        ### Network ##
        ##############

        # Input placeholders
        self.state_input = tf.placeholder(tf.float32, [None, self.state_space],
                                          'state_input')
        self.action_input = tf.placeholder(tf.float32,
                                           [None, self.action_space],
                                           'action_input')
        self.advantage_input = tf.placeholder(tf.float32, [None, 1],
                                              'advantage')
        self.discounted_reward = tf.placeholder(tf.float32, [None, 1],
                                                'discounted_reward')

        ########
        # Critic
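        # The critic regresses the discounted return; its residual below serves as the advantage estimate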
        hidden_layer = tf.layers.dense(self.state_input, critic_layers[0],
                                       tf.nn.relu)
        for layer_size in critic_layers[1::]:
            hidden_layer = tf.layers.dense(hidden_layer, layer_size,
                                           tf.nn.relu)
        self.critic_output = tf.layers.dense(hidden_layer, 1)

        self.advantage = self.discounted_reward - self.critic_output
        self.critic_loss = tf.reduce_mean(tf.square(self.advantage))
        self.critic_optimizer = tf.train.AdamOptimizer(critic_lr).minimize(
            self.critic_loss)

        #######
        # Actor
        policy, pi_params = self.build_actor('pi', True, actor_layers)
        old_policy, oldpi_params = self.build_actor('oldpi', False,
                                                    actor_layers)
        self.choose_action = tf.squeeze(policy.sample(1), axis=0)
        # Copy the current policy parameters into the old policy
        self.update_policy = [
            old.assign(p) for p, old in zip(pi_params, oldpi_params)
        ]
        ratio = policy.prob(
            self.action_input) / (old_policy.prob(self.action_input) + 1e-5)
        surrogate_loss = ratio * self.advantage_input

        # Clipped objective
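        # PPO surrogate: take the elementwise minimum of the unclipped ratio * advantage
        # and the ratio clipped to [1 - epsilon, 1 + epsilon] * advantage, then maximize its mean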
        self.actor_loss = -tf.reduce_mean(
            tf.minimum(
                surrogate_loss,
                tf.clip_by_value(ratio, 1. - epsilon, 1. + epsilon) *
                self.advantage_input))
        self.actor_optimizer = tf.train.AdamOptimizer(self.actor_lr).minimize(
            self.actor_loss)
        self.sess.run(tf.global_variables_initializer())
Code Example #10
File: ppo.py Project: wwchung91/nao_rl
    def __init__(self,
                 env_name,
                 render,
                 plot,
                 n_workers=8,
                 max_episodes=5000,
                 episode_length=500,
                 batch_size=1000,
                 epochs=10,
                 epsilon=.2,
                 gamma=.99,
                 actor_layers=[250, 250],
                 critic_layers=[250],
                 actor_lr=.00001,
                 critic_lr=.00002):

        # Training parameters
        self.gamma = gamma
        self.max_episodes = max_episodes
        self.episode_length = episode_length
        self.batch_size = batch_size
        self.epochs = epochs
        self.n_workers = n_workers
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Synchronization
        self.algorithm = 'ppo'
        self.env_name = env_name
        self.stop = False
        self.total_steps = 0
        self.update_counter = 0
        self.current_episode = 0
        self.running_reward = []
        self.episode_reward = []
        self.time = time.time()
        self.verbose = True
        self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

        # Threading and events
        self.sess = tf.Session()
        self.tf_coordinator = tf.train.Coordinator()
        self.queue = queue.Queue()
        self.update_event = threading.Event()
        self.rollout = threading.Event()
        self.workers = []

        # Rendering
        if render == 0:
            self.render = [True for _ in range(self.n_workers)]
        if render == 1:
            self.render = [True for _ in range(self.n_workers)]
            self.render[0] = False
        if render == 2:
            self.render = [False for _ in range(self.n_workers)]

        # Plotting
        self.plot = plot
        if self.plot:
            plt.ion()
            plt.figure(1)
            plt.plot()
            plt.xlabel('Episode')
            plt.ylabel('Running reward')
            plt.title('{} episode reward'.format(self.env_name))

        # Environment parameters
        print "Creating dummy environment to obtain the parameters..."
        try:
            env = nao_rl.make(self.env_name, headless=True)
        except:
            env = gym.make(self.env_name)
        self.n_actions = env.action_space.shape[0]
        self.n_states = env.observation_space.shape[0]
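        # Action bounds assumed symmetric around zero: the upper bound is taken as -low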
        self.action_bounds = [
            env.action_space.low[0], -env.action_space.low[0]
        ]
        #env.disconnect()
        nao_rl.destroy_instances()
        del env

        ##############
        ### Network ##
        ##############

        # Input placeholders
        self.state_input = tf.placeholder(tf.float32, [None, self.n_states],
                                          'state_input')
        self.action_input = tf.placeholder(tf.float32, [None, self.n_actions],
                                           'action_input')
        self.advantage_input = tf.placeholder(tf.float32, [None, 1],
                                              'advantage')
        self.discounted_reward = tf.placeholder(tf.float32, [None, 1],
                                                'discounted_reward')

        ########
        # Critic
        hidden_layer = tf.layers.dense(self.state_input, critic_layers[0],
                                       tf.nn.relu)
        for layer_size in critic_layers[1::]:
            hidden_layer = tf.layers.dense(hidden_layer, layer_size,
                                           tf.nn.relu)
        self.critic_output = tf.layers.dense(hidden_layer, 1)

        self.advantage = self.discounted_reward - self.critic_output
        self.critic_loss = tf.reduce_mean(tf.square(self.advantage))
        self.critic_optimizer = tf.train.AdamOptimizer(critic_lr).minimize(
            self.critic_loss)

        #######
        # Actor
        policy, pi_params = self.build_actor('policy', True, actor_layers)
        old_policy, oldpi_params = self.build_actor('old_policy', False,
                                                    actor_layers)
        self.choose_action = tf.squeeze(policy.sample(1),
                                        axis=0,
                                        name='choose_action')
        self.update_policy = [
            old.assign(p) for p, old in zip(pi_params, oldpi_params)
        ]
        ratio = policy.prob(
            self.action_input) / (old_policy.prob(self.action_input) + 1e-5)
        surrogate_loss = ratio * self.advantage_input

        # Clipped objective
        self.actor_loss = -tf.reduce_mean(
            tf.minimum(
                surrogate_loss,
                tf.clip_by_value(ratio, 1. - epsilon, 1. + epsilon) *
                self.advantage_input))
        self.actor_optimizer = tf.train.AdamOptimizer(self.actor_lr).minimize(
            self.actor_loss)
        self.sess.run(tf.global_variables_initializer())
Code Example #11
File: NaoTracking.py Project: wwchung91/nao_rl
        Run the test simulation without any learning algorithm for debugging purposes
        """
        fps = 30.
        try:
            t = 0
            while t < 30:
                self.done = False
                self.reset()

                steps = 0
                while not self.done:
                    action = self.action_space.sample()
                    state, reward, self.done, _ = self.step(action)
                    time.sleep(1 / fps)
                    steps += 1
                    print(reward)

                print('Steps: {}'.format(steps))
                t += 1
        except KeyboardInterrupt:
            pass


if __name__ == "__main__":
    """
    If called as a script this will initialize the scene in an open vrep instance 
    """
    import nao_rl
    env = nao_rl.make('NaoTracking', headless=False, show_display=True)
    env.run()
    nao_rl.destroy_instances()
Code Example #12
from nao_rl.environments import NaoWalking
import nao_rl.settings as settings
from nao_rl.learning import models

import nao_rl

"""
Example file that uses Deep Deterministic Policy Gradient algorithm to train the agent (keras-rl)
Hyperparameters are subject to optimization
"""

if __name__ == "__main__":
    
    ENV_NAME = 'nao_bipedal'
    
    env = nao_rl.make(ENV_NAME, settings.SIM_PORT, headless=False, reinit=True)
    
    env.agent.connect(env, env.active_joints)

    model = models.build_ddpg_model(env,
                                    actor_hidden_layers=[80,80],
                                    critic_hidden_layers=[100,100], 
                                    gamma=0.99,
                                    learning_rate=0.001)


    # Train
    history = model.fit(env, nb_steps=200000, visualize=False, verbose=2, nb_max_episode_steps=200)
    filename = settings.TRAINED_MODELS + '/ddpg_{}_weights.h5f'.format(ENV_NAME)
    #agent.save_weights(filename, overwrite=True)
    env.stop_simulation()
Code Example #13
        Run the test simulation without any learning algorithm for debugging purposes
        """
        
        t = 0
        while t < 10:
            self.reset()
            self.start_simulation()
            while not self.done:
                raw_input("Press Enter to continue...")
                action = self.action_space.sample()
                print(action)
                state, reward, self.done, _ = self.step(action)
                print('Current state:\n angles: {}'.format(state))
                print('Reward: {}'.format(reward))

            self.stop_simulation()
            t += 1


if __name__ == "__main__":
    
    """
    If called as a script this will initialize the scene in an open vrep instance 
    """

    # Environment and objects
    import nao_rl
    # scene = settings.SCENES + '/nao_test2.ttt'
    env = nao_rl.make('NaoWalking', headless=False)
    env.run()
Code Example #14
File: NaoTracking.py Project: hojunroks/nao_rl
        """
        fps = 30.
        try:
            t = 0
            while t < timeout:
                self.done = False
                self.reset()

                steps = 0
                while not self.done:
                    action = self.action_space.sample()
                    state, reward, self.done, _ = self.step(action)
                    time.sleep(1 / fps)
                    steps += 1
                    print(reward)

                print('Steps: {}'.format(steps))
                t += 1
        except KeyboardInterrupt:
            pass


if __name__ == "__main__":
    """
    If called as a script this will initialize the scene in an open vrep instance 
    """
    import nao_rl
    env = nao_rl.make('NaoTracking', headless=False)
    env.run()
    nao_rl.destroy_instances()