def create_workers(self):
    """
    Create one worker per training thread, each with its own headless
    simulator environment on a distinct port (counting down from 19998),
    and connect each agent to its environment's active joints.
    """
    self.workers = []
    for worker_id in range(self.n_workers):
        environment = nao_rl.make(self.env_name, 19998 - worker_id, headless=True)
        new_worker = Worker(environment, self, worker_id)
        # NOTE(review): flagged '### IMPROVE' upstream -- this connect call
        # could arguably live inside Worker/environment setup instead
        new_worker.env.agent.connect(new_worker.env, new_worker.env.active_joints)
        self.workers.append(new_worker)
    # Stamp the start time once all environments are up
    self.time = time.time()
def create_workers(self):
    """
    Initialize one worker and its environment per training thread.

    Tries the project's own `nao_rl` factory first; when that fails
    (e.g. the name belongs to a standard benchmark), falls back to a
    plain `gym` environment. `self.render[i]` is passed through as the
    per-worker `headless` flag.
    """
    for i in range(self.n_workers):
        print("\nCreating worker #{}...".format(i + 1))
        try:
            env = nao_rl.make(self.env_name, headless=self.render[i])
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt. Fall back to a gym env.
            env = gym.make(self.env_name)
        worker = Worker(env, 'Worker_{}'.format(i + 1), self)
        self.workers.append(worker)
""" Run the test simulation without any learning algorithm for debugging purposes """ t = 0 while t < 30: self.done = False self.reset() fps = 30. while not self.done: # raw_input("Press Enter to continue...") action = self.action_space.sample() print(action) state, reward, self.done, _ = self.step(action) print('Current state:\n angles: {}'.format(state)) print('Reward: {}'.format(reward)) time.sleep(1 / fps) t += 1 if __name__ == "__main__": """ If called as a script this will initialize the scene in an open vrep instance """ # Environment and objects import nao_rl env = nao_rl.make('NaoBalancing', headless=False) env.run() nao_rl.destroy_instances()
    # The matching `try:` for this block is above this chunk.
    # Load the training log that accompanies the checkpoint file
    log.load_from_file(nao_rl.settings.DATA + filename.split('.')[0] + '.log')
    log.summary()
except:
    # NOTE(review): bare except silently hides any failure here;
    # narrowing it to IOError/OSError would be safer -- confirm intent.
    print "Could not load the log file from '/data' directory"

# Balancing
# name = 'NaoBalancing_a3c_2019-01-11_10:52:46.cpkt'
# name = 'NaoBalancing_a3c_2019-01-11_11:50:59.cpkt'
#
# Walking
# name = 'walking.cpkt'

# Tracking
# name = 'NaoTracking_a3c_2019-01-11_12:01:41.cpkt'

# Create environment (with a visible display so the policy can be watched)
env = nao_rl.make(env_name, headless=False)
fps = 30.

# Test Loop: run the trained model for `n_attempts` episodes
n = 0
while n < n_attempts:
    total_reward = 0
    steps = 0
    done = False
    state = env.reset()

    # Test loop
    while not done:
        raw_input('ENTER TO CONTINUE...')  # manual single-stepping (Python 2)
        # Clip the model's action into the environment's legal range
        action = np.clip(model.action(state),
                         env.action_space.low,
                         env.action_space.high)
        # action = env.f()
        state, reward, done, _ = env.step(np.array(action))
def __init__(self,
             env_name,
             render,
             plot,
             n_workers=1,
             max_episodes=10000,
             episode_length=500,
             update_every=10,
             entropy_beta=.005,
             gamma=.99,
             actor_layers=None,
             critic_layers=None,
             actor_lr=.00005,
             critic_lr=.0001):
    """
    Asynchronous Advantage Actor-Critic (A3C) trainer.

    Args:
        env_name:       Name of the nao_rl (or gym) environment to train on.
        render:         Mode 0/1/2 selecting per-worker display flags; the
                        flags are stored in `self.render` and consumed as the
                        `headless` argument when environments are created.
        plot:           If True, live-plot the running episode reward.
        n_workers:      Number of parallel worker threads/environments.
        max_episodes:   Training stops after this many episodes.
        episode_length: Maximum steps per episode.
        update_every:   Global network update interval (in steps).
        entropy_beta:   Entropy regularization coefficient.
        gamma:          Reward discount factor.
        actor_layers:   Actor hidden layer sizes (default [500, 300]).
        critic_layers:  Critic hidden layer sizes (default [500, 300]).
        actor_lr:       Actor learning rate.
        critic_lr:      Critic learning rate.
    """
    # BUG FIX: avoid shared mutable default arguments for the layer lists
    if actor_layers is None:
        actor_layers = [500, 300]
    if critic_layers is None:
        critic_layers = [500, 300]

    # Training parameters
    self.gamma = gamma
    self.beta = entropy_beta
    self.max_episodes = max_episodes
    self.episode_length = episode_length
    self.update_every = update_every
    self.n_workers = n_workers
    self.actor_layers = actor_layers
    self.critic_layers = critic_layers
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    # Synchronization
    self.algorithm = 'a3c'
    self.env_name = env_name
    self.stop = False
    self.total_steps = 0
    self.update_counter = 0
    self.current_episode = 0
    self.running_reward = []
    self.episode_reward = []
    self.time = None
    self.verbose = True
    self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Rendering: one flag per worker, later passed as `headless=`
    if render == 0:
        self.render = [True for _ in range(self.n_workers)]
    elif render == 1:
        self.render = [True for _ in range(self.n_workers)]
        self.render[0] = False
    else:
        # BUG FIX: previously `self.render` was left undefined for any value
        # other than 0/1/2; default every other value to the mode-2 flags
        self.render = [False for _ in range(self.n_workers)]

    # Plotting
    self.plot = plot
    if self.plot:
        plt.ion()
        plt.figure(1)
        plt.plot()
        plt.xlabel('Episode')
        plt.ylabel('Running reward')
        plt.title('{} episode reward'.format(self.env_name))

    # Session and coordinator
    self.sess = tf.Session()
    self.tf_coordinator = tf.train.Coordinator()
    self.optimizer_actor = tf.train.RMSPropOptimizer(self.actor_lr, name='RMSPropA')
    self.optimizer_critic = tf.train.RMSPropOptimizer(self.critic_lr, name='RMSPropC')
    self.workers = []

    # Environment parameters: build a throwaway environment just to read the
    # state/action dimensions and the action bounds
    print("Creating dummy environment to obtain the parameters...")
    try:
        env = nao_rl.make(env_name, headless=True)
    except Exception:
        # BUG FIX: was a bare `except:`; fall back to a plain gym env
        env = gym.make(env_name)
    self.n_states = env.observation_space.shape[0]
    self.n_actions = env.action_space.shape[0]
    self.action_bounds = [env.action_space.low, env.action_space.high]
    nao_rl.destroy_instances()
    del env

    self.initialize()
def _init():
    """
    Environment factory closure: create and return the environment named by
    the enclosing scope's `env_name`.

    Returns the environment, or None when creation fails.
    """
    # BUG FIX: the original `try/except: pass` left `env` unbound on
    # failure, so `return env` raised UnboundLocalError instead of
    # signalling the problem; initialize it and return None instead.
    env = None
    try:
        env = nao_rl.make(env_name)
    except Exception:
        # Best-effort creation is preserved; failure yields None.
        pass
    return env
while not self.done and steps < 200: action = self.action_space.sample() # action = np.zeros(4) state, reward, self.done, _ = self.step(action) time.sleep(1 / fps) steps += 1 ep_reward += reward # print(reward) for i in range(2, 6): data[i - 2].append(state[i]) print('Steps: {}'.format(steps)) t += 1 print('Episode reward: {}'.format(ep_reward)) except KeyboardInterrupt: pass import matplotlib.pyplot as plt for i in range(4): plt.plot(data[i]) plt.legend(self.active_joints) plt.show() if __name__ == "__main__": """ If called as a script this will initialize the scene in an open vrep instance """ import nao_rl env = nao_rl.make('NaoReaching', headless=False, show_display=True) env.run() nao_rl.destroy_instances()
#!/usr/bin/python2 import os, sys sys.path.append( os.path.join(os.path.dirname(os.path.abspath(__file__)), 'nao_rl')) import nao_rl env = nao_rl.make('NaoTracking') env.run(timeout=10) env.close() nao_rl.destroy_instances()
def __init__(self,
             env_name='',
             n_workers=4,
             max_episodes=5000,
             episode_length=500,
             batch_size=128,
             epochs=10,
             epsilon=.2,
             gamma=.99,
             actor_layers=None,
             critic_layers=None,
             actor_lr=.00001,
             critic_lr=.00002):
    """
    Distributed Proximal Policy Optimization trainer (clipped objective).

    Args:
        env_name:       Name of the nao_rl environment to train on.
        n_workers:      Number of parallel data-collecting workers.
        max_episodes:   Training stops after this many episodes.
        episode_length: Maximum steps per episode.
        batch_size:     Samples per network update.
        epochs:         Gradient epochs per update batch.
        epsilon:        PPO clipping parameter.
        gamma:          Reward discount factor.
        actor_layers:   Actor hidden layer sizes (default [500, 500]).
        critic_layers:  Critic hidden layer sizes (default [500]).
        actor_lr:       Actor learning rate.
        critic_lr:      Critic learning rate.
    """
    # BUG FIX: avoid shared mutable default arguments for the layer lists
    if actor_layers is None:
        actor_layers = [500, 500]
    if critic_layers is None:
        critic_layers = [500]

    # Training parameters
    self.gamma = gamma
    self.max_episodes = max_episodes
    self.episode_length = episode_length
    self.batch_size = batch_size
    self.epochs = epochs
    self.n_workers = n_workers
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    # Synchronization
    self.env_name = env_name
    self.total_steps = 0
    self.update_counter = 0
    self.current_episode = 0
    self.running_reward = []
    self.time = None
    self.verbose = False

    # Threading and events
    self.update_event, self.rolling_event = threading.Event(), threading.Event()
    self.tf_coordinator = tf.train.Coordinator()
    self.queue = queue.Queue()
    self.sess = tf.Session()

    # Environment parameters: build a throwaway environment just to read the
    # state/action dimensions and the action bounds
    print("Creating dummy environment to obtain the parameters...")
    env = nao_rl.make(self.env_name, 19998)
    self.action_space = env.action_space.shape[0]
    self.state_space = env.observation_space.shape[0]
    # BUG FIX: the upper bound must come from env.action_space.high;
    # `self.action_space` is just an int here, so the original
    # `self.action_space.high[0]` raised AttributeError
    self.action_bounds = [env.action_space.low[0], env.action_space.high[0]]
    nao_rl.destroy_instances()
    del env

    ##############
    ### Network ##
    ##############

    # Input placeholders
    # BUG FIX: removed the leading space from ' state_input' -- TF node
    # names may not contain spaces
    self.state_input = tf.placeholder(tf.float32, [None, self.state_space],
                                      'state_input')
    self.action_input = tf.placeholder(tf.float32, [None, self.action_space],
                                       'action_input')
    self.advantage_input = tf.placeholder(tf.float32, [None, 1], 'advantage')
    self.discounted_reward = tf.placeholder(tf.float32, [None, 1],
                                            'discounted_reward')

    ########
    # Critic
    hidden_layer = tf.layers.dense(self.state_input, critic_layers[0],
                                   tf.nn.relu)
    # BUG FIX: use the layer width itself; the original indexed
    # `critic_layers[layer_size]`, which raises IndexError for any width
    # larger than the list length
    for layer_size in critic_layers[1::]:
        hidden_layer = tf.layers.dense(hidden_layer, layer_size, tf.nn.relu)
    self.critic_output = tf.layers.dense(hidden_layer, 1)
    self.advantage = self.discounted_reward - self.critic_output
    self.critic_loss = tf.reduce_mean(tf.square(self.advantage))
    self.critic_optimizer = tf.train.AdamOptimizer(critic_lr).minimize(
        self.critic_loss)

    #######
    # Actor
    policy, pi_params = self.build_actor('pi', True, actor_layers)
    old_policy, oldpi_params = self.build_actor('oldpi', False, actor_layers)
    self.choose_action = tf.squeeze(policy.sample(1), axis=0)
    # BUG FIX: the comprehension previously reused the name `policy` for its
    # loop variable; Python 2 list comprehensions leak their variable into
    # the enclosing scope, clobbering the policy distribution used below
    self.update_policy = [
        old_p.assign(p) for p, old_p in zip(pi_params, oldpi_params)
    ]
    ratio = policy.prob(self.action_input) / (
        old_policy.prob(self.action_input) + 1e-5)
    surrogate_loss = ratio * self.advantage_input

    # Clipped surrogate objective (PPO)
    self.actor_loss = -tf.reduce_mean(
        tf.minimum(
            surrogate_loss,
            tf.clip_by_value(ratio, 1. - epsilon, 1. + epsilon) *
            self.advantage_input))
    self.actor_optimizer = tf.train.AdamOptimizer(self.actor_lr).minimize(
        self.actor_loss)

    self.sess.run(tf.global_variables_initializer())
def __init__(self,
             env_name,
             render,
             plot,
             n_workers=8,
             max_episodes=5000,
             episode_length=500,
             batch_size=1000,
             epochs=10,
             epsilon=.2,
             gamma=.99,
             actor_layers=None,
             critic_layers=None,
             actor_lr=.00001,
             critic_lr=.00002):
    """
    Distributed Proximal Policy Optimization trainer (clipped objective).

    Args:
        env_name:       Name of the nao_rl (or gym) environment to train on.
        render:         Mode 0/1/2 selecting per-worker display flags; the
                        flags are stored in `self.render` and consumed as the
                        `headless` argument when environments are created.
        plot:           If True, live-plot the running episode reward.
        n_workers:      Number of parallel data-collecting workers.
        max_episodes:   Training stops after this many episodes.
        episode_length: Maximum steps per episode.
        batch_size:     Samples per network update.
        epochs:         Gradient epochs per update batch.
        epsilon:        PPO clipping parameter.
        gamma:          Reward discount factor.
        actor_layers:   Actor hidden layer sizes (default [250, 250]).
        critic_layers:  Critic hidden layer sizes (default [250]).
        actor_lr:       Actor learning rate.
        critic_lr:      Critic learning rate.
    """
    # BUG FIX: avoid shared mutable default arguments for the layer lists
    if actor_layers is None:
        actor_layers = [250, 250]
    if critic_layers is None:
        critic_layers = [250]

    # Training parameters
    self.gamma = gamma
    self.max_episodes = max_episodes
    self.episode_length = episode_length
    self.batch_size = batch_size
    self.epochs = epochs
    self.n_workers = n_workers
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    # Synchronization
    self.algorithm = 'ppo'
    self.env_name = env_name
    self.stop = False
    self.total_steps = 0
    self.update_counter = 0
    self.current_episode = 0
    self.running_reward = []
    self.episode_reward = []
    self.time = time.time()
    self.verbose = True
    self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Threading and events
    self.sess = tf.Session()
    self.tf_coordinator = tf.train.Coordinator()
    self.queue = queue.Queue()
    self.update_event = threading.Event()
    self.rollout = threading.Event()
    self.workers = []

    # Rendering: one flag per worker, later passed as `headless=`
    if render == 0:
        self.render = [True for _ in range(self.n_workers)]
    elif render == 1:
        self.render = [True for _ in range(self.n_workers)]
        self.render[0] = False
    else:
        # BUG FIX: previously `self.render` was left undefined for any value
        # other than 0/1/2; default every other value to the mode-2 flags
        self.render = [False for _ in range(self.n_workers)]

    # Plotting
    self.plot = plot
    if self.plot:
        plt.ion()
        plt.figure(1)
        plt.plot()
        plt.xlabel('Episode')
        plt.ylabel('Running reward')
        plt.title('{} episode reward'.format(self.env_name))

    # Environment parameters: build a throwaway environment just to read the
    # state/action dimensions and the action bounds
    print("Creating dummy environment to obtain the parameters...")
    try:
        env = nao_rl.make(self.env_name, headless=True)
    except Exception:
        # BUG FIX: was a bare `except:`; fall back to a plain gym env
        env = gym.make(self.env_name)
    self.n_actions = env.action_space.shape[0]
    self.n_states = env.observation_space.shape[0]
    # NOTE(review): assumes the action space is symmetric around zero;
    # env.action_space.high[0] would be the general form -- confirm intent
    self.action_bounds = [
        env.action_space.low[0], -env.action_space.low[0]
    ]
    # env.disconnect()
    nao_rl.destroy_instances()
    del env

    ##############
    ### Network ##
    ##############

    # Input placeholders
    self.state_input = tf.placeholder(tf.float32, [None, self.n_states],
                                      'state_input')
    self.action_input = tf.placeholder(tf.float32, [None, self.n_actions],
                                       'action_input')
    self.advantage_input = tf.placeholder(tf.float32, [None, 1], 'advantage')
    self.discounted_reward = tf.placeholder(tf.float32, [None, 1],
                                            'discounted_reward')

    ########
    # Critic
    hidden_layer = tf.layers.dense(self.state_input, critic_layers[0],
                                   tf.nn.relu)
    for layer_size in critic_layers[1::]:
        hidden_layer = tf.layers.dense(hidden_layer, layer_size, tf.nn.relu)
    self.critic_output = tf.layers.dense(hidden_layer, 1)
    self.advantage = self.discounted_reward - self.critic_output
    self.critic_loss = tf.reduce_mean(tf.square(self.advantage))
    self.critic_optimizer = tf.train.AdamOptimizer(critic_lr).minimize(
        self.critic_loss)

    #######
    # Actor
    policy, pi_params = self.build_actor('policy', True, actor_layers)
    old_policy, oldpi_params = self.build_actor('old_policy', False,
                                                actor_layers)
    self.choose_action = tf.squeeze(policy.sample(1), axis=0,
                                    name='choose_action')
    # Sync op: copy current policy parameters into the 'old policy' copy
    self.update_policy = [
        old.assign(p) for p, old in zip(pi_params, oldpi_params)
    ]
    ratio = policy.prob(self.action_input) / (
        old_policy.prob(self.action_input) + 1e-5)
    surrogate_loss = ratio * self.advantage_input

    # Clipped surrogate objective (PPO)
    self.actor_loss = -tf.reduce_mean(
        tf.minimum(
            surrogate_loss,
            tf.clip_by_value(ratio, 1. - epsilon, 1. + epsilon) *
            self.advantage_input))
    self.actor_optimizer = tf.train.AdamOptimizer(self.actor_lr).minimize(
        self.actor_loss)

    self.sess.run(tf.global_variables_initializer())
        Run the test simulation without any learning algorithm, for
        debugging purposes: sample random actions and print the per-step
        reward and per-episode step count.
        """
        fps = 30.  # throttle stepping to roughly 30 frames per second
        try:
            t = 0
            while t < 30:  # 30 test episodes
                self.done = False
                self.reset()
                steps = 0
                while not self.done:
                    action = self.action_space.sample()  # random exploration
                    state, reward, self.done, _ = self.step(action)
                    time.sleep(1 / fps)
                    steps += 1
                    print(reward)
                print('Steps: {}'.format(steps))
                t += 1
        except KeyboardInterrupt:
            # Allow the test loop to be stopped manually
            pass


if __name__ == "__main__":
    """ If called as a script this will initialize the scene in an open vrep instance """
    import nao_rl
    env = nao_rl.make('NaoTracking', headless=False, show_display=True)
    env.run()
    nao_rl.destroy_instances()
from nao_rl.environments import NaoWalking
import nao_rl.settings as settings
from nao_rl.learning import models
import nao_rl

"""
Example file that uses Deep Deterministic Policy Gradient algorithm to train
the agent (keras-rl)
Hyperparameters are subject to optimization
"""

if __name__ == "__main__":

    ENV_NAME = 'nao_bipedal'
    # Create the walking environment on the default simulator port and
    # connect the NAO agent to its active joints
    env = nao_rl.make(ENV_NAME, settings.SIM_PORT, headless=False, reinit=True)
    env.agent.connect(env, env.active_joints)

    # Build the DDPG actor/critic model (keras-rl)
    model = models.build_ddpg_model(env,
                                    actor_hidden_layers=[80, 80],
                                    critic_hidden_layers=[100, 100],
                                    gamma=0.99,
                                    learning_rate=0.001)

    # Train for 200k steps, capping episodes at 200 steps each
    history = model.fit(env,
                        nb_steps=200000,
                        visualize=False,
                        verbose=2,
                        nb_max_episode_steps=200)

    # Weights file path (saving is currently disabled)
    filename = settings.TRAINED_MODELS + '/ddpg_{}_weights.h5f'.format(ENV_NAME)
    # agent.save_weights(filename, overwrite=True)
    env.stop_simulation()
        Run the test simulation without any learning algorithm, for
        debugging purposes: single-step through random actions on keypress
        and print the resulting state and reward.
        """
        t = 0
        while t < 10:  # 10 test episodes
            self.reset()
            self.start_simulation()
            # NOTE(review): `self.done` is not reset to False before this
            # loop; after the first finished episode the inner loop may be
            # skipped entirely -- confirm against reset()'s behavior.
            while not self.done:
                raw_input("Press Enter to continue...")  # manual stepping (Python 2)
                action = self.action_space.sample()  # random exploration
                print(action)
                state, reward, self.done, _ = self.step(action)
                print('Current state:\n angles: {}'.format(state))
                print('Reward: {}'.format(reward))
            self.stop_simulation()
            t += 1


if __name__ == "__main__":
    """ If called as a script this will initialize the scene in an open vrep instance """
    # Environment and objects
    import nao_rl
    # scene = settings.SCENES + '/nao_test2.ttt'
    env = nao_rl.make('NaoWalking', headless=False)
    env.run()
""" fps = 30. try: t = 0 while t < timeout: self.done = False self.reset() steps = 0 while not self.done: action = self.action_space.sample() state, reward, self.done, _ = self.step(action) time.sleep(1 / fps) steps += 1 print(reward) print('Steps: {}'.format(steps)) t += 1 except KeyboardInterrupt: pass if __name__ == "__main__": """ If called as a script this will initialize the scene in an open vrep instance """ import nao_rl env = nao_rl.make('NaoTracking', headless=False) env.run() nao_rl.destroy_instances()