def grid_search():
    """
    Script for trying all combinations of the parameters specified below
    """
    # PARAMETERS
    number_of_repeats = 1  # Number of iterations for each combination of parameters
    parameters = {'env_name':       ['NaoBalancing'],
                  'n_workers':      [4],
                  'max_episodes':   [6500],
                  'episode_length': [2000],
                  'batch_size':     [2000],
                  'epochs':         [8],
                  'epsilon':        [.2],
                  'gamma':          [.99],
                  'actor_layers':   [[256, 256]],
                  'critic_layers':  [[256, 256]],
                  'actor_lr':       [.00001],
                  'critic_lr':      [.00002]}

    # Cartesian product of all parameter values
    values = tuple(parameters.values())
    param_iterator = list(itertools.product(*values))

    data = []
    counter = 0
    for params in param_iterator:
        counter += 1
        args = dict(zip(parameters.keys(), params))
        for i in range(number_of_repeats):
            print "\nIteration {} of parameter set {}/{}\nParameters:".format(
                i + 1, counter, len(param_iterator))
            print args

            # Restart the simulator and train a fresh model
            nao_rl.destroy_instances()
            time.sleep(.5)
            model = PPO(**args)
            model.train()

            # Log the parameters and results of this run as a JSON file
            date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
            filename = s.MAIN_DIR + '/data/' + parameters['env_name'][0] + '_' + date + '.log'
            log = args.copy()
            log['iteration'] = i
            log['exp_number'] = '{}/{}'.format(counter, len(param_iterator))
            log['global_reward'] = model.running_reward
            log['episode_reward'] = model.episode_reward
            log['date'] = date
            log['model_path'] = ''
            data.append(model.running_reward)
            model.close_session()
            del model

            with open(filename, 'w') as logfile:
                logfile.write(json.dumps(log))

    return data
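# Illustrative entry point (an assumption, not part of the original file): the
# function above relies on itertools, time, datetime, json, nao_rl, the settings
# module `s` and the PPO class being imported at module level, as in the
# original repository. Running the module as a script would then look like this:
if __name__ == "__main__":
    reward_curves = grid_search()
    print "Grid search finished; {} training runs logged".format(len(reward_curves))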
def kill(self):
    try:
        nao_rl.destroy_instances()
    except:
        pass
    try:
        self.manager.trainer.env.render(close=True)
    except:
        pass
def run(self):
    """
    Run the test simulation without any learning algorithm for debugging purposes
    """
    t = 0
    while t < 30:
        self.done = False
        self.reset()
        fps = 30.
        while not self.done:
            # raw_input("Press Enter to continue...")
            action = self.action_space.sample()
            print(action)
            state, reward, self.done, _ = self.step(action)
            print('Current state:\n angles: {}'.format(state))
            print('Reward: {}'.format(reward))
            time.sleep(1 / fps)
        t += 1


if __name__ == "__main__":
    """
    If called as a script this will initialize the scene in an open vrep instance
    """
    # Environment and objects
    import nao_rl
    env = nao_rl.make('NaoBalancing', headless=False)
    env.run()
    nao_rl.destroy_instances()
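# Illustrative debugging helper (an assumption, not part of the original file).
# Instead of sampling random actions as run() does above, this holds all joints
# at a zero action to check that the reward signal and the `done` flag behave
# as expected for a passive agent.
def run_zero_action_test(env, steps=100):
    """Step the environment with an all-zero action and print the rewards."""
    import numpy as np
    env.reset()
    action = np.zeros(env.action_space.shape[0])
    for _ in range(steps):
        state, reward, done, _ = env.step(action)
        print('Reward: {}'.format(reward))
        if done:
            env.reset()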
def __init__(self, env_name, render, plot,
             n_workers=1,
             max_episodes=10000,
             episode_length=500,
             update_every=10,
             entropy_beta=.005,
             gamma=.99,
             actor_layers=[500, 300],
             critic_layers=[500, 300],
             actor_lr=.00005,
             critic_lr=.0001):

    # Training parameters
    self.gamma = gamma
    self.beta = entropy_beta
    self.max_episodes = max_episodes
    self.episode_length = episode_length
    self.update_every = update_every
    self.n_workers = n_workers
    self.actor_layers = actor_layers
    self.critic_layers = critic_layers
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    # Synchronization
    self.algorithm = 'a3c'
    self.env_name = env_name
    self.stop = False
    self.total_steps = 0
    self.update_counter = 0
    self.current_episode = 0
    self.running_reward = []
    self.episode_reward = []
    self.time = None
    self.verbose = True
    self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Rendering mode: 0 - render every worker, 1 - render all but the first worker,
    # 2 - no rendering
    if render == 0:
        self.render = [True for _ in range(self.n_workers)]
    if render == 1:
        self.render = [True for _ in range(self.n_workers)]
        self.render[0] = False
    if render == 2:
        self.render = [False for _ in range(self.n_workers)]

    # Plotting
    self.plot = plot
    if self.plot:
        plt.ion()
        plt.figure(1)
        plt.plot()
        plt.xlabel('Episode')
        plt.ylabel('Running reward')
        plt.title('{} episode reward'.format(self.env_name))

    # Session and coordinator
    self.sess = tf.Session()
    self.tf_coordinator = tf.train.Coordinator()
    self.optimizer_actor = tf.train.RMSPropOptimizer(self.actor_lr, name='RMSPropA')
    self.optimizer_critic = tf.train.RMSPropOptimizer(self.critic_lr, name='RMSPropC')
    self.workers = []

    # Environment parameters
    print "Creating dummy environment to obtain the parameters..."
    try:
        env = nao_rl.make(env_name, headless=True)
    except:
        env = gym.make(env_name)
    self.n_states = env.observation_space.shape[0]
    self.n_actions = env.action_space.shape[0]
    self.action_bounds = [env.action_space.low, env.action_space.high]
    nao_rl.destroy_instances()
    del env

    self.initialize()
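# Illustrative sketch (an assumption; the A3C worker class that consumes `gamma`
# is outside this excerpt). It shows how a worker would typically convert a
# reward trajectory into the discounted returns used as critic targets.
import numpy as np

def discounted_returns(rewards, gamma=0.99, bootstrap_value=0.0):
    """G_t = r_t + gamma * G_{t+1}, computed by iterating backwards."""
    returns = np.zeros(len(rewards))
    running = bootstrap_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# Example: discounted_returns([1., 1., 1.], gamma=0.99) -> [2.9701, 1.99, 1.]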
def __init__(self, env_name='',
             n_workers=4,
             max_episodes=5000,
             episode_length=500,
             batch_size=128,
             epochs=10,
             epsilon=.2,
             gamma=.99,
             actor_layers=[500, 500],
             critic_layers=[500],
             actor_lr=.00001,
             critic_lr=.00002):

    # Training parameters
    self.gamma = gamma
    self.max_episodes = max_episodes
    self.episode_length = episode_length
    self.batch_size = batch_size
    self.epochs = epochs
    self.n_workers = n_workers
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    # Synchronization
    self.env_name = env_name
    self.total_steps = 0
    self.update_counter = 0
    self.current_episode = 0
    self.running_reward = []
    self.time = None
    self.verbose = False

    # Threading and events
    self.update_event, self.rolling_event = threading.Event(), threading.Event()
    self.tf_coordinator = tf.train.Coordinator()
    self.queue = queue.Queue()
    self.sess = tf.Session()

    # Environment parameters
    print "Creating dummy environment to obtain the parameters..."
    env = nao_rl.make(self.env_name, 19998)
    self.action_space = env.action_space.shape[0]
    self.state_space = env.observation_space.shape[0]
    self.action_bounds = [env.action_space.low[0], env.action_space.high[0]]
    nao_rl.destroy_instances()
    del env

    ##############
    ### Network ##
    ##############

    # Input placeholders
    self.state_input = tf.placeholder(tf.float32, [None, self.state_space], 'state_input')
    self.action_input = tf.placeholder(tf.float32, [None, self.action_space], 'action_input')
    self.advantage_input = tf.placeholder(tf.float32, [None, 1], 'advantage')
    self.discounted_reward = tf.placeholder(tf.float32, [None, 1], 'discounted_reward')

    ########
    # Critic
    hidden_layer = tf.layers.dense(self.state_input, critic_layers[0], tf.nn.relu)
    for layer_size in critic_layers[1:]:
        hidden_layer = tf.layers.dense(hidden_layer, layer_size, tf.nn.relu)
    self.critic_output = tf.layers.dense(hidden_layer, 1)
    self.advantage = self.discounted_reward - self.critic_output
    self.critic_loss = tf.reduce_mean(tf.square(self.advantage))
    self.critic_optimizer = tf.train.AdamOptimizer(critic_lr).minimize(self.critic_loss)

    #######
    # Actor
    policy, pi_params = self.build_actor('pi', True, actor_layers)
    old_policy, oldpi_params = self.build_actor('oldpi', False, actor_layers)
    self.choose_action = tf.squeeze(policy.sample(1), axis=0)
    # Ops that copy the current policy parameters into the old policy
    self.update_policy = [old.assign(p) for p, old in zip(pi_params, oldpi_params)]

    # Probability ratio between the new and old policies
    ratio = policy.prob(self.action_input) / (old_policy.prob(self.action_input) + 1e-5)
    surrogate_loss = ratio * self.advantage_input

    # Clipped objective
    self.actor_loss = -tf.reduce_mean(
        tf.minimum(surrogate_loss,
                   tf.clip_by_value(ratio, 1. - epsilon, 1. + epsilon) * self.advantage_input))
    self.actor_optimizer = tf.train.AdamOptimizer(self.actor_lr).minimize(self.actor_loss)

    self.sess.run(tf.global_variables_initializer())
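# Illustrative sketch (an assumption, not part of the original class): what the
# clipped surrogate objective built above computes for a single sample, written
# in plain numpy. `ratio` corresponds to pi(a|s) / pi_old(a|s) and `epsilon` to
# the clipping parameter passed to __init__.
import numpy as np

def clipped_surrogate_loss(ratio, advantage, epsilon=0.2):
    """Negative PPO clipped objective for one (ratio, advantage) pair."""
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1. - epsilon, 1. + epsilon) * advantage
    return -np.minimum(unclipped, clipped)

# With a positive advantage, ratios above 1 + epsilon are clipped, so pushing the
# new policy further from the old one yields no additional objective improvement:
# clipped_surrogate_loss(1.5, advantage=1.0) -> -1.2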
def __init__(self, env_name, render, plot,
             n_workers=8,
             max_episodes=5000,
             episode_length=500,
             batch_size=1000,
             epochs=10,
             epsilon=.2,
             gamma=.99,
             actor_layers=[250, 250],
             critic_layers=[250],
             actor_lr=.00001,
             critic_lr=.00002):

    # Training parameters
    self.gamma = gamma
    self.max_episodes = max_episodes
    self.episode_length = episode_length
    self.batch_size = batch_size
    self.epochs = epochs
    self.n_workers = n_workers
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    # Synchronization
    self.algorithm = 'ppo'
    self.env_name = env_name
    self.stop = False
    self.total_steps = 0
    self.update_counter = 0
    self.current_episode = 0
    self.running_reward = []
    self.episode_reward = []
    self.time = time.time()
    self.verbose = True
    self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Threading and events
    self.sess = tf.Session()
    self.tf_coordinator = tf.train.Coordinator()
    self.queue = queue.Queue()
    self.update_event = threading.Event()
    self.rollout = threading.Event()
    self.workers = []

    # Rendering mode: 0 - render every worker, 1 - render all but the first worker,
    # 2 - no rendering
    if render == 0:
        self.render = [True for _ in range(self.n_workers)]
    if render == 1:
        self.render = [True for _ in range(self.n_workers)]
        self.render[0] = False
    if render == 2:
        self.render = [False for _ in range(self.n_workers)]

    # Plotting
    self.plot = plot
    if self.plot:
        plt.ion()
        plt.figure(1)
        plt.plot()
        plt.xlabel('Episode')
        plt.ylabel('Running reward')
        plt.title('{} episode reward'.format(self.env_name))

    # Environment parameters
    print "Creating dummy environment to obtain the parameters..."
    try:
        env = nao_rl.make(self.env_name, headless=True)
    except:
        env = gym.make(self.env_name)
    self.n_actions = env.action_space.shape[0]
    self.n_states = env.observation_space.shape[0]
    # Action bounds are taken as symmetric around zero (mirrored lower bound)
    self.action_bounds = [env.action_space.low[0], -env.action_space.low[0]]
    # env.disconnect()
    nao_rl.destroy_instances()
    del env

    ##############
    ### Network ##
    ##############

    # Input placeholders
    self.state_input = tf.placeholder(tf.float32, [None, self.n_states], 'state_input')
    self.action_input = tf.placeholder(tf.float32, [None, self.n_actions], 'action_input')
    self.advantage_input = tf.placeholder(tf.float32, [None, 1], 'advantage')
    self.discounted_reward = tf.placeholder(tf.float32, [None, 1], 'discounted_reward')

    ########
    # Critic
    hidden_layer = tf.layers.dense(self.state_input, critic_layers[0], tf.nn.relu)
    for layer_size in critic_layers[1:]:
        hidden_layer = tf.layers.dense(hidden_layer, layer_size, tf.nn.relu)
    self.critic_output = tf.layers.dense(hidden_layer, 1)
    self.advantage = self.discounted_reward - self.critic_output
    self.critic_loss = tf.reduce_mean(tf.square(self.advantage))
    self.critic_optimizer = tf.train.AdamOptimizer(critic_lr).minimize(self.critic_loss)

    #######
    # Actor
    policy, pi_params = self.build_actor('policy', True, actor_layers)
    old_policy, oldpi_params = self.build_actor('old_policy', False, actor_layers)
    self.choose_action = tf.squeeze(policy.sample(1), axis=0, name='choose_action')
    # Ops that copy the current policy parameters into the old policy
    self.update_policy = [old.assign(p) for p, old in zip(pi_params, oldpi_params)]

    # Probability ratio between the new and old policies
    ratio = policy.prob(self.action_input) / (old_policy.prob(self.action_input) + 1e-5)
    surrogate_loss = ratio * self.advantage_input

    # Clipped objective
    self.actor_loss = -tf.reduce_mean(
        tf.minimum(surrogate_loss,
                   tf.clip_by_value(ratio, 1. - epsilon, 1. + epsilon) * self.advantage_input))
    self.actor_optimizer = tf.train.AdamOptimizer(self.actor_lr).minimize(self.actor_loss)

    self.sess.run(tf.global_variables_initializer())
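# Illustrative sketch of a single PPO update (an assumption; the class's actual
# update method is outside this excerpt). `trainer` is an instance of the class
# whose __init__ is shown above; `states`, `actions` and `returns` are numpy
# batches collected by the workers, with `returns` shaped [batch_size, 1].
def ppo_update_sketch(trainer, states, actions, returns):
    # Copy the current policy parameters into the old policy before updating
    trainer.sess.run(trainer.update_policy)
    # Estimate the advantage from the critic's value prediction
    advantage = trainer.sess.run(trainer.advantage,
                                 {trainer.state_input: states,
                                  trainer.discounted_reward: returns})
    # Several epochs of clipped-objective actor updates and critic regression
    for _ in range(trainer.epochs):
        trainer.sess.run(trainer.actor_optimizer,
                         {trainer.state_input: states,
                          trainer.action_input: actions,
                          trainer.advantage_input: advantage})
        trainer.sess.run(trainer.critic_optimizer,
                         {trainer.state_input: states,
                          trainer.discounted_reward: returns})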