def main(_):
    global master_network
    global global_episodes

    reward_mode = None
    if len(sys.argv) < 2:
        # general params
        # training params
        # PICK ONE and comment others
        params = PARAMS['CE3-CE4']
        # params = PARAMS['CE3-CC4']
        # params = PARAMS['CC3-CE4']
        # params = PARAMS['CC3-CC4']
        # params = PARAMS['CE4']
        # params = PARAMS['CC4']
    else:
        setting = sys.argv[1]
        params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        print('training_scenario: {}, testing_scenario: {}'.format(
            params['train_scenario_name'], params['test_scenario_name']))
        reward_mode = sys.argv[2]

    use_physics = False
    num_training_iters = 100

    # RL-specific settings
    params['data_dir'] = '../../OpenLockA3CResults/subjects/'
    params['train_attempt_limit'] = 300
    params['test_attempt_limit'] = 300
    params['use_physics'] = use_physics
    params['num_training_iters'] = num_training_iters
    params['reward_mode'] = reward_mode

    scenario = select_scenario(params['train_scenario_name'], use_physics=use_physics)

    ENV_NAME = 'arm_lock-v0'
    env = gym.make(ENV_NAME)

    # create session/trial/experiment manager
    manager = SessionManager(env, params, human=False)
    manager.update_scenario(scenario)
    trial_selected = manager.run_trial_common_setup(
        scenario_name=params['train_scenario_name'],
        action_limit=params['train_action_limit'],
        attempt_limit=params['train_attempt_limit'])

    env.observation_space = ObservationSpace(len(scenario.levers))

    MODEL_DIR = manager.writer.subject_path + '/models'
    MONITOR_DIR = manager.writer.subject_path + '/monitor'
    STATE_DIM = env.observation_space.shape
    ACTION_DIM = len(env.action_space)

    # delete temporary env
    env.close()

    tf.reset_default_graph()

    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    with tf.device("/cpu:0"):
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        global_episodes = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
        trainer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
        # Generate the global network
        master_network = AC_Network(STATE_DIM, ACTION_DIM, CELL_UNITS, 'global', None)

        # Set workers to the number of available CPU threads
        num_workers = multiprocessing.cpu_count()

        # For testing and visualisation we only need one worker
        if TEST_MODEL:
            num_workers = 1

        # Create worker classes
        workers = []
        for i in range(num_workers):
            workers.append(
                Worker(name=i,
                       s_size=STATE_DIM,
                       a_size=ACTION_DIM,
                       trainer=trainer,
                       model_path=MODEL_DIR,
                       global_episodes=global_episodes,
                       env_name=ENV_NAME,
                       seed=RANDOM_SEED,
                       test=TEST_MODEL,
                       cell_units=CELL_UNITS,
                       params=params))
        saver = tf.train.Saver(max_to_keep=5)

        # Gym monitor
        if not TEST_MODEL:
            env = workers[0].get_env()
            env = gym.wrappers.Monitor(env, MONITOR_DIR, video_callable=False, force=True)

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        if LOAD_MODEL or TEST_MODEL:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(MODEL_DIR)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        if TEST_MODEL:
            env = workers[0].get_env()
            env = gym.wrappers.Monitor(env, MONITOR_DIR, force=True)
            workers[0].work(GAMMA, sess, coord, saver)
        else:
            # This is where the asynchronous magic happens.
            # Start the "work" process for each worker in a separate thread.
            print('Launching workers...')
            worker_threads = []
            for worker in workers:
                # Bind the current worker via a default argument so every thread
                # runs its own worker (a plain closure would late-bind to the last one).
                worker_work = lambda worker=worker: worker.work(GAMMA, sess, coord, saver)
                t = threading.Thread(target=worker_work)
                t.start()
                worker_threads.append(t)
            coord.join(worker_threads)


class Worker():
    def __init__(self, name, s_size, a_size, trainer, model_path, global_episodes,
                 env_name, seed, test, cell_units, params, testing_trial=False):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_" + str(self.number))
        self.is_test = test
        self.a_size = a_size
        self.params = params

        # Create the local copy of the network and the tensorflow op to copy global parameters to the local network
        self.local_AC = AC_Network(s_size, a_size, cell_units, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)

        self.testing_trial = testing_trial
        if not self.testing_trial:
            self.scenario_name = params['train_scenario_name']
            self.attempt_limit = params['train_attempt_limit']
        else:
            self.scenario_name = params['test_scenario_name']
            self.attempt_limit = params['test_attempt_limit']

        self.scenario = select_scenario(self.scenario_name, params['use_physics'])

        env = gym.make(env_name)
        self.manager = SessionManager(env, params, human=False)
        self.manager.update_scenario(self.scenario)
        self.manager.env.reward_mode = params['reward_mode']
        self.trial_count = 0

        self.manager.env.seed(seed)

    def get_env(self):
        return self.manager.env

    def train(self, rollout, sess, gamma, r):
        rollout = np.array(rollout)
        states = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        values = rollout[:, 5]

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        rewards_list = np.asarray(rewards.tolist() + [r]) * REWARD_FACTOR
        discounted_rewards = discounting(rewards_list, gamma)[:-1]

        # Advantage estimation
        # J Schulman, P Moritz, S Levine, M Jordan, P Abbeel,
        # "High-dimensional continuous control using generalized advantage estimation."
        # arXiv preprint arXiv:1506.02438 (2015).
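        #
        # A rough reading of the computation below: the per-step advantage is the
        # one-step TD error
        #     delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        # and discounting(...) then accumulates those deltas, i.e. GAE with the
        # lambda factor folded into gamma. Note that values_list is scaled by
        # REWARD_FACTOR while the raw rewards used in the deltas are not.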
        values_list = np.asarray(values.tolist() + [r]) * REWARD_FACTOR
        advantages = rewards + gamma * values_list[1:] - values_list[:-1]
        discounted_advantages = discounting(advantages, gamma)

        # Update the global network using gradients from the loss
        # Generate network statistics to periodically save
        # sess.run(self.local_AC.reset_state_op)
        rnn_state = self.local_AC.state_init
        feed_dict = {self.local_AC.target_v: discounted_rewards,
                     self.local_AC.inputs: np.vstack(states),
                     self.local_AC.actions: np.vstack(actions),
                     self.local_AC.advantages: discounted_advantages,
                     self.local_AC.state_in[0]: rnn_state[0],
                     self.local_AC.state_in[1]: rnn_state[1]}
        v_l, p_l, e_l, g_n, v_n, _ = sess.run([self.local_AC.value_loss,
                                               self.local_AC.policy_loss,
                                               self.local_AC.entropy,
                                               self.local_AC.grad_norms,
                                               self.local_AC.var_norms,
                                               self.local_AC.apply_grads],
                                              feed_dict=feed_dict)
        return v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n

    def work(self, gamma, sess, coord, saver):
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print("Starting worker " + str(self.number))

        with sess.as_default(), sess.graph.as_default():
            sess.run(self.update_local_ops)

            episode_buffer = []
            episode_mini_buffer = []
            episode_values = []
            episode_states = []
            episode_reward = 0
            episode_step_count = 0

            if not self.testing_trial:
                trial_selected = self.manager.run_trial_common_setup(
                    self.params['train_scenario_name'],
                    self.params['train_action_limit'],
                    self.params['train_attempt_limit'],
                    multithreaded=True)
            else:
                trial_selected = self.manager.run_trial_common_setup(
                    self.params['test_scenario_name'],
                    self.params['test_action_limit'],
                    self.params['test_attempt_limit'],
                    specified_trial='trial7',
                    multithreaded=True)

            self.manager.env.reset()

            while not coord.should_stop():
                # update trial if needed
                if self.manager.env.attempt_count > self.attempt_limit or self.manager.logger.cur_trial.success is True:
                    if not self.testing_trial:
                        trial_selected = self.manager.run_trial_common_setup(
                            self.params['train_scenario_name'],
                            self.params['train_action_limit'],
                            self.params['train_attempt_limit'],
                            multithreaded=True)
                    else:
                        trial_selected = self.manager.run_trial_common_setup(
                            self.params['test_scenario_name'],
                            self.params['test_action_limit'],
                            self.params['test_attempt_limit'],
                            specified_trial='trial7',
                            multithreaded=True)

                    print('scenario_name: {}, trial_count: {}, trial_name: {}'.format(
                        self.scenario_name, self.trial_count, trial_selected))

                    sess.run(self.update_local_ops)

                    episode_buffer = []
                    episode_mini_buffer = []
                    episode_values = []
                    episode_states = []
                    episode_reward = 0
                    episode_step_count = 0

                    self.trial_count += 1
                    self.manager.env.reset()

                # Restart environment
                done = False
                state = self.manager.env.reset()

                rnn_state = self.local_AC.state_init

                # Run an episode
                while not done:
                    episode_states.append(state)
                    if self.is_test:
                        self.manager.env.render()

                    # Get preferred action distribution
                    a_dist, v, rnn_state = sess.run(
                        [self.local_AC.policy, self.local_AC.value, self.local_AC.state_out],
                        feed_dict={self.local_AC.inputs: [state],
                                   self.local_AC.state_in[0]: rnn_state[0],
                                   self.local_AC.state_in[1]: rnn_state[1]})

                    a0 = weighted_pick(a_dist[0], 1)  # Use stochastic distribution sampling
                    if self.is_test:
                        a0 = np.argmax(a_dist[0])  # Use maximum when testing
                    a = np.zeros(self.a_size)
                    a[a0] = 1

                    next_state, reward, done, opt = self.manager.env.step(np.argmax(a), multithreaded=False)

                    episode_reward += reward
                    episode_buffer.append([state, a, reward, next_state, done, v[0, 0]])
                    episode_mini_buffer.append([state, a, reward, next_state, done, v[0, 0]])
                    episode_values.append(v[0, 0])

                    # Train on mini batches from episode
                    if len(episode_mini_buffer) == MINI_BATCH and not self.is_test:
                        v1 = sess.run([self.local_AC.value],
                                      feed_dict={self.local_AC.inputs: [state],
                                                 self.local_AC.state_in[0]: rnn_state[0],
                                                 self.local_AC.state_in[1]: rnn_state[1]})
                        v_l, p_l, e_l, g_n, v_n = self.train(episode_mini_buffer, sess, gamma, v1[0][0])
                        episode_mini_buffer = []

                    # Set previous state for next step
                    state = next_state
                    total_steps += 1
                    episode_step_count += 1

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                if episode_count % 100 == 0 and not episode_count % 1000 == 0 and not self.is_test:
                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])

                    summary = tf.Summary()
                    # simple_value only accepts numeric scalars, so the scenario and trial names
                    # (strings) are not written as scalar summaries; they are already printed
                    # whenever the trial changes above.
                    # summary.value.add(tag='Scenario name', simple_value=str(self.manager.env.scenario.name))
                    # summary.value.add(tag='trial name', simple_value=str(trial_selected))
                    summary.value.add(tag='trial count', simple_value=float(self.trial_count))
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)
                    self.summary_writer.flush()

                if self.name == 'worker_0':
                    if episode_count % 1000 == 0 and not self.is_test:
                        saver.save(sess, self.model_path + '/model-' + str(episode_count) + '.cptk')
                    print("| Reward: " + str(episode_reward), " | Episode", episode_count)
                    sess.run(self.increment)  # Next global episode

                episode_count += 1
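

# Assumed entry point (a minimal sketch): main(_)'s unused argument suggests the
# tf.app.run convention; omit this guard if the original file already provides one.
if __name__ == '__main__':
    tf.app.run(main)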