# Assumed imports (declared at the top of the module, above this excerpt):
# os, sys, time, pickle, random, string, pathlib; numpy as np; tensorflow as tf;
# MPI from mpi4py; logger, bench, and tf_util (U) from baselines; DDPG, Memory,
# Actor, Critic, and AdaptiveParamNoiseSpec from baselines.ddpg; the crowdAI
# Client; plus the project-local prosthetics_env, osim_env, parse_args,
# BlendedAgent, evaluate_one_episode, and evaluate_n_episodes.


def main():
    args = parse_args()
    logger.configure()

    # Hyperparameters used to reconstruct the agents before restoring them.
    gamma = 0.99
    tau = 0.01
    normalize_returns = False
    normalize_observations = True
    batch_size = 64
    action_noise = None
    stddev = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                         desired_action_stddev=float(stddev))
    critic_l2_reg = 1e-2
    actor_lr = 1e-4
    critic_lr = 1e-3
    popart = False
    clip_norm = None
    reward_scale = 1.

    env = prosthetics_env.Wrapper(osim_env.ProstheticsEnv(visualize=False),
                                  frameskip=4,
                                  reward_shaping=True,
                                  reward_shaping_x=1,
                                  feature_embellishment=True,
                                  relative_x_pos=True,
                                  relative_z_pos=True)
    top_model_dir = 'top-models/'

    # Create one tf graph and session per model file, so each restored agent
    # gets its own variable namespace.
    sess_list = []
    graph_list = []
    for i in range(len(args.model_files)):
        graph_list.append(tf.Graph())
        sess_list.append(tf.Session(graph=graph_list[i]))

    ddpg_agents = []
    for i in range(len(args.model_files)):
        model_name = args.model_files[i]
        sess = sess_list[i]
        graph = graph_list[i]
        l_size = args.layer_sizes[i]
        with sess.as_default():
            #with U.make_session(num_cpu=1, graph=g) as sess:
            with graph.as_default():
                #tf.global_variables_initializer()
                # Restore each agent from its model file and collect it in ddpg_agents.
                print("Restoring from..." + model_name)

                # Configure components.
                memory = Memory(limit=int(1e6),
                                action_shape=env.action_space.shape,
                                observation_shape=env.observation_space.shape)
                critic = Critic(layer_norm=True, activation='relu',
                                layer_sizes=[l_size, l_size])
                actor = Actor(env.action_space.shape[-1], layer_norm=True,
                              activation='relu', layer_sizes=[l_size, l_size])

                agent = DDPG(actor, critic, memory,
                             env.observation_space.shape, env.action_space.shape,
                             gamma=gamma, tau=tau,
                             normalize_returns=normalize_returns,
                             normalize_observations=normalize_observations,
                             batch_size=batch_size,
                             action_noise=action_noise, param_noise=param_noise,
                             critic_l2_reg=critic_l2_reg,
                             actor_lr=actor_lr, critic_lr=critic_lr,
                             enable_popart=popart, clip_norm=clip_norm,
                             reward_scale=reward_scale)

                # Restore adam state and param noise.
                restore_model_path = top_model_dir + model_name
                saver = tf.train.Saver(max_to_keep=500)
                # Restore network weights.
                saver.restore(sess, restore_model_path)

                # The Adam moments (m, v) and timestep (t) live outside the TF
                # graph, so they come from a companion pickle, not the checkpoint.
                adam_optimizer_store = pickle.load(open(restore_model_path + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store['actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store['actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store['actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store['critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store['critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store['critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']

                # Initialize and prepare the agent session.
                agent.initialize(sess)
                #sess.graph.finalize()
                agent.reset()
                ddpg_agents.append(agent)

    agent = BlendedAgent(ddpg_agents, sess_list, graph_list)

    if args.evaluation:
        # Set up the evaluation env.
        eval_env = prosthetics_env.EvaluationWrapper(osim_env.ProstheticsEnv(visualize=False),
                                                     frameskip=4,
                                                     reward_shaping=True,
                                                     reward_shaping_x=1,
                                                     feature_embellishment=True,
                                                     relative_x_pos=True,
                                                     relative_z_pos=True)
        eval_env.change_model(model='3D', prosthetic=True, difficulty=0, seed=0)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        nb_eval_steps = 1000
        # reward, mean_q, final_steps = evaluate_one_episode(eval_env, ddpg_agents, sess_list, graph_list,
        #                                                    nb_eval_steps=nb_eval_steps,
        #                                                    render=False)
        reward, mean_q, final_steps = evaluate_one_episode(eval_env, agent, nb_eval_steps,
                                                           render=False)
        print("Reward: " + str(reward))
        print("Mean Q: " + str(mean_q))
        print("Final num steps: " + str(final_steps))

    # Submit to the crowdAI competition. What a hack. :)
    # if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
    crowdai_submit_count = 0
    if args.crowdai_submit:
        remote_base = "http://grader.crowdai.org:1729"
        crowdai_client = Client(remote_base)
        eval_obs_dict = crowdai_client.env_create(args.crowdai_token, env_id="ProstheticsEnv")
        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
            eval_obs_dict,
            reward_shaping=True,
            reward_shaping_x=1.,
            feature_embellishment=True,
            relative_x_pos=True,
            relative_z_pos=True)
        while True:
            action, _ = agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False)
            submit_action = prosthetics_env.openai_to_crowdai_submit_action(action)
            clipped_submit_action = np.clip(submit_action, 0., 1.)
            actions_equal = clipped_submit_action == submit_action
            if not np.all(actions_equal):
                logger.debug("crowdai_submit_count:", crowdai_submit_count)
                logger.debug("  openai-action:", action)
                logger.debug("  submit-action:", submit_action)
            crowdai_submit_count += 1
            [eval_obs_dict, reward, done, info] = crowdai_client.env_step(
                clipped_submit_action.tolist(), True)
            # [eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
            eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                eval_obs_dict,
                reward_shaping=True,
                reward_shaping_x=1.,
                feature_embellishment=True,
                relative_x_pos=True,
                relative_z_pos=True)
            if done:
                logger.debug("done: crowdai_submit_count:", crowdai_submit_count)
                eval_obs_dict = crowdai_client.env_reset()
                if not eval_obs_dict:
                    break
                logger.debug("done: eval_obs_dict exists after reset")
                eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                    eval_obs_dict,
                    reward_shaping=True,
                    reward_shaping_x=1.,
                    feature_embellishment=True,
                    relative_x_pos=True,
                    relative_z_pos=True)
        crowdai_client.submit()

    for i in range(len(sess_list)):
        sess_list[i].close()
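
# `BlendedAgent` is a project-local ensemble wrapper that isn't defined in this
# file. The sketch below is a guess at its core behavior, assuming it queries
# each restored DDPG agent in its own graph/session and averages the resulting
# actions; the class name and internals are hypothetical, for illustration only:
class BlendedAgentSketch:
    def __init__(self, agents, sess_list, graph_list):
        self.agents = agents
        self.sess_list = sess_list
        self.graph_list = graph_list

    def pi(self, obs, apply_noise=False, compute_Q=False):
        actions = []
        qs = []
        for agent, sess, graph in zip(self.agents, self.sess_list, self.graph_list):
            # Each agent's ops live in a separate graph, so make its
            # graph/session current before querying the policy.
            with sess.as_default(), graph.as_default():
                action, q = agent.pi(obs, apply_noise=apply_noise, compute_Q=compute_Q)
            actions.append(action)
            if q is not None:
                qs.append(q)
        blended_action = np.mean(actions, axis=0)
        blended_q = np.mean(qs) if qs else None
        return blended_action, blended_q
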
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size,
          memory, saved_model_basename, restore_model_name, crowdai_client,
          crowdai_token, reward_shaping, feature_embellishment, relative_x_pos,
          relative_z_pos, tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor, critic, memory,
                 env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr,
                 enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saved_model_dir = 'saved-models/'
    if saved_model_basename is None:
        saved_model_basename = ''.join(
            random.choices(string.ascii_lowercase + string.digits, k=8))
    saved_model_path = saved_model_dir + saved_model_basename
    if restore_model_name:
        # Accept either a full path or a basename relative to saved_model_dir.
        restore_model_path = restore_model_name
        if not pathlib.Path(restore_model_path + '.index').is_file():
            restore_model_path = saved_model_dir + restore_model_name
    max_to_keep = 500
    eval_reward_threshold_to_keep = 300
    saver = tf.train.Saver(max_to_keep=max_to_keep)
    adam_optimizer_store = dict()
    adam_optimizer_store['actor_optimizer'] = dict()
    adam_optimizer_store['critic_optimizer'] = dict()
    #eval_episode_rewards_history = deque(maxlen=100)
    #episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:
        try:
            if restore_model_name:
                logger.info("Restoring from model at", restore_model_path)
                #saver.restore(sess, tf.train.latest_checkpoint(model_path))
                saver.restore(sess, restore_model_path)
            else:
                logger.info("Creating new model")
                sess.run(tf.global_variables_initializer())  # this should happen here and not in the agent, right?
        except InvalidArgumentError as exc:
            if "Assign requires shapes of both tensors to match." in str(exc):
                print("Unable to restore model from {:s}.".format(restore_model_path))
                print("Chances are you're trying to restore a model with reward "
                      "embellishment into an environment without reward embellishment "
                      "(or vice versa). Unfortunately this isn't supported (yet).")
                print(str(exc))
                sys.exit()
            else:
                raise

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()

        # Restore the Adam optimizer state, which the checkpoint does not capture.
        try:
            if restore_model_name:
                logger.info("Restoring pkl file with adam state", restore_model_path)
                #saver.restore(sess, tf.train.latest_checkpoint(model_path))
                adam_optimizer_store = pickle.load(open(restore_model_path + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store['actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store['actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store['actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store['critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store['critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store['critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']
        except Exception:
            print("Unable to restore adam state from {:s}.".format(restore_model_path))

        obs = env.reset()
        done = False
        episode_reward = 0.
        #episode_step = 0
        #episodes = 0
        #t = 0
        #epoch_episode_steps = []
        #epoch_episode_eval_rewards = []
        #epoch_episode_eval_steps = []
        #epoch_start_time = time.time()
        #epoch_actions = []
        #epoch_episodes = 0

        for epoch in range(nb_epochs):
            start_time = time.time()
            epoch_episode_rewards = []
            epoch_qs = []
            eval_episode_rewards = []
            eval_qs = []
            eval_steps = []
            epoch_actor_losses = []
            epoch_critic_losses = []
            worth_keeping = False
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    #new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, info = env.step(action)
                    #t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    #episode_step += 1

                    # Book-keeping.
                    #epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        #episode_rewards_history.append(episode_reward)
                        #epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        #episode_step = 0
                        #epoch_episodes += 1
                        #episodes += 1
                        agent.reset()
                        obs = env.reset()

                # Train.
                #epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        #epoch_adaptive_distances.append(distance)
                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

            # Submit to the crowdAI competition. What a hack. :)
            #if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
            crowdai_submit_count = 0
            if crowdai_client is not None and crowdai_token is not None:
                eval_obs_dict = crowdai_client.env_create(crowdai_token, env_id="ProstheticsEnv")
                eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                    eval_obs_dict,
                    reward_shaping=reward_shaping,
                    reward_shaping_x=1.,
                    feature_embellishment=feature_embellishment,
                    relative_x_pos=relative_x_pos,
                    relative_z_pos=relative_z_pos)
                while True:
                    action, _ = agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False)
                    submit_action = prosthetics_env.openai_to_crowdai_submit_action(action)
                    clipped_submit_action = np.clip(submit_action, 0., 1.)
                    actions_equal = clipped_submit_action == submit_action
                    if not np.all(actions_equal):
                        logger.debug("crowdai_submit_count:", crowdai_submit_count)
                        logger.debug("  openai-action:", action)
                        logger.debug("  submit-action:", submit_action)
                    crowdai_submit_count += 1
                    [eval_obs_dict, reward, done, info] = crowdai_client.env_step(
                        clipped_submit_action.tolist(), True)
                    #[eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
                    eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                        eval_obs_dict,
                        reward_shaping=reward_shaping,
                        reward_shaping_x=1.,
                        feature_embellishment=feature_embellishment,
                        relative_x_pos=relative_x_pos,
                        relative_z_pos=relative_z_pos)
                    if done:
                        logger.debug("done: crowdai_submit_count:", crowdai_submit_count)
                        eval_obs_dict = crowdai_client.env_reset()
                        if not eval_obs_dict:
                            break
                        logger.debug("done: eval_obs_dict exists after reset")
                        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                            eval_obs_dict,
                            reward_shaping=reward_shaping,
                            reward_shaping_x=1.,
                            feature_embellishment=feature_embellishment,
                            relative_x_pos=relative_x_pos,
                            relative_z_pos=relative_z_pos)
                crowdai_client.submit()
                return  # kids, don't try any of these (expedient hacks) at home!

            if eval_env:
                eval_episode_reward_mean, eval_q_mean, eval_step_mean = evaluate_n_episodes(
                    3, eval_env, agent, nb_eval_steps, render_eval)
                if eval_episode_reward_mean >= eval_reward_threshold_to_keep:
                    worth_keeping = True

            mpi_size = MPI.COMM_WORLD.Get_size()

            # Log stats.
            # XXX shouldn't call np.mean on variable-length lists
            duration = time.time() - start_time
            if nb_epochs and nb_epoch_cycles and nb_train_steps > 0:
                #stats = agent.get_stats()
                #combined_stats = stats.copy()
                combined_stats = {}
                combined_stats['train/epoch_episode_reward_mean'] = np.mean(epoch_episode_rewards)
                #combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
                #combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
                #combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
                combined_stats['train/epoch_Q_mean'] = np.mean(epoch_qs)
                combined_stats['train/epoch_loss_actor'] = np.mean(epoch_actor_losses)
                combined_stats['train/epoch_loss_critic'] = np.mean(epoch_critic_losses)
                #combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
                combined_stats['train/epoch_duration'] = duration
                #combined_stats['epoch/steps_per_second'] = float(t) / float(duration)
                #combined_stats['total/episodes'] = episodes
                #combined_stats['rollout/episodes'] = epoch_episodes
                #combined_stats['rollout/actions_std'] = np.std(epoch_actions)
                #combined_stats['memory/rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            else:
                combined_stats = {}

            # Evaluation statistics.
            if eval_env:
                combined_stats['eval/epoch_episode_reward_mean'] = eval_episode_reward_mean  # np.mean(eval_episode_rewards)
                #combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                #combined_stats['eval/epoch_episode_reward_std'] = np.std(eval_episode_rewards)
                combined_stats['eval/epoch_Q_mean'] = eval_q_mean  # np.mean(eval_qs)
                #combined_stats['eval/episodes'] = len(eval_episode_rewards)
                combined_stats['eval/steps_mean'] = eval_step_mean  # np.mean(eval_steps)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            # Average the stats across MPI workers.
            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            #combined_stats['total/epochs'] = epoch + 1
            #combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.info('')
            logger.info('Epoch', epoch)
            logger.dump_tabular()
            logdir = logger.get_dir()

            if worth_keeping and rank == 0 and nb_epochs and nb_epoch_cycles and nb_train_steps and nb_rollout_steps:
                logger.info('Saving model to',
                            saved_model_dir + saved_model_basename + '-' + str(epoch))
                saver.save(sess, saved_model_path, global_step=epoch, write_meta_graph=False)
                adam_optimizer_store['actor_optimizer']['m'] = agent.actor_optimizer.m
                adam_optimizer_store['actor_optimizer']['v'] = agent.actor_optimizer.v
                adam_optimizer_store['actor_optimizer']['t'] = agent.actor_optimizer.t
                adam_optimizer_store['critic_optimizer']['m'] = agent.critic_optimizer.m
                adam_optimizer_store['critic_optimizer']['v'] = agent.critic_optimizer.v
                adam_optimizer_store['critic_optimizer']['t'] = agent.critic_optimizer.t
                adam_optimizer_store['param_noise'] = agent.param_noise
                pickle.dump(adam_optimizer_store,
                            open((saved_model_path + "-" + str(epoch) + ".pkl"), "wb"))

                # Rotate old optimizer-state pickles the way Saver rotates checkpoints.
                old_epoch = epoch - max_to_keep
                if old_epoch >= 0:
                    try:
                        os.remove(saved_model_path + "-" + str(old_epoch) + ".pkl")
                    except OSError:
                        pass

            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
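
# tf.train.Saver only checkpoints graph variables; the agent's Adam optimizers
# keep their moment estimates (m, v) and timestep (t) as plain Python-side
# state, which is why the code above pickles them separately next to each
# checkpoint. A minimal sketch of the save half of that round-trip, factored
# into a helper (the helper name and its path argument are hypothetical):
def save_adam_state_sketch(agent, path):
    state = {
        'actor_optimizer': {
            'm': agent.actor_optimizer.m,
            'v': agent.actor_optimizer.v,
            't': agent.actor_optimizer.t,
        },
        'critic_optimizer': {
            'm': agent.critic_optimizer.m,
            'v': agent.critic_optimizer.v,
            't': agent.critic_optimizer.t,
        },
        # Persisting the param-noise spec as well lets a restored run keep
        # adapting exploration noise from where it left off.
        'param_noise': agent.param_noise,
    }
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(state, f)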