def test(env, actor, critic, memory, normalize_observations, gamma, reward_scale,
         nb_episodes, episode_length, checkpoint_dir):
    # Initialize DDPG agent (target network and replay buffer)
    agent = DDPG(actor, critic, memory,
                 env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, normalize_observations=normalize_observations,
                 reward_scale=reward_scale)

    # We need max_action because the NN output layer is a tanh.
    # So we must scale it back.
    max_action = env.action_space.high

    # Start testing loop
    with U.single_threaded_session() as sess:
        agent.initialize(sess)
        # setup saver
        saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=2)
        # restore all
        print("restoring variables")
        # Add ops to save and restore all the variables.
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))

        step_times = []
        for eval_episode in range(nb_episodes):
            print("Evaluating episode {}...".format(eval_episode))
            obs = env.reset()
            for t in range(episode_length):
                # Select action a_t without noise
                a_t, _ = agent.pi(obs, apply_param_noise=False,
                                  apply_action_noise=False, compute_Q=False)
                assert a_t.shape == env.action_space.shape
                assert (a_t >= 0).all()

                # Execute action a_t and observe reward r_t and next state s_{t+1}
                start_step_time = time.time()
                obs, r_t, eval_done, info = env.step(max_action * a_t)
                end_step_time = time.time()
                step_time = end_step_time - start_step_time
                step_times.append(step_time)

                if eval_done:
                    print(" Episode done!")
                    obs = env.reset()
                    break
        print("Average step time: ", np.mean(step_times))
def train(env, name, callback):
    model = deepq.models.mlp([100, 20, 20])
    act = DDPG.train(env,
                     q_func=model,
                     lr=1e-3,
                     max_timesteps=100000,
                     buffer_size=50000,
                     exploration_fraction=0.1,
                     exploration_final_eps=0.02,
                     print_freq=10,
                     callback=callback)
    print("Saving model to " + name + ".pkl")
    act.save(name + ".pkl")
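The call above passes DQN-style keywords (q_func, exploration_fraction, ...) to DDPG.train; that argument list matches the old baselines deepq.learn interface rather than any DDPG entry point. A minimal sketch of the same training expressed with that API, under the assumption that DQN was the intent, might look like this (train_dqn is a hypothetical name):

from baselines import deepq

def train_dqn(env, name, callback=None):
    # Same MLP Q-network and hyperparameters as the snippet above.
    model = deepq.models.mlp([100, 20, 20])
    act = deepq.learn(env,
                      q_func=model,
                      lr=1e-3,
                      max_timesteps=100000,
                      buffer_size=50000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.02,
                      print_freq=10,
                      callback=callback)
    act.save(name + ".pkl")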
def main():
    args = parse_args()
    logger.configure()
    gamma = 0.99
    tau = 0.01
    normalize_returns = False
    normalize_observations = True
    batch_size = 64
    action_noise = None
    stddev = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                         desired_action_stddev=float(stddev))
    critic_l2_reg = 1e-2
    actor_lr = 1e-4
    critic_lr = 1e-3
    popart = False
    clip_norm = None
    reward_scale = 1.

    env = prosthetics_env.Wrapper(osim_env.ProstheticsEnv(visualize=False),
                                  frameskip=4,
                                  reward_shaping=True,
                                  reward_shaping_x=1,
                                  feature_embellishment=True,
                                  relative_x_pos=True,
                                  relative_z_pos=True)
    top_model_dir = 'top-models/'

    # create tf sessions and graphs
    sess_list = []
    graph_list = []
    for i in range(len(args.model_files)):
        graph_list.append(tf.Graph())
        sess_list.append(tf.Session(graph=graph_list[i]))

    ddpg_agents = []
    for i in range(len(args.model_files)):
        model_name = args.model_files[i]
        sess = sess_list[i]
        graph = graph_list[i]
        l_size = args.layer_sizes[i]
        with sess.as_default():
            # with U.make_session(num_cpu=1, graph=g) as sess:
            with graph.as_default():
                # tf.global_variables_initializer()
                # restore agents from model files and store in ddpg_agents
                print("Restoring from..." + model_name)

                # Configure components.
                memory = Memory(limit=int(1e6),
                                action_shape=env.action_space.shape,
                                observation_shape=env.observation_space.shape)
                critic = Critic(layer_norm=True, activation='relu',
                                layer_sizes=[l_size, l_size])
                actor = Actor(env.action_space.shape[-1], layer_norm=True,
                              activation='relu', layer_sizes=[l_size, l_size])

                agent = DDPG(actor, critic, memory,
                             env.observation_space.shape, env.action_space.shape,
                             gamma=gamma, tau=tau,
                             normalize_returns=normalize_returns,
                             normalize_observations=normalize_observations,
                             batch_size=batch_size,
                             action_noise=action_noise, param_noise=param_noise,
                             critic_l2_reg=critic_l2_reg,
                             actor_lr=actor_lr, critic_lr=critic_lr,
                             enable_popart=popart, clip_norm=clip_norm,
                             reward_scale=reward_scale)

                # restore adam state and param noise
                restore_model_path = top_model_dir + model_name
                saver = tf.train.Saver(max_to_keep=500)
                # restore network weights
                saver.restore(sess, restore_model_path)
                adam_optimizer_store = pickle.load(open(restore_model_path + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store['actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store['actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store['actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store['critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store['critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store['critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']

                # initialize and prepare agent session.
                agent.initialize(sess)
                # sess.graph.finalize()
                agent.reset()
                ddpg_agents.append(agent)

    agent = BlendedAgent(ddpg_agents, sess_list, graph_list)

    if args.evaluation:
        # setup eval env
        eval_env = prosthetics_env.EvaluationWrapper(osim_env.ProstheticsEnv(visualize=False),
                                                     frameskip=4,
                                                     reward_shaping=True,
                                                     reward_shaping_x=1,
                                                     feature_embellishment=True,
                                                     relative_x_pos=True,
                                                     relative_z_pos=True)
        eval_env.change_model(model=('3D').upper(), prosthetic=True, difficulty=0, seed=0)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        nb_eval_steps = 1000
        # reward, mean_q, final_steps = evaluate_one_episode(eval_env, ddpg_agents, sess_list, graph_list,
        #                                                    nb_eval_steps=nb_eval_steps,
        #                                                    render=False)
        reward, mean_q, final_steps = evaluate_one_episode(eval_env, agent, nb_eval_steps, render=False)
        print("Reward: " + str(reward))
        print("Mean Q: " + str(mean_q))
        print("Final num steps: " + str(final_steps))

    # Submit to crowdai competition. What a hack. :)
    # if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
    crowdai_submit_count = 0
    if args.crowdai_submit:
        remote_base = "http://grader.crowdai.org:1729"
        crowdai_client = Client(remote_base)
        eval_obs_dict = crowdai_client.env_create(args.crowdai_token, env_id="ProstheticsEnv")
        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
            eval_obs_dict,
            reward_shaping=True,
            reward_shaping_x=1.,
            feature_embellishment=True,
            relative_x_pos=True,
            relative_z_pos=True)
        while True:
            action, _ = agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False)
            submit_action = prosthetics_env.openai_to_crowdai_submit_action(action)
            clipped_submit_action = np.clip(submit_action, 0., 1.)
            actions_equal = clipped_submit_action == submit_action
            if not np.all(actions_equal):
                logger.debug("crowdai_submit_count:", crowdai_submit_count)
                logger.debug("  openai-action:", action)
                logger.debug("  submit-action:", submit_action)
            crowdai_submit_count += 1
            [eval_obs_dict, reward, done, info] = crowdai_client.env_step(clipped_submit_action.tolist(), True)
            # [eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
            eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                eval_obs_dict,
                reward_shaping=True,
                reward_shaping_x=1.,
                feature_embellishment=True,
                relative_x_pos=True,
                relative_z_pos=True)
            if done:
                logger.debug("done: crowdai_submit_count:", crowdai_submit_count)
                eval_obs_dict = crowdai_client.env_reset()
                if not eval_obs_dict:
                    break
                logger.debug("done: eval_obs_dict exists after reset")
                eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                    eval_obs_dict,
                    reward_shaping=True,
                    reward_shaping_x=1.,
                    feature_embellishment=True,
                    relative_x_pos=True,
                    relative_z_pos=True)
        crowdai_client.submit()

    for i in range(len(sess_list)):
        sess_list[i].close()
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, callback=None, pretrained='none'): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Copy an env for evaluation env_eval = copy.deepcopy(env.env) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() # load pretrained agent if possible if pretrained == 'none': logger.info('Training from scratch...') else: logger.info('Loading pretrained model from {}'.format(pretrained)) #assert os.path.exists(pretrained) saver.restore(sess, pretrained) agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 total_time = 0 start_time = time.time() total_time_record = [] epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() #epochxposdict = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 total_time += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) total_time_record.append(total_time) #epochxposdict.append(info['pos'][0]) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. 
if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: # eval for one episode eval_episode_reward = 0.0 eval_done = False eval_obs = eval_env.reset() while not eval_done: eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action) eval_episode_reward += eval_r eval_qs.append(eval_q) eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) """ eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. """ # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) # Total statistics. 
combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) # Call the callback if callback is not None: if callback(locals(), globals()): # callback returns a boolean value break # Evaluate the policy on env to record trajs eval_rewards, eval_steps, trajs_obs, trajs_actions = evaluate( env_eval, agent=agent) if callback is not None: callback.final_call(locals(), globals())
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)
    tf.reset_default_graph()

    agent = DDPG(actor, critic, memory,
                 env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr,
                 enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)

    max_iteration = 1
    step_number = []
    success = []
    reason = {1: 0, 2: 0, 3: 0}

    with U.single_threaded_session() as sess:
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, **kwargs): # print("kwargs:",kwargs) rank = MPI.COMM_WORLD.Get_rank() print("rank:", rank) assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. # --------------- AMEND: For saving and restoring the model. added by xlv ------------------ if kwargs['restore'] == True and kwargs['restore_path'] != None: logger.info("Restoring from saved model") saver = tf.train.import_meta_graph(restore_path + "trained_model.meta") saver.restore(sess, tf.train.latest_checkpoint(restore_path)) else: logger.info("Starting from scratch!") sess.run(tf.global_variables_initializer()) # ---------------------------------------------------------------------------------------- agent.initialize(sess) sess.graph.finalize() agent.reset() obs = eval_obs = env.reset() # if eval_env is not None: # eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 # every 30 epochs plot statistics and save it. nb_epochs_unit = 30 ddpg_rewards = [] eval_ddpg_rewards = [] ddpg_suc_percents = [] eval_suc_percents = [] # ---- AMEND: added by xlv to calculate success percent ----- suc_num = 0 episode_num = 0 # ----------------------------------------------------------- for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape # new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) new_obs, r, done, suc, info = env.step(max_action * action) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. 
epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 # --- AMEND: added by xlv to calculate success percent --- episode_num += 1 if suc: suc_num += 1 # ------------------------------------------------------- agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. # eval_episode_rewards = [] # eval_qs = [] # if eval_env is not None: # eval_episode_reward = 0. # for t_rollout in range(nb_eval_steps): # eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # if render_eval: # eval_env.render() # eval_episode_reward += eval_r # # eval_qs.append(eval_q) # if eval_done: # eval_obs = eval_env.reset() # eval_episode_rewards.append(eval_episode_reward) # eval_episode_rewards_history.append(eval_episode_reward) # eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float( duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. # if eval_env is not None: # combined_stats['eval/return'] = eval_episode_rewards # combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) # combined_stats['eval/Q'] = eval_qs # combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. 
combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) # ------------------------------ plot statistics every nb_epochs_unit ----------------------------------- ddpg_rewards.append(np.mean(episode_rewards_history)) if (epoch + 1) % nb_epochs_unit == 0: ddpg_suc_percents.append(suc_num / episode_num) # ---------- Evaluate for 5 iters ----------------------- nb_eval_epochs = 5 nb_eval_epoch_cycles = 5 eval_episode_num = 0 eval_suc_num = 0 eval_episode_reward = 0 eval_episode_step = 0 eval_epoch_episode_rewards = [] eval_epoch_episode_steps = [] for i_epoch in range(nb_eval_epochs): logger.log( "********** Start Evaluation. Iteration %i ************" % i_epoch) for i_cycle in range(nb_eval_epoch_cycles): for t_rollout in range(nb_rollout_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) assert eval_action.shape == env.action_space.shape eval_obs, eval_r, eval_done, eval_suc, eval_info = env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_episode_reward += eval_r eval_episode_step += 1 if eval_done: eval_obs = env.reset() eval_epoch_episode_rewards.append( eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_epoch_episode_steps.append( eval_episode_step) eval_episode_reward = 0 eval_episode_step = 0 eval_episode_num += 1 if eval_suc: eval_suc_num += 1 logger.record_tabular( "Eval_EpRewMean", np.mean(eval_episode_rewards_history)) logger.record_tabular("Eval_EpNumUntilNow", eval_episode_num) logger.record_tabular("Eval_EpNumSuc", eval_suc_num) logger.record_tabular("Eval_EpSucPercent", eval_suc_num / eval_episode_num) logger.dump_tabular() eval_ddpg_rewards.append( np.mean(eval_episode_rewards_history)) eval_suc_percents.append(eval_suc_num / eval_episode_num) # ---------------------------------------------------------------------------------------------- # --------------------- plotting and saving ------------------------- if saver is not None: logger.info("saving the trained model") start_time_save = time.time() if epoch + 1 == nb_epochs: saver.save(sess, kwargs['MODEL_DIR'] + "/trained_model") else: saver.save( sess, kwargs['MODEL_DIR'] + "/iter_" + str( (epoch + 1) // nb_epochs_unit)) plot_performance(range(len(ddpg_rewards)), ddpg_rewards, ylabel=r'avg reward per DDPG learning step', xlabel='ddpg iteration', figfile=os.path.join(kwargs['FIGURE_DIR'], 'ddpg_reward'), title='TRAIN') plot_performance( range(len(ddpg_suc_percents)), ddpg_suc_percents, ylabel= r'overall success percentage per algorithm step under DDPG', xlabel='algorithm iteration', figfile=os.path.join(kwargs['FIGURE_DIR'], 'success_percent'), title="TRAIN") plot_performance(range(len(eval_ddpg_rewards)), eval_ddpg_rewards, ylabel=r'avg reward per DDPG eval step', xlabel='ddpg iteration', figfile=os.path.join(kwargs['FIGURE_DIR'], 'eval_ddpg_reward'), title='EVAL') plot_performance( range(len(eval_suc_percents)), eval_suc_percents, ylabel= r'overall eval success percentage per algorithm step under DDPG', xlabel='algorithm 
iteration', figfile=os.path.join(kwargs['FIGURE_DIR'], 'eval_success_percent'), title="EVAL") # save data which is accumulated UNTIL iter i with open( kwargs['RESULT_DIR'] + '/ddpg_reward_' + 'iter_' + str( (epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as f2: pickle.dump(ddpg_rewards, f2) with open( kwargs['RESULT_DIR'] + '/success_percent_' + 'iter_' + str((epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as fs: pickle.dump(ddpg_suc_percents, fs) # save evaluation data accumulated until iter i with open( kwargs['RESULT_DIR'] + '/eval_ddpg_reward_' + 'iter_' + str((epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as f_er: pickle.dump(eval_ddpg_rewards, f_er) with open( kwargs['RESULT_DIR'] + '/eval_success_percent_' + 'iter_' + str( (epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as f_es: pickle.dump(eval_suc_percents, f_es)
def main():
    with U.single_threaded_session() as sess:
        batch_size = 64
        current_noise_type = 'adaptive-param_0.2'
        _, stddev = current_noise_type.split('_')
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                             desired_action_stddev=float(stddev))
        param_noise_adaption_interval = 2
        env = gym.make("Pendulum-v0")
        nb_actions = env.action_space.shape[-1]
        layer_norm = True

        # Configure components.
        memory = Memory(limit=int(1e6),
                        action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        # Seed everything to make things reproducible.
        seed = int(1000000 * np.random.rand())
        logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
        tf.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        env.seed(seed)

        max_action = env.action_space.high
        logger.info('scaling actions by {} before executing in env'.format(max_action))
        agent = DDPG(actor, critic, memory,
                     env.observation_space.shape, env.action_space.shape,
                     batch_size=batch_size, param_noise=param_noise)
        logger.info('Using agent with the following configuration:')
        logger.info(str(agent.__dict__.items()))

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()
        obs = env.reset()

        for t in itertools.count():
            episode_rewards = []
            done = False
            while not done:
                env.render()
                # Take action and update exploration to the newest value
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                new_obs, rew, done, _ = env.step(max_action * action)
                # Book-keeping.
                agent.store_transition(obs, action, rew, new_obs, done)
                obs = new_obs
                episode_rewards.append(rew)
                if done:
                    agent.reset()
                    obs = env.reset()

            nb_train_steps = 100
            epoch_adaptive_distances = []
            epoch_critic_losses = []
            epoch_actor_losses = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            if t % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards), 1))
                logger.record_tabular('train/loss_actor', round(np.mean(epoch_actor_losses)))
                logger.record_tabular('train/loss_critic', round(np.mean(epoch_critic_losses)))
                logger.record_tabular('train/param_noise_distance', round(np.mean(epoch_adaptive_distances)))
                logger.dump_tabular()
class DDPGAgent(BaseAgent):
    """A Deep Deterministic Policy Gradient implementation of an SC2 agent."""

    def __init__(self):
        super(DDPGAgent, self).__init__()
        return

    def setup(self, obs_shape, nb_actions, action_spec, noise_type, gamma=1.,
              tau=0.01, layer_norm=True):
        super(DDPGAgent, self).setup(obs_shape, nb_actions, action_spec,
                                     noise_type, gamma, tau, layer_norm)
        self.action_spec_internal = action_spec
        self.obs_dim = obs_shape

        action_noise = None
        param_noise = None

        # Parse noise_type
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components.
        self.memory = Memory(limit=int(500),
                             action_shape=(nb_actions, ),
                             observation_shape=obs_shape)
        self.critic = Critic(layer_norm=layer_norm, hidden_size=128)
        self.actor = Actor(nb_actions, layer_norm=layer_norm, hidden_size=128)
        tf.reset_default_graph()

        # max_action = env.action_space.high
        self.ddpg = DDPG(actor=self.actor, critic=self.critic,
                         memory=self.memory, observation_shape=obs_shape,
                         action_shape=(nb_actions, ), gamma=gamma, tau=tau,
                         action_noise=action_noise, param_noise=param_noise)

    def step(self, obs):
        super(DDPGAgent, self).step(obs)
        acts, q = self.ddpg.pi(obs, apply_noise=True, compute_Q=True)
        # Move distribution from [-1, 1] to [0, 2] and convert to z-score
        actions_z = (2 - (acts + 1)) / 2
        return actions_z, q

    def reset(self):
        super(DDPGAgent, self).reset()
        self.ddpg.reset()

    def initialize(self, sess):
        super(DDPGAgent, self).initialize(sess)
        self.ddpg.initialize(sess)

    def store_transition(self, obs, action, r, new_obs, done):
        super(DDPGAgent, self).store_transition(obs, action, r, new_obs, done)
        self.ddpg.store_transition(obs, action, r, new_obs, done)

    def train(self):
        super(DDPGAgent, self).train()
        return self.ddpg.train()

    def adapt_param_noise(self):
        super(DDPGAgent, self).adapt_param_noise()
        return self.ddpg.adapt_param_noise()

    def backprop(self):
        super(DDPGAgent, self).backprop()
        self.ddpg.update_target_net()

    def get_memory_size(self):
        super(DDPGAgent, self).get_memory_size()
        return self.memory.nb_entries

    @property
    def action_spec(self):
        return self.action_spec_internal

    @property
    def obs_shape(self):
        return self.obs_dim
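A minimal driving-loop sketch (not from the original source) showing how the DDPGAgent above could be used end to end; the environment handle, the run_ddpg_agent name, and the 64-sample warm-up threshold are assumptions.

import tensorflow as tf

def run_ddpg_agent(env, obs_shape, nb_actions, action_spec, nb_steps=1000):
    agent = DDPGAgent()
    agent.setup(obs_shape, nb_actions, action_spec, noise_type='adaptive-param_0.2')
    with tf.Session() as sess:
        agent.initialize(sess)
        agent.reset()
        obs = env.reset()
        for _ in range(nb_steps):
            action, q = agent.step(obs)
            new_obs, reward, done, _ = env.step(action)
            agent.store_transition(obs, action, reward, new_obs, done)
            obs = new_obs
            if done:
                agent.reset()
                obs = env.reset()
            # Start training once the replay buffer holds a batch (assumed threshold).
            if agent.get_memory_size() >= 64:
                agent.train()
                agent.backprop()  # soft-updates the target networks
    return agent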
def run(self):
    """Override Process.run()"""
    # Create environment
    env = create_environment(
        action_repeat=self.action_repeat,
        full=self.full,
        exclude_centering_frame=self.exclude_centering_frame,
        visualize=self.visualize,
        fail_reward=self.fail_reward,
        integrator_accuracy=self.integrator_accuracy)
    nb_actions = env.action_space.shape[-1]

    # keep track of the number of trajectories done
    num_traj = 0

    env.seed(os.getpid())
    set_global_seeds(os.getpid())

    # Create OU Noise
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=0.2, theta=0.1)

    # Allocate ReplayBuffer
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)

    # Create DDPG agent
    agent = DDPG(self.actor, self.critic, memory,
                 env.observation_space.shape, env.action_space.shape,
                 gamma=self.gamma, tau=self.tau,
                 normalize_returns=self.normalize_returns,
                 normalize_observations=self.normalize_observations,
                 batch_size=self.batch_size, action_noise=action_noise,
                 param_noise=self.param_noise,
                 critic_l2_reg=self.critic_l2_reg,
                 enable_popart=self.popart, clip_norm=self.clip_norm,
                 reward_scale=self.reward_scale)

    # Build the sampling logic fn
    sampling_fn = make_sampling_fn(agent, env, self.episode_length,
                                   self.action_repeat, self.max_action,
                                   self.nb_episodes, self.action_noise_prob)

    # Start TF session
    with U.single_threaded_session() as sess:
        agent.initialize(sess)
        set_parameters = U.SetFromFlat(self.actor.trainable_vars)
        # Start sampling-worker loop.
        while True:
            # self.event.wait()  # Wait for a new message
            # self.event.clear()  # Upon message receipt, mark as read
            message, actor_ws = self.inputQ.get()  # Pop message
            if message == 'sample':
                # Set weights
                set_parameters(actor_ws)
                # Do sampling
                transitions = sampling_fn()
                self.outputQ.put((self.process_index, transitions))
                # update number of trajectories
                num_traj += self.nb_episodes
                # restore environment if needed
                if num_traj >= self.max_env_traj:
                    env.restore()
                    num_traj = 0
            elif message == 'exit':
                print('[Worker {}] Exiting...'.format(os.getpid()))
                env.close()
                break
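The worker above consumes ('sample', actor_weights) and ('exit', None) messages from inputQ and publishes (process_index, transitions) tuples on outputQ. A rough sketch of the matching parent-side dispatch, assuming the workers share one output queue and the parent holds the learner's actor in a default TF session, might look like the following (dispatch_sampling_round and output_q are hypothetical names, not from the original code).

from baselines.common import tf_util as U

def dispatch_sampling_round(workers, output_q, actor):
    """Push the current actor weights to every worker and gather their transitions."""
    get_flat = U.GetFlat(actor.trainable_vars)  # flattens the actor parameters
    actor_ws = get_flat()
    for worker in workers:
        worker.inputQ.put(('sample', actor_ws))
    transitions = []
    for _ in workers:
        _index, worker_transitions = output_q.get()  # blocks until a worker reports back
        transitions.extend(worker_transitions)
    return transitions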
import sys

import numpy as np
import tensorflow as tf
from mpi4py import MPI
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from osim.env import ProstheticsEnv

# Settings
remote_base = "http://grader.crowdai.org:1729"
crowdai_token = "8592db9b224e4293d437776321861a32"
client = Client(remote_base)

# Build the agent components. A local env instance, layer_norm, nb_actions and
# a replay Memory are assumed here (they were missing from the original snippet)
# so that the DDPG constructor receives the shapes and memory it expects.
env = ProstheticsEnv(visualize=False)
layer_norm = True
nb_actions = env.action_space.shape[-1]
memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=layer_norm)
actor = Actor(nb_actions, layer_norm=layer_norm)
agent = DDPG(actor, critic, memory,
             env.observation_space.shape, env.action_space.shape)
saver = tf.train.Saver()

# Create environment
observation = client.env_create(crowdai_token)

# IMPLEMENTATION OF YOUR CONTROLLER
# my_controller = ... (for example the one trained in keras_rl)
def my_controller():
    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()
        filename = "/home/vaisakhs_shaj/Desktop/MODEL/MODEL/tfSteps" + str(80000) + ".model"
        saver.restore(sess, filename)
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
             nsteps, ent_coef, vf_coef, max_grad_norm,
             # ddpg related params
             layer_norm=False, tau=0.001, normalize_returns=False,
             normalize_observations=True, batch_size=128, critic_l2_reg=0.,
             actor_lr=1e-4, critic_lr=1e-3, popart=False, clip_norm=10.,
             reward_scale=1.):
    sess = tf.get_default_session()

    act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True)

    # init DDPG
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(ac_space.shape[-1], layer_norm=layer_norm)
    memory = Memory(limit=int(1e6), action_shape=ac_space.shape,
                    observation_shape=ob_space.shape)
    ddpg_agent = DDPG(actor, critic, memory, ob_space.shape, ac_space.shape,
                      gamma=0.99, tau=tau,
                      normalize_returns=normalize_returns,
                      normalize_observations=normalize_observations,
                      batch_size=batch_size, action_noise=None,
                      param_noise=None, critic_l2_reg=critic_l2_reg,
                      actor_lr=actor_lr, critic_lr=critic_lr,
                      enable_popart=popart, clip_norm=clip_norm,
                      reward_scale=reward_scale)
    ddpg_agent.initialize(sess)
    ddpg_agent.reset()

    A = train_model.pdtype.sample_placeholder([None])
    ADV = tf.placeholder(tf.float32, [None])
    R = tf.placeholder(tf.float32, [None])
    OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
    OLDVPRED = tf.placeholder(tf.float32, [None])
    LR = tf.placeholder(tf.float32, [])
    CLIPRANGE = tf.placeholder(tf.float32, [])
    if use_annealing:
        DDPG_AC = tf.placeholder(tf.float32, (None, ) + ac_space.shape)
        DDPG_W = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)
    entropy = tf.reduce_mean(train_model.pd.entropy())

    vpred = train_model.vf
    vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                               -CLIPRANGE, CLIPRANGE)
    vf_losses1 = tf.square(vpred - R)
    vf_losses2 = tf.square(vpredclipped - R)
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
    ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
    pg_losses = -ADV * ratio
    pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
    pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
    approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
    clipfrac = tf.reduce_mean(
        tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
    if use_annealing:
        pi_mean = train_model.pi
        ac_loss = tf.reduce_mean(tf.square(pi_mean - DDPG_AC))

    # loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
    # ----------------- DDPG -----------------
    if use_ddpg:
        loss = pg_loss - entropy * ent_coef
        if use_annealing:
            loss = pg_loss - entropy * ent_coef + ac_loss * DDPG_W
    else:
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
    # ----------------- DDPG -----------------

    with tf.variable_scope('model'):
        params = tf.trainable_variables()
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
    _train = trainer.apply_gradients(grads)

    def train(lr, cliprange, obs, returns, masks, actions, values,
              neglogpacs, states=None, ddpg_acs=None, ddpg_w=0.):
        advs = returns - values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        if not use_annealing:
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values
            }
        else:
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values,
                DDPG_AC: ddpg_acs,
                DDPG_W: ddpg_w
            }
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        if not use_annealing:
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map)[:-1]
        else:
            return sess.run([
                pg_loss, vf_loss, entropy, approxkl, clipfrac, ac_loss, _train
            ], td_map)[:-1]

    if not use_annealing:
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
    else:
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac', 'ac_loss'
        ]

    def save(save_path):
        ps = sess.run(params)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.act_model = act_model
    self.step = act_model.step
    self.value = act_model.value
    self.initial_state = act_model.initial_state
    self.agent = ddpg_agent
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)  # pylint: disable=E1101
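A hypothetical call site for the train closure above when use_annealing is enabled: the DDPG agent stored on the model proposes actions for the PPO minibatch states and ddpg_w weights the imitation term. The helper name, the per-observation loop, and the linear annealing schedule are assumptions, not part of the original code.

import numpy as np

def ppo_update_with_annealing(model, lr, cliprange, obs, returns, masks, actions,
                              values, neglogpacs, update, total_updates):
    # DDPG target actions for every observation in the minibatch.
    ddpg_acs = np.array([model.agent.pi(ob, apply_noise=False, compute_Q=False)[0]
                         for ob in obs])
    # Linearly anneal the imitation weight from 1 to 0 over training (assumed schedule).
    ddpg_w = max(0., 1. - update / float(total_updates))
    return model.train(lr, cliprange, obs, returns, masks, actions, values,
                       neglogpacs, ddpg_acs=ddpg_acs, ddpg_w=ddpg_w)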
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from osim.env import ProstheticsEnv
import sys

env = ProstheticsEnv(visualize=False)
env.change_model(model='3D', difficulty=2, prosthetic=True)

layer_norm = True
nb_actions = 19
memory = Memory(limit=int(1.5e6), action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=layer_norm)
actor = Actor(nb_actions, layer_norm=layer_norm)
agent = DDPG(actor, critic, memory,
             env.observation_space.shape, env.action_space.shape, gamma=0.99)
saver = tf.train.Saver()

# IMPLEMENTATION OF YOUR CONTROLLER
# my_controller = ... (for example the one trained in keras_rl)
sess = tf.InteractiveSession()
agent.initialize(sess)
sess.graph.finalize()
agent.reset()
filename = "/home/vaisakhs_shaj/Desktop/MODEL/tfSteps" + str(10000) + ".model"
saver.restore(sess, filename)
observation = env.reset()
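A short continuation sketch (not in the original snippet): roll the restored policy out locally for one episode. The clip to [0, 1] reflects the ProstheticsEnv muscle-activation range and is an assumption here; agent.pi and env.step are used exactly as in the snippets above.

import numpy as np

done = False
total_reward = 0.
while not done:
    action, _ = agent.pi(observation, apply_noise=False, compute_Q=False)
    # ProstheticsEnv expects activations in [0, 1].
    observation, reward, done, info = env.step(np.clip(action, 0., 1.))
    total_reward += reward
print("episode reward:", total_reward)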
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, saved_model_basename, restore_model_name, crowdai_client, crowdai_token, reward_shaping, feature_embellishment, relative_x_pos, relative_z_pos, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. saved_model_dir = 'saved-models/' if saved_model_basename is None: saved_model_basename = ''.join( random.choices(string.ascii_lowercase + string.digits, k=8)) saved_model_path = saved_model_dir + saved_model_basename if restore_model_name: restore_model_path = restore_model_name if not pathlib.Path(restore_model_path + '.index').is_file(): restore_model_path = saved_model_dir + restore_model_name max_to_keep = 500 eval_reward_threshold_to_keep = 300 saver = tf.train.Saver(max_to_keep=max_to_keep) adam_optimizer_store = dict() adam_optimizer_store['actor_optimizer'] = dict() adam_optimizer_store['critic_optimizer'] = dict() #eval_episode_rewards_history = deque(maxlen=100) #episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: try: if restore_model_name: logger.info("Restoring from model at", restore_model_path) #saver.restore(sess, tf.train.latest_checkpoint(model_path)) saver.restore(sess, restore_model_path) else: logger.info("Creating new model") sess.run(tf.global_variables_initializer( )) # this should happen here and not in the agent right? except InvalidArgumentError as exc: if "Assign requires shapes of both tensors to match." in str(exc): print("Unable to restore model from {:s}.".format( restore_model_path)) print( "Chances are you're trying to restore a model with reward embellishment into an environment without reward embellishment (or vice versa). Unfortunately this isn't supported (yet)." ) print(exc.message) sys.exit() else: raise exc # Prepare everything. 
agent.initialize(sess) sess.graph.finalize() agent.reset() # restore adam optimizer try: if restore_model_name: logger.info("Restoring pkl file with adam state", restore_model_path) #saver.restore(sess, tf.train.latest_checkpoint(model_path)) adam_optimizer_store = pickle.load( open(restore_model_path + ".pkl", "rb")) agent.actor_optimizer.m = adam_optimizer_store[ 'actor_optimizer']['m'] agent.actor_optimizer.v = adam_optimizer_store[ 'actor_optimizer']['v'] agent.actor_optimizer.t = adam_optimizer_store[ 'actor_optimizer']['t'] agent.critic_optimizer.m = adam_optimizer_store[ 'critic_optimizer']['m'] agent.critic_optimizer.v = adam_optimizer_store[ 'critic_optimizer']['v'] agent.critic_optimizer.t = adam_optimizer_store[ 'critic_optimizer']['t'] if 'param_noise' in adam_optimizer_store: agent.param_noise = adam_optimizer_store['param_noise'] except: print("Unable to restore adam state from {:s}.".format( restore_model_path)) obs = env.reset() done = False episode_reward = 0. #episode_step = 0 #episodes = 0 #t = 0 #epoch_episode_steps = [] #epoch_episode_eval_rewards = [] #epoch_episode_eval_steps = [] #epoch_start_time = time.time() #epoch_actions = [] #epoch_episodes = 0 for epoch in range(nb_epochs): start_time = time.time() epoch_episode_rewards = [] epoch_qs = [] eval_episode_rewards = [] eval_qs = [] eval_steps = [] epoch_actor_losses = [] epoch_critic_losses = [] worth_keeping = False for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape #new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) new_obs, r, done, info = env.step(action) #t += 1 if rank == 0 and render: env.render() episode_reward += r #episode_step += 1 # Book-keeping. #epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) #episode_rewards_history.append(episode_reward) #epoch_episode_steps.append(episode_step) episode_reward = 0. #episode_step = 0 #epoch_episodes += 1 #episodes += 1 agent.reset() obs = env.reset() # Train. #epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() #epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Submit to crowdai competition. What a hack. 
:) #if crowdai_client is not None and crowdai_token is not None and eval_env is not None: crowdai_submit_count = 0 if crowdai_client is not None and crowdai_token is not None: eval_obs_dict = crowdai_client.env_create( crowdai_token, env_id="ProstheticsEnv") eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation( eval_obs_dict, reward_shaping=reward_shaping, reward_shaping_x=1., feature_embellishment=feature_embellishment, relative_x_pos=relative_x_pos, relative_z_pos=relative_z_pos) while True: action, _ = agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False) submit_action = prosthetics_env.openai_to_crowdai_submit_action( action) clipped_submit_action = np.clip(submit_action, 0., 1.) actions_equal = clipped_submit_action == submit_action if not np.all(actions_equal): logger.debug("crowdai_submit_count:", crowdai_submit_count) logger.debug(" openai-action:", action) logger.debug(" submit-action:", submit_action) crowdai_submit_count += 1 [eval_obs_dict, reward, done, info] = crowdai_client.env_step( clipped_submit_action.tolist(), True) #[eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True) eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation( eval_obs_dict, reward_shaping=reward_shaping, reward_shaping_x=1., feature_embellishment=feature_embellishment, relative_x_pos=relative_x_pos, relative_z_pos=relative_z_pos) if done: logger.debug("done: crowdai_submit_count:", crowdai_submit_count) eval_obs_dict = crowdai_client.env_reset() if not eval_obs_dict: break logger.debug( "done: eval_obs_dict exists after reset") eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation( eval_obs_dict, reward_shaping=reward_shaping, reward_shaping_x=1., feature_embellishment=feature_embellishment, relative_x_pos=relative_x_pos, relative_z_pos=relative_z_pos) crowdai_client.submit() return # kids, don't try any of these (expedient hacks) at home! if eval_env: eval_episode_reward_mean, eval_q_mean, eval_step_mean = evaluate_n_episodes( 3, eval_env, agent, nb_eval_steps, render_eval) if eval_episode_reward_mean >= eval_reward_threshold_to_keep: worth_keeping = True mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time if nb_epochs and nb_epoch_cycles and nb_train_steps > 0: #stats = agent.get_stats() #combined_stats = stats.copy() combined_stats = {} combined_stats['train/epoch_episode_reward_mean'] = np.mean( epoch_episode_rewards) #combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) #combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) #combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['train/epoch_Q_mean'] = np.mean(epoch_qs) combined_stats['train/epoch_loss_actor'] = np.mean( epoch_actor_losses) combined_stats['train/epoch_loss_critic'] = np.mean( epoch_critic_losses) #combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['train/epoch_duration'] = duration #combined_stats['epoch/steps_per_second'] = float(t) / float(duration) #combined_stats['total/episodes'] = episodes #combined_stats['rollout/episodes'] = epoch_episodes #combined_stats['rollout/actions_std'] = np.std(epoch_actions) #combined_stats['memory/rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss else: combined_stats = {} # Evaluation statistics. 
if eval_env: combined_stats[ 'eval/epoch_episode_reward_mean'] = eval_episode_reward_mean # np.mean(eval_episode_rewards) #combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) #combined_stats['eval/epoch_episode_reward_std'] = np.std(eval_episode_rewards) combined_stats[ 'eval/epoch_Q_mean'] = eval_q_mean # np.mean(eval_qs) #combined_stats['eval/episodes'] = len(eval_episode_rewards) combined_stats[ 'eval/steps_mean'] = eval_step_mean # np.mean(eval_steps) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. #combined_stats['total/epochs'] = epoch + 1 #combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.info('') logger.info('Epoch', epoch) logger.dump_tabular() logdir = logger.get_dir() if worth_keeping and rank == 0 and nb_epochs and nb_epoch_cycles and nb_train_steps and nb_rollout_steps: logger.info( 'Saving model to', saved_model_dir + saved_model_basename + '-' + str(epoch)) saver.save(sess, saved_model_path, global_step=epoch, write_meta_graph=False) adam_optimizer_store['actor_optimizer'][ 'm'] = agent.actor_optimizer.m adam_optimizer_store['actor_optimizer'][ 'v'] = agent.actor_optimizer.v adam_optimizer_store['actor_optimizer'][ 't'] = agent.actor_optimizer.t adam_optimizer_store['critic_optimizer'][ 'm'] = agent.critic_optimizer.m adam_optimizer_store['critic_optimizer'][ 'v'] = agent.critic_optimizer.v adam_optimizer_store['critic_optimizer'][ 't'] = agent.critic_optimizer.t adam_optimizer_store['param_noise'] = agent.param_noise pickle.dump( adam_optimizer_store, open((saved_model_path + "-" + str(epoch) + ".pkl"), "wb")) old_epoch = epoch - max_to_keep if old_epoch >= 0: try: os.remove(saved_model_path + "-" + str(old_epoch) + ".pkl") except OSError: pass if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
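The epoch loop above reports eval_episode_reward_mean, eval_q_mean and eval_step_mean from evaluate_n_episodes, which is not included in this snippet. A minimal sketch of what such a helper could look like, assuming the agent.pi / eval_env.step interfaces used elsewhere in this file (the name and return order come from the call site; the body is an assumption):

import numpy as np

def evaluate_n_episodes(n_episodes, eval_env, agent, nb_eval_steps, render_eval=False):
    # Hypothetical helper: run noise-free rollouts and average reward, Q and episode length.
    episode_rewards, episode_qs, episode_steps = [], [], []
    for _ in range(n_episodes):
        obs = eval_env.reset()
        total_reward, qs, steps = 0., [], 0
        for _ in range(nb_eval_steps):
            action, q = agent.pi(obs, apply_noise=False, compute_Q=True)
            obs, r, done, info = eval_env.step(action)
            if render_eval:
                eval_env.render()
            total_reward += r
            qs.append(q)
            steps += 1
            if done:
                break
        episode_rewards.append(total_reward)
        episode_qs.append(np.mean(qs))
        episode_steps.append(steps)
    return np.mean(episode_rewards), np.mean(episode_qs), np.mean(episode_steps)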
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, teacher, tau=0.01, eval_env=True, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() t = datetime.now().strftime('%H-%M') PATH = 'results/ddpg'.format(t) #assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 agent.restore_model(PATH) for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action) eval_env.background = get_q_background(eval_env, agent.q, eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean(epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean(epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards)) # Total statistics. combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean(float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t agent.save_model(PATH, epoch) for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
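This trainer, like several others below, aggregates its epoch statistics with mpi_mean, mpi_std and mpi_sum, which come from the utility module of the older baselines DDPG code and are not defined in this file. A rough sketch of equivalent reductions, assuming mpi4py is available; the originals may differ in detail:

import numpy as np
from mpi4py import MPI

def _mpi_sum_and_count(values):
    # Sum the local values and their count across all MPI workers.
    local = np.asarray(values, dtype=np.float64).flatten()
    buf = np.array([local.sum(), local.size], dtype=np.float64)
    out = np.zeros_like(buf)
    MPI.COMM_WORLD.Allreduce(buf, out, op=MPI.SUM)
    return out[0], out[1]

def mpi_mean(values):
    total, count = _mpi_sum_and_count(values)
    return total / max(count, 1)

def mpi_sum(values):
    total, _ = _mpi_sum_and_count(values)
    return total

def mpi_std(values):
    # Crude sketch: average the per-worker standard deviations.
    local_std = float(np.std(np.asarray(values, dtype=np.float64))) if np.size(values) else 0.
    return mpi_mean([local_std])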
def train_return(env, param_noise, actor, critic, memory, nb_epochs=250, nb_epoch_cycles=20, reward_scale=1., render=False, normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, action_noise=None, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, nb_rollout_steps=2048, batch_size=64, tau=0.01, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) # Set up logging stuff only for a single worker. episode_rewards_history = deque(maxlen=100) # Reuse the caller's TF session if one is active; otherwise open one that stays alive so the returned agent remains usable. sess = tf.get_default_session() or tf.InteractiveSession() # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): print('epoch number:', epoch) for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() return agent
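train_return builds, trains, and returns the agent in the caller's TensorFlow session rather than managing its own. A minimal usage sketch, assuming a Gym-style env with symmetric actions and the Actor/Critic/Memory classes used throughout this file (the hyperparameters are placeholders):

# Hypothetical usage of train_return: construct the components it expects,
# train in-process, then run the returned greedy policy for one episode.
memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=True)
actor = Actor(env.action_space.shape[-1], layer_norm=True)

agent = train_return(env, param_noise=None, actor=actor, critic=critic,
                     memory=memory, nb_epochs=5)

obs, done = env.reset(), False
while not done:
    action, _ = agent.pi(obs, apply_noise=False, compute_Q=False)
    obs, reward, done, _ = env.step(env.action_space.high * action)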
def train(env, nb_epochs, nb_episodes, nb_epoch_cycles, episode_length, nb_train_steps, eval_freq, save_freq, nb_eval_episodes, actor, critic, memory, gamma, normalize_returns, normalize_observations, critic_l2_reg, action_noise, param_noise, popart, clip_norm, batch_size, reward_scale, action_repeat, full, exclude_centering_frame, visualize, fail_reward, num_processes, num_processes_to_wait, num_testing_processes, learning_session, min_buffer_length, integrator_accuracy=5e-5, max_env_traj=100, tau=0.01): """ Parameters ---------- nb_epochs : the number of epochs to train. nb_episodes : the number of episodes for each epoch. episode_length : the maximum number of steps for each episode. gamma : discount factor. tau : soft update coefficient. clip_norm : clip on the norm of the gradient. """ assert action_repeat > 0 assert nb_episodes >= num_processes # Get params from learning session checkpoint_dir = learning_session.checkpoint_dir log_dir = learning_session.log_dir training_step = learning_session.last_training_step # Initialize DDPG agent (target network and replay buffer) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=None, critic_l2_reg=critic_l2_reg, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, training_step=training_step) # We need max_action because the NN output layer is a tanh. # So we must scale it back. max_action = env.action_space.high # Build Workers events = [Event() for _ in range(num_processes)] inputQs = [Queue() for _ in range(num_processes)] outputQ = Queue() # Split work among workers nb_episodes_per_worker = nb_episodes // num_processes workers = [ SamplingWorker(i, actor, critic, episode_length, nb_episodes_per_worker, action_repeat, max_action, gamma, tau, normalize_returns, batch_size, normalize_observations, param_noise, critic_l2_reg, popart, clip_norm, reward_scale, events[i], inputQs[i], outputQ, full, exclude_centering_frame, integrator_accuracy, max_env_traj, visualize, fail_reward) for i in range(num_processes) ] # Run the Workers for w in workers: w.start() # Create Round Robin tester tester = RoundRobinTester( num_testing_processes, actor, critic, episode_length, nb_eval_episodes, action_repeat, max_action, gamma, tau, normalize_returns, batch_size, normalize_observations, critic_l2_reg, popart, clip_norm, reward_scale, full, exclude_centering_frame, integrator_accuracy, max_env_traj, visualize, fail_reward) # Start training loop with U.single_threaded_session() as sess: agent.initialize(sess) writer = tf.summary.FileWriter(log_dir) writer.add_graph(sess.graph) # Initialize writer and statistics stats = EvaluationStatistics(tf_session=sess, tf_writer=writer) # setup saver saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=2) get_parameters = U.GetFlat(actor.trainable_vars) global_step = 0 obs = env.reset() agent.reset() # Processes waiting for a new sampling task waiting_indices = [i for i in range(num_processes)] for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # If we have sampling workers waiting, dispatch a sampling job if waiting_indices: actor_ws = get_parameters() # Run parallel sampling for i in waiting_indices: inputQs[i].put(('sample', actor_ws)) events[i].set() # Notify worker: sample baby, sample! 
waiting_indices.clear() # Collect results when ready for i in range(num_processes_to_wait): process_index, transitions = outputQ.get() waiting_indices.append(process_index) print('Collecting transition samples from Worker {}/{}'.format(i + 1, num_processes_to_wait)) for t in transitions: agent.store_transition(*t) # try to collect other samples if available for i in range(num_processes): try: process_index, transitions = outputQ.get_nowait() if process_index not in waiting_indices: waiting_indices.append(process_index) print('Collecting transition samples from Worker {}'.format(process_index)) for t in transitions: agent.store_transition(*t) except queue.Empty: # No sampling ready, keep on training. pass # Training phase if agent.memory.nb_entries > min_buffer_length: for _ in range(nb_train_steps): critic_loss, actor_loss = agent.train() agent.update_target_net() # Plot statistics stats.add_critic_loss(critic_loss, global_step) stats.add_actor_loss(actor_loss, global_step) global_step += 1 # Evaluation phase if cycle % eval_freq == 0: print("Cycle number: ", cycle + epoch * nb_epoch_cycles) print("Sending testing job...") actor_ws = get_parameters() # Send a testing job tester.test(actor_ws, global_step) # Print stats (if any) tester.log_stats(stats, logger) if cycle % save_freq == 0: # Save weights save_path = saver.save(sess, checkpoint_dir, global_step=global_step) print("Model saved in path: %s" % save_path) # Dump learning session learning_session.dump(agent.training_step) print("Learning session dumped to: %s" % str(learning_session.session_path)) else: print("Not enough entries in memory buffer") # Stop workers for i in range(num_processes): inputQs[i].put(('exit', None)) events[i].set() # Notify worker: exit! tester.close() # Stop testing workers env.close()
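The multi-process trainer above drives its SamplingWorker processes through a simple protocol: the master puts ('sample', actor_weights) or ('exit', None) on a worker's input queue and sets that worker's event, and the worker answers on the shared output queue with (process_index, transitions). The SamplingWorker class itself is not shown; a minimal sketch of the worker-side loop implied by that protocol (rollout_fn and set_actor_weights_fn are hypothetical stand-ins for the worker's internals):

def worker_loop(process_index, event, input_queue, output_queue, set_actor_weights_fn, rollout_fn):
    # Hypothetical run loop matching the dispatch code above.
    while True:
        event.wait()      # block until the master signals a new job
        event.clear()
        command, payload = input_queue.get()
        if command == 'exit':
            break
        if command == 'sample':
            set_actor_weights_fn(payload)   # load the latest actor parameters
            transitions = rollout_fn()      # list of (obs, action, r, new_obs, done) tuples
            output_queue.put((process_index, transitions))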
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() #print(np.abs(env.action_space.low)) #print(np.abs(env.action_space.high)) #assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high print(env.action_space) print(env.observation_space) #logger.info('scaling actions by {} before executing in env'.format(max_action)) if load_memory: memory=pickle.load(open("/home/vaisakhs_shaj/Desktop/BIG-DATA/memory1000000.pickle","rb")) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) ''' # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None ''' saver=tf.train.Saver() step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=10) with U.make_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() if restore: filename="/home/vaisakhs_shaj/Desktop/MODEL/tfSteps"+str(120000)+".model" saver.restore(sess,filename) obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() tr=0 s=0 while True: action=agent.pi(obs, apply_noise=False, compute_Q=False)[0] obs, r, done, info = env.step(action) tr=tr+r s=s+1 print(r) if done: print(tr) obs=env.reset() tr=0 print(s) break
def evaluate(env, nb_episodes, reward_scale, render, param_noise, action_noise, actor, critic, memory, critic_l2_reg, normalize_returns=False, normalize_observations=True, weight_file=None): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, normalize_returns=normalize_returns, normalize_observations=normalize_observations, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) with U.single_threaded_session() as sess: agent.initialize(sess) if weight_file: saver = tf.train.Saver(actor.trainable_vars + critic.trainable_vars) saver.restore(sess, weight_file) agent.actor_optimizer.sync() agent.critic_optimizer.sync() # sess.graph.finalize() agent.reset() obs = env.reset() total_reward = 0.0 max_steps = 2000 for ep in range(nb_episodes): i = 0 done = False episode_reward = 0.0 while not done and i < max_steps: action, q, all_actions, sample = agent.pi(obs, apply_noise=False, compute_Q=True) assert action.shape == env.action_space.shape assert max_action.shape == action.shape obs, r, done, info = env.step(max_action * action) episode_reward += r # env.render() # print('Action:{}, reward:{}'.format(action, r)) # time.sleep(0.1) i += 1 total_reward += episode_reward logger.info("Episode:{}, reward:{}, steps:{}".format( ep, episode_reward, i)) if done: obs = env.reset() logger.info("Average reward:{}, total reward:{}, episodes:{}".format( (total_reward / nb_episodes), total_reward, nb_episodes))
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): """ Runs the training of the Deep Deterministic Policy Gradien (DDPG) model DDPG: https://arxiv.org/pdf/1509.02971.pdf :param env: (Gym Environment) the environment :param nb_epochs: (int) the number of training epochs :param nb_epoch_cycles: (int) the number cycles within each epoch :param render_eval: (bool) enable rendering of the evalution environment :param reward_scale: (float) the value the reward should be scaled by :param render: (bool) enable rendering of the environment :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None) :param actor: (TensorFlow Tensor) the actor model :param critic: (TensorFlow Tensor) the critic model :param normalize_returns: (bool) should the critic output be normalized :param normalize_observations: (bool) should the observation be normalized :param critic_l2_reg: (float) l2 regularizer coefficient :param actor_lr: (float) the actor learning rate :param critic_lr: (float) the critic learning rate :param action_noise: (ActionNoise) the action noise type (can be None) :param popart: (bool) enable pop-art normalization of the critic output (https://arxiv.org/pdf/1602.07714.pdf) :param gamma: (float) the discount rate :param clip_norm: (float) clip the gradients (disabled if None) :param nb_train_steps: (int) the number of training steps :param nb_rollout_steps: (int) the number of rollout steps :param nb_eval_steps: (int) the number of evalutation steps :param batch_size: (int) the size of the batch for learning the policy :param memory: (Memory) the replay buffer :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) :param eval_env: (Gym Environment) the evaluation environment (can be None) :param param_noise_adaption_interval: (int) apply param noise every N steps """ rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, param_noise=param_noise, action_noise=action_noise, gamma=gamma, tau=tau, normalize_returns=normalize_returns, enable_popart=popart, normalize_observations=normalize_observations, batch_size=batch_size, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: tf.train.Saver() eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with tf_util.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() episode_reward = 0. episode_step = 0 episodes = 0 step = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for _ in range(nb_epoch_cycles): # Perform rollouts. 
for _ in range(nb_rollout_steps): # Predict next action. action, q_value = agent.policy(obs, apply_noise=True, compute_q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) new_obs, reward, done, _ = env.step(max_action * action) step += 1 if rank == 0 and render: env.render() episode_reward += reward episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q_value) agent.store_transition(obs, action, reward, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) critic_loss, actor_loss = agent.train() epoch_critic_losses.append(critic_loss) epoch_actor_losses.append(actor_loss) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for _ in range(nb_eval_steps): eval_action, eval_q = agent.policy(eval_obs, apply_noise=False, compute_q=True) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_obs, eval_r, eval_done, _ = eval_env.step( max_action * eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(step) / float( duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. 
if eval_env is not None: combined_stats['eval/return'] = np.mean(eval_episode_rewards) combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = np.mean(eval_qs) combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(scalar): """ check and return the input if it is a scalar, otherwise raise ValueError :param scalar: (Any) the object to check :return: (Number) the scalar if the input is a scalar """ if isinstance(scalar, np.ndarray): assert scalar.size == 1 return scalar[0] elif np.isscalar(scalar): return scalar else: raise ValueError('expected scalar, got %s' % scalar) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = step for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler: pickle.dump(env.get_state(), file_handler) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler: pickle.dump(eval_env.get_state(), file_handler)
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, perform=False, expert=None, save_networks=False, supervise=False, pre_epoch=60, actor_only=False, critic_only=False, both_ours_sup=False, gail=False, pofd=False): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, expert=expert, save_networks=save_networks, supervise=supervise, actor_only=actor_only, critic_only=critic_only, both_ours_sup=both_ours_sup, gail=gail, pofd=pofd) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. network_saving_dir = os.path.join('./saved_networks', env.env.spec.id) + '/' if not os.path.exists(network_saving_dir): os.makedirs(network_saving_dir) agent.initialize(sess, saver, network_saving_dir, 10000, 30000) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() if expert is None: pretrain = False else: pretrain = True done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 small_buffer = [] big_buffer = [] for epoch in range(nb_epochs): if epoch >= pre_epoch and pretrain: pretrain = False logger.info('Stoped pretrain at epoch {}'.format(epoch)) for cycle in range(nb_epoch_cycles): if not perform: # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. 
episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train(pretrain) epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): old_eval_obs = eval_obs eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if perform: small_buffer.append([ old_eval_obs, eval_action, eval_r, eval_obs, eval_done ]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. if perform and len(small_buffer) > 0: big_buffer.append(small_buffer) small_buffer = [] if len(big_buffer ) > 0 and len(big_buffer) % 1000 == 0: expert_dir = os.path.join( './expert', env.env.spec.id) + '/' if not os.path.exists(expert_dir): os.makedirs(expert_dir) pwritefile = open( os.path.join(expert_dir, 'expert.pkl'), 'wb') pickle.dump(big_buffer, pwritefile, -1) pwritefile.close() logger.info('Expert data saved!') return # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time combined_stats = {} if not perform: stats = agent.get_stats() for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. if not perform: combined_stats['rollout/return'] = mpi_mean( epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean( epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean( epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean( epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) if not perform: # Total statistics. 
combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
cm_pos = [state_desc["misc"]["mass_center_pos"][i] - pelvis[i] for i in range(2)] res = res + cm_pos + state_desc["misc"]["mass_center_vel"] + state_desc["misc"]["mass_center_acc"] return res # Settings remote_base = "http://grader.crowdai.org:1729" crowdai_token = "01342e360022c2def5c2cc04c5843381" Client = Client(remote_base) layer_norm = True nb_actions = 19 # ProstheticsEnv: 158-dim observation, 19-dim action memory = Memory(limit=int(1.5e6), action_shape=(nb_actions,), observation_shape=(158,)) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) agent = DDPG(actor, critic, memory, (158,), (nb_actions,), gamma=0.99) saver = tf.train.Saver() # IMPLEMENTATION OF YOUR CONTROLLER # my_controller = ... (for example the one trained in keras_rl) sess = tf.InteractiveSession() agent.initialize(sess) sess.graph.finalize() agent.reset() filename = "/home/vaisakhs_shaj/Desktop/MODEL/tfSteps" + str(30000) + ".model" saver.restore(sess, filename) # Create environment observation = Client.env_create(env_id="ProstheticsEnv", token=crowdai_token) #print([n.name for n in tf.get_default_graph().as_graph_def().node])
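The snippet above restores the agent and opens a crowdAI environment with Client.env_create, but the stepping and submission loop is missing. A minimal sketch of how it could continue, assuming the same Client API (env_step, env_reset, submit) used in the competition code earlier in this section; project_observation is a hypothetical placeholder for whatever dict-to-vector preprocessing this agent was trained with:

# Hypothetical continuation of the submission script above.
obs = project_observation(observation)
while True:
    action, _ = agent.pi(obs, apply_noise=False, compute_Q=False)
    [observation, reward, done, info] = Client.env_step(action.tolist(), True)
    if done:
        observation = Client.env_reset()
        if not observation:
            break
    obs = project_observation(observation)
Client.submit()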
def train(env, nb_epochs, nb_episodes, episode_length, nb_train_steps, eval_freq, nb_eval_episodes, actor, critic, memory, gamma, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, clip_norm, batch_size, reward_scale, tau=0.01): """ Parameters ---------- nb_epochs : the number of epochs to train. nb_episodes : the number of episodes for each epoch. episode_length : the maximum number of steps for each episode. gamma : discount factor. tau : soft update coefficient. clip_norm : clip on the norm of the gradient. """ # Initialize DDPG agent (target network and replay buffer) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=None, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) # We need max_action because the NN output layer is a tanh. # So we must scale it back. max_action = env.action_space.high with U.single_threaded_session() as sess: agent.initialize(sess) # Setup summary writer writer = _setup_tf_summary() writer.add_graph(sess.graph) stats = EvaluationStatistics(tf_session=sess, tf_writer=writer) sess.graph.finalize() global_step = 0 obs = env.reset() agent.reset() for epoch in range(nb_epochs): for episode in range(nb_episodes): obs = env.reset() # Generate a trajectory for t in range(episode_length): # Select action a_t according to current policy and # exploration noise a_t, _ = agent.pi(obs, apply_noise=True, compute_Q=False) assert a_t.shape == env.action_space.shape # Execute action a_t and observe reward r_t and next state s_{t+1} new_obs, r_t, done, info = env.step(max_action * a_t) # Store transition in the replay buffer agent.store_transition(obs, a_t, r_t, new_obs, done) obs = new_obs if done: agent.reset() obs = env.reset() break # End episode # Training phase for t_train in range(nb_train_steps): critic_loss, actor_loss = agent.train() agent.update_target_net() # Plot statistics stats.add_critic_loss(critic_loss, global_step) stats.add_actor_loss(actor_loss, global_step) global_step += 1 # Evaluation phase if episode % eval_freq == 0: # Generate evaluation trajectories for eval_episode in range(nb_eval_episodes): obs = env.reset() for t in range(episode_length): env.render() # Select action a_t according to current policy and # exploration noise a_t, _ = agent.pi(obs, apply_noise=False, compute_Q=False) assert a_t.shape == env.action_space.shape # Execute action a_t and observe reward r_t and next state s_{t+1} obs, r_t, eval_done, info = env.step(max_action * a_t) stats.add_reward(r_t) if eval_done: obs = env.reset() break # Plot average reward stats.plot_reward(global_step)
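This trainer and the multi-process one above both log through an EvaluationStatistics object wrapped around a tf.summary.FileWriter; the class is not included here. A minimal sketch consistent with the methods called in this file (add_critic_loss, add_actor_loss, add_reward, plot_reward), assuming TF1-style summaries; the real class may track more than this:

import tensorflow as tf

class EvaluationStatistics:
    # Hypothetical sketch of the statistics helper used by the training loops above.
    def __init__(self, tf_session, tf_writer):
        self.sess = tf_session
        self.writer = tf_writer
        self.rewards = []

    def _scalar(self, tag, value, global_step):
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=float(value))])
        self.writer.add_summary(summary, global_step)

    def add_critic_loss(self, loss, global_step):
        self._scalar('train/critic_loss', loss, global_step)

    def add_actor_loss(self, loss, global_step):
        self._scalar('train/actor_loss', loss, global_step)

    def add_reward(self, reward):
        self.rewards.append(reward)

    def plot_reward(self, global_step):
        # Average the rewards collected since the last call and reset the buffer.
        if self.rewards:
            self._scalar('eval/mean_reward', sum(self.rewards) / len(self.rewards), global_step)
            self.rewards = []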
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, overwrite_memory, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, logdir, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, eval_jump, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, agentName=None, resume=0, max_to_keep=100): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver(max_to_keep=max_to_keep) else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) logF = open(os.path.join(logdir, 'log.txt'), 'a') logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a') logReward = open(os.path.join(logdir, 'logReward.txt'), 'a') with U.single_threaded_session() as sess: # Prepare everything. if (resume == 0): agent.initialize(sess, max_to_keep=max_to_keep) else: #restore = "{}-{}".format(agentName,resume) agent.initialize(sess, path=os.path.abspath(logdir), restore=agentName, itr=resume, overwrite=overwrite_memory, max_to_keep=max_to_keep) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(resume, resume + nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. print("Epoch " + str(epoch) + " episodes " + str(episodes) + " steps " + str(episode_step) + " reward " + str(episode_reward)) epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. 
epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None and epoch % eval_jump == 0: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: print("Eval reward " + str(eval_episode_reward)) eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None and epoch % eval_jump == 0: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) # Total statistics. 
combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') # logdir = logger.get_dir() if rank == 0: logReward.write( str(epoch) + "," + str(combined_stats["rollout/return"]) + "\n") logReward.flush() logF.write(str(combined_stats["rollout/return"]) + "\n") json.dump(combined_stats, logStats) logF.flush() logStats.flush() # if not os.path.exists(os.path.abspath(logdir)): # os.makedirs(os.path.abspath(logdir), exist_ok=True) # print("logdir = ", logdir) # with open(os.path.join(logdir, "{}_{}".format(agentName, agent.itr.eval())), 'wb') as f: # pickle.dump(agent, f) agent.save(path=logdir, name=agentName, overwrite=overwrite_memory) logger.info("agent {} saved".format(agent.itr.eval())) if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, save_path=None, restore_path=None, hindsight_mode=None): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. transitions = [] for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) transitions.append((obs, action, r, new_obs, done)) #agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # store regular transitions into replay memory for (obs, action, r, new_obs, done) in transitions: agent.store_transition(obs, action, r, new_obs, done) if hindsight_mode in ['final', 'future']: for (obs, action, r, new_obs, done) in replay_final(transitions, env.env): agent.store_transition(obs, action, r, new_obs, done) if hindsight_mode in ['future']: for (obs, action, r, new_obs, done) in replay_future(transitions, env.env): agent.store_transition(obs, action, r, new_obs, done) # store hindsight transitions. 
'''for i in range(3): # sample a random point in the trajectory idx = np.random.randint(0, len(transitions)) obs, action, r, new_obs, done = transitions[idx] # create a goal from that point goal = env.env.obs_to_goal(new_obs) for (obs, action, r, new_obs, done) in replay_with_goal(transitions[:idx+1], goal, env.env): agent.store_transition(obs, action, r, new_obs, done) obs, action, r, new_obs, done = transitions[-1] # store a "final" transition. goal = env.env.obs_to_goal(new_obs) for (obs, action, r, new_obs, done) in replay_with_goal(transitions, goal, env.env): agent.store_transition(obs, action, r, new_obs, done)''' # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['reward'] = mpi_mean(epoch_episode_rewards) # combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history)) combined_stats['episode_steps'] = mpi_mean(epoch_episode_steps) combined_stats['episodes'] = mpi_sum(epoch_episodes) # combined_stats['actions_mean'] = mpi_mean(epoch_actions) combined_stats['actions_std'] = mpi_std(epoch_actions) combined_stats['Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['policy_loss'] = mpi_mean(epoch_actor_losses) combined_stats['value_loss'] = mpi_mean(epoch_critic_losses) combined_stats['param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/reward'] = mpi_mean(eval_episode_rewards) # combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history)) combined_stats['eval/Q_mean'] = mpi_mean(eval_qs) # combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards)) # Total statistics. 
# combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) # combined_stats['total/episodes'] = mpi_mean(episodes) # combined_stats['total/epochs'] = epoch + 1 # combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
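The hindsight variant above relabels each rollout with replay_final (and replay_future) before storing it; those helpers and the goal interface they rely on are not shown, apart from the obs_to_goal call in the commented-out block. A minimal sketch of the 'final' strategy under assumed env helpers (compute_reward and set_goal are hypothetical names, not confirmed by this file):

def replay_final(transitions, env):
    # Hypothetical HER 'final' relabeling: replay the trajectory as if the last
    # reached state had been the goal. Assumes env.obs_to_goal (seen above) plus
    # hypothetical env.compute_reward and env.set_goal helpers.
    if not transitions:
        return []
    _, _, _, last_obs, _ = transitions[-1]
    goal = env.obs_to_goal(last_obs)
    relabeled = []
    for obs, action, _, new_obs, done in transitions:
        reward = env.compute_reward(env.obs_to_goal(new_obs), goal)
        relabeled.append((env.set_goal(obs, goal), action, reward, env.set_goal(new_obs, goal), done))
    return relabeled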
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, restore=True): rank = MPI.COMM_WORLD.Get_rank() # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. # max_action = env.action_space.high # logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, (env.action_space.shape[0], ), gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, observation_range=(env.observation_space.low[0], env.observation_space.high[0]), action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up saving stuff only for a single worker. savingModelPath = "/home/joel/Documents/saved_models_OpenAI_gym/" if rank == 0: saver = tf.train.Saver(keep_checkpoint_every_n_hours=1) else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. # from https://github.com/openai/baselines/issues/162#issuecomment-397356482 and # https://www.tensorflow.org/api_docs/python/tf/train/import_meta_graph if restore == True: # restoring doesn't actually work logger.info("Restoring from saved model") saver = tf.train.import_meta_graph(savingModelPath + "ddpg_test_model.meta") saver.restore(sess, tf.train.latest_checkpoint(savingModelPath)) else: logger.info("Starting from scratch!") sess.run(tf.global_variables_initializer() ) # this should happen here and not in the agent right? agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 t_rollout = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): start_time_epoch = time.time() for cycle in range(nb_epoch_cycles): start_time_cycle = time.time() # Perform rollouts. for t_rollout in range(nb_rollout_steps): # while(not done): start_time_rollout = time.time() # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) logging.debug("q-value of selected action: {}".format(q)) # np.set_printoptions(precision=3) logging.debug( "selected (unscaled) action: " + str(action)) # e.g. [ 0.04 -0.662 -0.538 0.324] # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) target = scale_range(action, -1, 1, env.action_space.low, env.action_space.high) # Execute next action. if rank == 0 and render: env.render() assert target.shape == env.action_space.shape new_obs, r, done, info = env.step(target) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. 
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done or t_rollout >= nb_rollout_steps - 1:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()
                        # t_rollout += 1
                    logger.info('runtime rollout-step {0}.{1}.{2}: {3}s'.format(
                        epoch, cycle, t_rollout, time.time() - start_time_rollout))
                # for rollout_steps

                # Train.
                logging.info("Training the Agent")
                start_time_train = time.time()
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):  # e.g. 50 iterations
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()  # e.g. 0.7446093559265137
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    logging.debug("critic loss: {}".format(cl))  # e.g. 25.988863
                    logging.debug("actor loss: {}".format(al))   # e.g. -0.008966461
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

                logger.info('runtime training actor & critic: {}s'.format(time.time() - start_time_train))

                # Saving the trained model.
                if saver is not None:
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    saver.save(sess, savingModelPath + "ddpg_test_model")
                    logger.info('runtime saving: {}s'.format(time.time() - start_time_save))

                done = False
                logger.info('runtime epoch-cycle {0}: {1}s'.format(cycle, time.time() - start_time_cycle))
            # for epoch_cycles

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            # Average the stats over all MPI workers.
            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
                logging.info("\t{0} : {1}".format(key, combined_stats[key]))
            logger.dump_tabular()
            logger.info('')

            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # Saving the trained model once per epoch.
            if saver is not None:
                logger.info("saving the trained model")
                start_time_save = time.time()
                saver.save(sess, savingModelPath + "ddpg_model_epochSave", global_step=epoch)
                logger.info('runtime saving: {}s'.format(time.time() - start_time_save))

            logger.info('runtime epoch {0}: {1}s'.format(epoch, time.time() - start_time_epoch))
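# The rollout above maps the actor's tanh output from [-1, 1] into the environment's action
# bounds via scale_range(), which is not defined in this file. Below is a minimal sketch of
# such a helper; the name and signature follow the call site, the linear rescaling is an
# assumption.
import numpy as np

def scale_range(x, x_min, x_max, y_min, y_max):
    """Linearly map x from [x_min, x_max] into [y_min, y_max], elementwise."""
    x = np.asarray(x, dtype=np.float64)
    return y_min + (x - x_min) * (y_max - y_min) / (x_max - x_min)

# Example: scale_range(0.0, -1, 1, env.action_space.low, env.action_space.high)
# returns the midpoint of the action box.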
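# The restore branch above rebuilds the graph with import_meta_graph even though the DDPG
# constructor has already created the same ops, which is the likely reason restoring "doesn't
# actually work". A more conventional pattern (a sketch, not the author's code) keeps the
# graph built by the agent and only restores variable values from the latest checkpoint:
with U.single_threaded_session() as sess:
    agent.initialize(sess)                  # initializes variables for the graph the agent built
    saver = tf.train.Saver()
    latest_ckpt = tf.train.latest_checkpoint(savingModelPath)
    if restore and latest_ckpt is not None:
        saver.restore(sess, latest_ckpt)    # overwrite the initialized values with checkpointed ones
    sess.graph.finalize()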
    def run(self):
        """Override Process.run()"""
        # Create environment.
        env = create_environment(
            action_repeat=self.action_repeat,
            full=self.full,
            exclude_centering_frame=self.exclude_centering_frame,
            visualize=self.visualize,
            fail_reward=self.fail_reward,
            integrator_accuracy=self.integrator_accuracy)
        nb_actions = env.action_space.shape[-1]
        env.seed(os.getpid())
        set_global_seeds(os.getpid())
        num_traj = 0

        # Allocate replay buffer.
        memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)

        # Create DDPG agent.
        agent = DDPG(self.actor, self.critic, memory, env.observation_space.shape,
                     env.action_space.shape, gamma=self.gamma, tau=self.tau,
                     normalize_returns=self.normalize_returns,
                     normalize_observations=self.normalize_observations,
                     batch_size=self.batch_size, action_noise=None, param_noise=None,
                     critic_l2_reg=self.critic_l2_reg, enable_popart=self.popart,
                     clip_norm=self.clip_norm, reward_scale=self.reward_scale)

        # Build the testing logic fn (see the make_testing_fn sketch after this method).
        testing_fn = make_testing_fn(agent, env, self.episode_length, self.action_repeat,
                                     self.max_action, self.nb_episodes)

        # Start TF session.
        with U.single_threaded_session() as sess:
            agent.initialize(sess)
            set_parameters = U.SetFromFlat(self.actor.trainable_vars)
            # Start sampling-worker loop.
            while True:
                message, actor_ws, global_step = self.inputQ.get()  # Pop message
                if message == 'test':
                    # Set weights.
                    set_parameters(actor_ws)
                    # Do testing.
                    rewards, step_times, distances, episode_lengths = testing_fn()
                    self.outputQ.put((rewards, step_times, distances, episode_lengths, global_step))
                    # Update number of trajectories.
                    num_traj += self.nb_episodes
                    # Restore environment if needed.
                    if num_traj >= self.max_env_traj:
                        env.restore()
                        num_traj = 0
                elif message == 'exit':
                    print('[Worker {}] Exiting...'.format(os.getpid()))
                    env.close()
                    break
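# make_testing_fn() is referenced above but not defined in this file. A minimal sketch of a
# closure with the same call signature, assuming it runs nb_episodes deterministic rollouts
# and returns the four lists unpacked by the worker loop; the reward/step-time/distance
# bookkeeping details are assumptions.
import time
import numpy as np

def make_testing_fn(agent, env, episode_length, action_repeat, max_action, nb_episodes):
    def testing_fn():
        rewards, step_times, distances, episode_lengths = [], [], [], []
        for _ in range(nb_episodes):
            obs = env.reset()
            episode_reward, steps, info = 0., 0, {}
            for _ in range(episode_length // action_repeat):
                action, _ = agent.pi(obs, apply_noise=False, compute_Q=False)
                start = time.time()
                obs, r, done, info = env.step(max_action * action)
                step_times.append(time.time() - start)
                episode_reward += r
                steps += 1
                if done:
                    break
            rewards.append(episode_reward)
            distances.append(info.get('distance', np.nan))
            episode_lengths.append(steps)
        return rewards, step_times, distances, episode_lengths
    return testing_fn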
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise,
          actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr,
          critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps,
          nb_eval_steps, batch_size, memory, load_network_id, latest, plot_info, tau=0.01,
          eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()
        if load_network_id:
            agent.load_actor_critic(id=load_network_id)
        if latest:
            agent.load_actor_critic(latest=True)

        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()

        epoch_episode_distances2target = []
        epoch_episode_relative_alt = []
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                    new_obs, r, done, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        logger.info('EPISODE OVER!')
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        epoch_episode_distances2target.append(info['dist2target'])
                        epoch_episode_relative_alt.append(info['relative_alt'])
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        if episodes % 10 == 0:
                            agent.save_actor_critic(id=episodes)
                        if episodes % 2 == 0 and plot_info:
                            # See the plot_information sketch after this function.
                            plot_information(epoch_episode_distances2target,
                                             epoch_episode_rewards,
                                             epoch_episode_relative_alt)
                            plt.pause(0.1)
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            # Decay learning rates every 5 epochs.
            if epoch % 5 == 0 and epoch > 0:
                agent.update_lr(agent.actor_lr * 0.65, agent.critic_lr * 0.65)

    logger.info('Finished training')
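# plot_information() (called every other finished episode when plot_info is set) is not shown
# in this file. A minimal matplotlib sketch taking the same three arguments; the exact layout
# is an assumption.
import matplotlib.pyplot as plt

def plot_information(distances2target, rewards, relative_alts):
    """Refresh a 3-panel live plot of per-episode distance-to-target, reward, and relative altitude."""
    plt.figure('training progress', figsize=(9, 3)).clf()
    panels = zip([distances2target, rewards, relative_alts],
                 ['dist2target', 'episode reward', 'relative_alt'])
    for i, (data, title) in enumerate(panels):
        ax = plt.subplot(1, 3, i + 1)
        ax.plot(data)
        ax.set_title(title)
        ax.set_xlabel('episode')
    plt.tight_layout()
    plt.draw()  # the caller follows up with plt.pause(0.1)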
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise,
          actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr,
          critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps,
          nb_eval_steps, batch_size, memory, save_model, tau=0.01, eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging/saving only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
        if not os.path.exists(os.path.join(logger.get_dir(), 'model')):
            os.makedirs(os.path.join(logger.get_dir(), 'model'))
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    with U.make_session(num_cpu=4) as sess:  # U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                    new_obs, r, done, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            # Average the stats over all MPI workers.
            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')

            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
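# The training loops above call agent.adapt_param_noise() every param_noise_adaption_interval
# training steps and log the returned action-space distance. In baselines that distance feeds
# AdaptiveParamNoiseSpec.adapt(), which nudges the parameter-noise stddev toward the desired
# action stddev. The rule is roughly the following (a sketch from memory, not the exact
# upstream source):
class AdaptiveParamNoiseSpec(object):
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adoption_coefficient = adoption_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        # If perturbed actions drift too far from the unperturbed policy, shrink the noise;
        # otherwise grow it.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adoption_coefficient
        else:
            self.current_stddev *= self.adoption_coefficient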
    def __init__(self, env, agent_index, sess, action_range=(-1., 1.), reward_scale=0.1,
                 critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.975,
                 clip_norm=10, batch_size=64, memory_size=1e6, tau=0.01, normalize_returns=False,
                 normalize_observations=False, noise_type="adaptive-param_0.1", layer_norm=True,
                 nb_layers=2, nb_neurons=64, activation='tanh', **network_kwargs):
        super(DDPGAgent, self).__init__(agent_index)
        # self.sess = sess
        self.nb_actions = env.action_space[agent_index].n
        print('agent action_space ' + str(env.action_space[agent_index].n))
        self.state_size = env.observation_space[agent_index].shape
        self.action_range = action_range

        with tf.variable_scope('ddpg_' + str(agent_index)):
            critic = Critic(name='critic_' + str(agent_index), layer_norm=layer_norm,
                            nb_layers=nb_layers, nb_neurons=nb_neurons)
            actor = Actor(self.nb_actions, name='actor_' + str(agent_index),
                          layer_norm=layer_norm, nb_neurons=nb_neurons, activation=activation)
            memory = Memory(limit=int(memory_size), action_shape=(self.nb_actions,),
                            observation_shape=self.state_size)

            # Parse noise_type.
            action_noise = None
            param_noise = None
            if noise_type is not None:
                for current_noise_type in noise_type.split(','):
                    current_noise_type = current_noise_type.strip()
                    if current_noise_type == 'none':
                        pass
                    elif 'adaptive-param' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                             desired_action_stddev=float(stddev))
                    elif 'normal' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        action_noise = NormalActionNoise(mu=np.zeros(self.nb_actions),
                                                         sigma=float(stddev) * np.ones(self.nb_actions))
                    elif 'ou' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        action_noise = OrnsteinUhlenbeckActionNoise(
                            mu=np.zeros(self.nb_actions),
                            sigma=float(stddev) * np.ones(self.nb_actions),
                            dt=env.world.dt, theta=0.1)
                    else:
                        raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

            self.agent = DDPG(actor, critic, memory, self.state_size, (self.nb_actions,),
                              action_range=self.action_range, gamma=gamma, tau=tau,
                              normalize_returns=normalize_returns,
                              normalize_observations=normalize_observations, batch_size=batch_size,
                              action_noise=action_noise, param_noise=param_noise,
                              critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr,
                              enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale)
        logger.info('Using agent with the following configuration:')
        logger.info(str(self.agent.__dict__.items()))
        self.agent.initialize(sess)
        self.agent.reset()
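    # The constructor above only builds and initializes the per-agent DDPG instance; the
    # acting/stepping methods are not shown here. A hypothetical helper (name and details are
    # assumptions) that such a wrapper might expose could look like this:
    def act(self, obs, apply_noise=True):
        """Query the wrapped DDPG policy for this agent; actions stay within self.action_range."""
        action, _ = self.agent.pi(obs, apply_noise=apply_noise, compute_Q=False)
        return np.clip(action, self.action_range[0], self.action_range[1])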