def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise,
          actor, critic, normalize_returns, normalize_observations, critic_l2_reg,
          actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps,
          nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None,
          param_noise_adaption_interval=50, gamma_reward_shaping=0.1,
          start_reward_shaping=10000):
    logger.info(sys._getframe().f_code.co_name)
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info("scale actions by {} before executing in env".format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info("Using agent with the following configuration:")
    logger.info(str(agent.__dict__.items()))

    # Set up saving only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        episode_sample = []  # transitions of the current episode, used for reward shaping
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                # Note: this variant rolls out for nb_eval_steps (not nb_rollout_steps) per cycle.
                for t_rollout in range(nb_eval_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    episode_sample.append((obs, action, r, new_obs, done))
                    if t <= start_reward_shaping:
                        agent.store_transition(obs, action, r, new_obs, done)
                    if done:
                        if t > start_reward_shaping:
                            # After start_reward_shaping total steps, replay the finished
                            # episode backwards and store transitions with shaped rewards.
                            logger.info("start reward shaping")
                            reward = r
                            agent.store_transition(obs, action, reward, new_obs, done)
                            # episode_sample.append()
                            for i in range(len(episode_sample) - 1):
                                obs_tmp, action_tmp, rew_tmp, new_obs_tmp, done_tmp = \
                                    episode_sample[len(episode_sample) - i - 1]
                                reward = round(reward * gamma_reward_shaping, 5)
                                reward = reward + rew_tmp
                                agent.store_transition(obs_tmp, action_tmp, reward, new_obs_tmp, done)

                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()
                    obs = new_obs

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats["rollout/return"] = np.mean(epoch_episode_rewards)
            combined_stats["rollout/return_history"] = np.mean(episode_rewards_history)
            combined_stats["rollout/episode_steps"] = np.mean(epoch_episode_steps)
            combined_stats["rollout/actions_mean"] = np.mean(epoch_actions)
            combined_stats["rollout/Q_mean"] = np.mean(epoch_qs)
            combined_stats["train/loss_actor"] = np.mean(epoch_actor_losses)
            combined_stats["train/loss_critic"] = np.mean(epoch_critic_losses)
            combined_stats["train/param_noise_distance"] = np.mean(epoch_adaptive_distances)
            combined_stats["total/duration"] = duration
            combined_stats["total/steps_per_second"] = float(t) / float(duration)
            combined_stats["total/episodes"] = episodes
            combined_stats["rollout/episodes"] = epoch_episodes
            combined_stats["rollout/actions_std"] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats["eval/return"] = eval_episode_rewards
                combined_stats["eval/return_history"] = np.mean(eval_episode_rewards_history)
                combined_stats["eval/Q"] = eval_qs
                combined_stats["eval/episodes"] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError("expected scalar, got %s" % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats["total/epochs"] = epoch + 1
            combined_stats["total/steps"] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info("")
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, "get_state"):
                    with open(os.path.join(logdir, "env_state.pkl"), "wb") as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, "get_state"):
                    with open(os.path.join(logdir, "eval_env_state.pkl"), "wb") as f:
                        pickle.dump(eval_env.get_state(), f)
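
# A minimal, hypothetical driver for the reward-shaping train() variant defined above,
# offered only as a usage sketch. It assumes the surrounding module already imports the
# usual baselines DDPG pieces (Actor, Critic, Memory, OrnsteinUhlenbeckActionNoise) and
# numpy as np; all argument values are illustrative. Because this excerpt concatenates
# several functions named train(), the variant above is bound at definition time via the
# train_fn default argument.
def example_run(env, train_fn=train):
    nb_actions = env.action_space.shape[-1]
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    actor = Actor(nb_actions, layer_norm=True)
    critic = Critic(layer_norm=True)
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=0.2 * np.ones(nb_actions))
    # Illustrative hyperparameters, roughly following the baselines DDPG defaults.
    train_fn(env=env, nb_epochs=500, nb_epoch_cycles=20, render_eval=False,
             reward_scale=1.0, render=False, param_noise=None, actor=actor,
             critic=critic, normalize_returns=False, normalize_observations=True,
             critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
             action_noise=action_noise, popart=False, gamma=0.99, clip_norm=None,
             nb_train_steps=50, nb_rollout_steps=100, nb_eval_steps=100,
             batch_size=64, memory=memory,
             gamma_reward_shaping=0.1, start_reward_shaping=10000)
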
def Test(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise,
         actor, critic, normalize_returns, normalize_observations, critic_l2_reg,
         actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps,
         nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None,
         param_noise_adaption_interval=50, restore=False):
    rank = MPI.COMM_WORLD.Get_rank()
    max_action = np.array([0.2, 0.2, 0.2, 0.2, 0.2, 0.2])
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    model_directory = '/home/rvsa/RL_project/Peg_in_Hole/1-baselines/baselines/ddpg/result/'

    agent = DDPG(actor, critic, memory, env.state_dim, env.action_dim,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=None, param_noise=None, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    """Set up logging stuff only for a single worker"""
    saver = tf.train.Saver()
    # if rank == 0:
    #     saver = tf.train.Saver()
    # else:
    #     saver = None

    # eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:
        """Prepare everything"""
        if restore:
            saver = tf.train.import_meta_graph(model_directory + 'model_fuzzy_new_3.meta')
            agent.restore_model(model_directory, saver, sess)
        else:
            agent.initialize(sess)
            sess.graph.finalize()

        """Agent Reset"""
        agent.reset()

        """Force calibration"""
        # env.robot_control.CalibFCforce()

        learning_epochs = 15
        delay_rate = np.power(10, 1 / learning_epochs)

        """Revise the last epochs"""
        # last_epochs = 0
        # actor_lr = actor_lr/np.power(delay_rate, last_epochs)
        # critic_lr = critic_lr/np.power(delay_rate, last_epochs)

        start_time = time.time()
        epoch_episode_rewards = []
        epoch_episode_steps = []
        mean_rollout_time = []
        mean_epoch_rewards = []
        mean_epoch_steps = []
        mean_epoch_time = []
        epoch_adaptive_distances = []

        epoch_actions = []
        epoch_qs = []

        epoch_episodes = 0
        total_episodes = 0
        successful_rate = []
        Force_moments = np.zeros((1, 6))
        for epoch in range(nb_epochs):
            """Show the result for cycle 20 times and Save the model"""
            # epoch_actor_losses = []
            # epoch_critic_losses = []

            """Delay the learning rate"""
            # epoch_actor_lr = actor_lr/delay_rate
            # epoch_critic_lr = critic_lr/delay_rate
            epoch_start_time = time.time()

            force_array = np.zeros((150, 6))
            for cycle in range(nb_epoch_cycles):
                """environment reset"""
                agent.reset()
                obs = env.reset()
                episode_reward = 0.
                done = False
                rollout_start_time = time.time()
                forcemoments = []
                for t_rollout in range(nb_rollout_steps):
                    """Predict next action"""
                    action, q = agent.pi(obs, apply_noise=False, compute_Q=True)
                    assert action.shape[0] == env.action_dim

                    """scale for execution in env"""
                    new_obs, r, done, info = env.step(action, t_rollout)
                    logger.info("The maximum force: " + str(max(abs(new_obs[0:3]))) +
                                " The maximum moments: " + str(max(abs(new_obs[3:6]))))
                    episode_reward += r

                    force_array[t_rollout, :] = new_obs[0:6]

                    """Plot the force and moments"""
                    if render:
                        forcemoments.append(new_obs[0:6])
                        # Accumulate the new reading into the running force/moment log.
                        Force_moments = np.concatenate(
                            (Force_moments, new_obs[0:6].reshape(1, -1)), axis=0)
                        env.plot_force(forcemoments, t_rollout + 1)

                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    """Episode done, then pull the pegs up step by step"""
                    if done:
                        logger.info('Peg-in-hole assembly done!!!')
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(t_rollout)
                        epoch_episodes += 1

                        pull_done = False
                        pull_safe = True
                        while pull_done is False and pull_safe:
                            pull_done, pull_safe = env.pull_up()  # True env

                        # if pull_safe is False:
                        #     logger.info('###############################################')
                        #     logger.info('Pulling up the pegs failed: force limit exceeded!!!')
                        #     exit()
                        break

                    """Episode failed, then pull the pegs up step by step"""
                    if info is False:
                        logger.info('Peg-in-hole assembly failed: force limit exceeded!!!')
                        pull_done = False
                        pull_safe = True
                        while pull_done is False and pull_safe:
                            pull_done, pull_safe = env.pull_up()  # True env

                        # if pull_safe is False:
                        #     logger.info('###############################################')
                        #     logger.info('Peg-in-hole assembly failed: force limit exceeded!!!')
                        #     exit()
                        break

                total_episodes += 1
                rollout_time = time.time() - rollout_start_time
                mean_rollout_time.append(rollout_time)
                Force_moments = np.concatenate((Force_moments, force_array), axis=0)

                if t_rollout == nb_rollout_steps - 1:
                    logger.info('Peg-in-hole assembly failed: step limit exceeded!!!')
                    logger.info('The deepest position: {}'.format(obs[8]))

                """train model for nb_train_steps times"""
                # for t_train in range(nb_train_steps):
                #     cl, al = agent.train(epoch_actor_lr, epoch_critic_lr)
                #     epoch_critic_losses.append(cl)
                #     epoch_actor_losses.append(al)
                #     agent.update_target_net()

                """Save the force figure"""
                env.save_figure(model_directory + str(cycle) + 'Force_figure.eps')

                """Save the memory data"""
                # agent.save_data()

                """Adapt param noise, if necessary"""
                # if memory.nb_entries >= batch_size and param_noise is not None:
                #     distance = agent.adapt_param_noise()
                #     epoch_adaptive_distances.append(distance)

                """write the result into the summary"""
                # agent.log_scalar("actor_loss", mpi_mean(epoch_actor_losses), epoch_episodes)
                # agent.log_scalar("critic_loss", mpi_mean(epoch_critic_losses), epoch_episodes)
                # agent.log_scalar("episode_score", mpi_mean(epoch_episode_rewards), epoch_episodes)
                # agent.log_scalar("episode_steps", mpi_mean(epoch_episode_steps), epoch_episodes)

            """Log stats."""
            epoch_train_duration = time.time() - epoch_start_time
            mean_epoch_time.append(epoch_train_duration)

            """Successful rate"""
            successful_rate.append(epoch_episodes / total_episodes)

            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            """Rollout statistics: compute the mean over the nb_epoch_cycles cycles"""
            combined_stats['rollout/rewards'] = mpi_mean(epoch_episode_rewards)
            mean_epoch_rewards.append(mpi_mean(epoch_episode_rewards))
            combined_stats['rollout/episode_steps'] = mpi_mean(epoch_episode_steps)
            mean_epoch_steps.append(mpi_mean(epoch_episode_steps))
            # combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            """Train statistics"""
            # combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            # combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            # combined_stats['train/param_noise_distance'] = mpi_mean(epoch_adaptive_distances)

            """Total statistics"""
            combined_stats['total/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['total/epochs'] = epoch + 1

            """Plot reward and steps"""
            # env.plot_rewards(epoch_episode_rewards, epoch_episodes)
            # env.plot_steps(epoch_episode_steps, epoch_episodes)

            """save the model and the result"""
            # saver.save(sess, model_directory + 'model_truth_general')

            """Save data"""
            pd_epoch_train_duration = pd.DataFrame(mean_epoch_time)
            pd_epoch_train_duration.to_csv('data/large_duration_evaluation_before',
                                           sep=',', header=False, index=False)
            pd_rollout_time = pd.DataFrame(mean_rollout_time)
            pd_rollout_time.to_csv('data/large_rollout_time_evaluation_before',
                                   sep=',', header=False, index=False)
            # pd_successful_rate = pd.DataFrame(successful_rate)
            # pd_successful_rate.to_csv('data/successful_rate_evaluation_fail_1', sep=',', header=False, index=False)
            pd_Force_and_moments = pd.DataFrame(Force_moments)
            pd_Force_and_moments.to_csv("data/large_force_moments_evaluation_before",
                                        sep=',', header=False, index=False)
            re_rewards = pd.DataFrame(epoch_episode_rewards)
            re_rewards.to_csv("data/large_re_true_rewards_evaluation_before",
                              sep=',', header=False, index=False)
            re_steps = pd.DataFrame(epoch_episode_steps)
            re_steps.to_csv("data/large_re_true_steps_evaluation_before",
                            sep=',', header=False, index=False)

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')

            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
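
# Test() above calls mpi_mean / mpi_std / mpi_sum, which are not defined in this excerpt;
# they are presumably imported from the fork's utilities (older baselines releases shipped
# similar MPI reduction helpers). A minimal single-process stand-in is sketched below for
# readability; it is not the project's actual implementation and skips any MPI reduction.
def mpi_mean(value):
    # Accepts scalars, lists, or lists of arrays; empty input reduces to 0 so logging never fails.
    value = np.asarray(value, dtype=np.float64)
    return float(value.mean()) if value.size else 0.0

def mpi_std(value):
    value = np.asarray(value, dtype=np.float64)
    return float(value.std()) if value.size else 0.0

def mpi_sum(value):
    value = np.asarray(value, dtype=np.float64)
    return float(value.sum())
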
def train(env, nb_epochs, nb_epoch_cycles, reward_scale, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr,
          action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps,
          nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None,
          param_noise_adaption_interval=50, render=False, render_eval=False):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # saver.restore(sess, "C:/Users/AN95540/Desktop/ICRA2019/Codes/RobotPath-ddpg/model/epoch_33.ckpt")
        # Prepare everything.
        agent.initialize(sess)
        # agent.continue_sess(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        info = {'near_collision': False, 'near_limits': False}
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    print('epoch:', epoch)
                    print('cycle:', cycle)
                    print('rollout:', t_rollout)
                    if epoch > 5:
                        time.sleep(0.1)

                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    # if rank == 0 and render:
                    #     env.render()
                    assert max_action.shape == action.shape
                    # new_obs, r, done, info = env.step(max_action * action)
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    # print(obs)
                    if info['near_collision'] or info['near_limits'] or epoch > 5:
                        new_obs, r, done, info = env.step(max_action * action)
                    else:
                        # Scripted warm-up action during the early epochs.
                        action = (obs[0:6] - obs[6:12]) * 0.1 + np.random.rand(6) * 0.1
                        new_obs, r, done, info = env.step(action)
                    t += 1
                    if episode_step > 1000:
                        done = True
                    # if rank == 0 and render:
                    #     env.render()
                    episode_reward += r
                    print('episode_reward:', episode_reward)
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        print('**********END OF EPISODE**********')
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # print('t_train:', t_train)
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                eval_env = None  # evaluation is disabled in this variant
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['epoch'] = epoch
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            save_path = saver.save(sess, "./model/epoch_" + str(epoch) + ".ckpt")
            print(save_path)

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
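
# The rollout above substitutes a scripted warm-up action for the DDPG output during the
# first epochs whenever the previous step reported neither near_collision nor near_limits.
# The helper below just extracts that expression for readability; treating obs[0:6] and
# obs[6:12] as the two 6-DoF joint configurations being driven together is an assumption
# based only on how they are used above, and the gains mirror the hard-coded 0.1 factors.
def warmup_action(obs, gain=0.1, noise_scale=0.1):
    """Proportional step from obs[6:12] towards obs[0:6], plus uniform exploration noise."""
    return (obs[0:6] - obs[6:12]) * gain + np.random.rand(6) * noise_scale
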
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise,
          actor, critic, normalize_returns, normalize_observations, critic_l2_reg,
          actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps,
          nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None,
          controller=None, param_noise_adaption_interval=50, restore=True):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    #              gamma=gamma, tau=tau, normalize_returns=normalize_returns,
    #              normalize_observations=normalize_observations, batch_size=batch_size,
    #              action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
    #              actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
    #              clip_norm=clip_norm, reward_scale=reward_scalei)
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale, action_range=(0., 1.))
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver(max_to_keep=600)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        if restore:
            logger.info("Restoring from saved model")
            saver = tf.train.import_meta_graph(savingModelPath + "ddpg_test_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(savingModelPath))
        else:
            logger.info("Starting from scratch!")
            sess.run(tf.global_variables_initializer())
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        # episode_pose = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        # epoch_episode_poses = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_act = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # The DDPG action parameterises the controller; the controller produces
                    # the low-level command that is actually sent to the environment.
                    controller.assign_param(max_action * action)
                    act = controller.control(obs)
                    # act = np.clip(act, env.action_space.low, env.action_space.high)
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, info = env.step(act)
                    t += 1
                    # r += -.1*(act**2)
                    # pose = to_angle_square(new_obs)
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    # episode_pose += pose
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_act.append(act)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        # epoch_episode_poses.append(episode_pose)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        # episode_pose = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                # eval_episode_poses = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    eval_episode_pose = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        controller.assign_param(max_action * eval_action)
                        eval_act = controller.control(eval_obs)
                        # eval_act = np.clip(eval_act, eval_env.action_space.low, eval_env.action_space.high)
                        # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_act)
                        # eval_r += -.1*(eval_act**2)
                        # eval_p = to_angle_square(eval_obs)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r
                        # eval_episode_pose += eval_p

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            # eval_episode_poses.append(eval_episode_pose)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.
                            # eval_episode_pose = 0.

                if saver is not None:
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    saver.save(sess, savingModelPath + str((epoch + 1) * (cycle + 1)) + '/' + "ddpg_test_model")
                    logger.info('runtime saving: {}s'.format(time.time() - start_time_save))

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            # combined_stats['rollout/return_pose'] = np.mean(epoch_episode_poses)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)

            # Mean controller gains selected by the actor over this epoch.
            total = 0
            for params in epoch_actions:
                total += params
            total = max_action * total
            KTH_means = total[0] / len(epoch_actions)
            KE_means = total[1] / len(epoch_actions)
            # Kd_means = total[2]/len(epoch_actions)
            combined_stats['rollout/KTH_mean'] = KTH_means
            combined_stats['rollout/KE_mean'] = KE_means
            # combined_stats['rollout/Target_speed_mean'] = Kd_means

            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            combined_stats['rollout/act_mean'] = np.mean(epoch_act)
            combined_stats['rollout/act_std'] = np.std(epoch_act)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                # combined_stats['eval/return_pose'] = np.mean(eval_episode_poses)
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q_mean'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                # else:
                #     raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
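
# The train() variant above treats the DDPG action as controller parameters rather than
# motor commands: controller.assign_param() receives the scaled action and
# controller.control(obs) returns the low-level command sent to env.step(). The class
# below is only an interface sketch so the expected duck type is visible in one place;
# the two-gain structure mirrors the KTH/KE statistics logged above, but the control law
# itself is illustrative and not the project's actual controller.
class GainControllerSketch:
    def __init__(self):
        self.params = np.zeros(2)

    def assign_param(self, params):
        # params is expected to be max_action * action from the DDPG actor.
        self.params = np.asarray(params, dtype=np.float64)

    def control(self, obs):
        # Placeholder law: weight two observation-derived features by the learned gains.
        k_th, k_e = self.params[0], self.params[1]
        return np.array([k_th * obs[0] + k_e * obs[1]])
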