def retraining(save_path,
               network,
               env,
               seed=None,
               total_timesteps=None,
               nb_epochs=None,  # with default settings, perform 1M steps total
               nb_epoch_cycles=4,  # 50
               nb_rollout_steps=3,  # 100
               reward_scale=1.0,
               render=False,
               render_eval=False,
               # noise_type='adaptive-param_0.2',
               noise_type='normal_0.2',
               # noise_type='ou_0.9',
               normalize_returns=False,
               normalize_observations=True,
               critic_l2_reg=1e-2,
               actor_lr=1e-4,
               critic_lr=1e-4,
               # actor_lr=1e-6,
               # critic_lr=1e-5,
               popart=False,
               gamma=0.99,
               clip_norm=None,
               nb_train_steps=3,  # per epoch cycle and MPI worker, 50
               nb_eval_steps=1,  # 100
               batch_size=640,  # per MPI worker
               tau=0.01,
               eval_env=None,
               param_noise_adaption_interval=3,  # 50
               **network_kwargs):
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()

    # nb_actions = env.action_space.shape[-1]
    nb_actions = env.num_actions
    # nb_actions = 3
    # print(nb_actions)
    action_shape = np.array(nb_actions * [0]).shape

    # 4 pairs pos + 3 link length
    # nb_features = 2 * (env.num_actions + 1) + env.num_actions
    # 4 pairs pos + 1 pair target pos
    nb_features = 2 * (env.num_actions + 2)
    observation_shape = np.array(nb_features * [0]).shape

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
    #                 observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor, critic, memory, observation_shape, action_shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    # Prepare everything.
    agent.initialize(sess)
    # sess.graph.finalize()
    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    # Load the initialization policy.
    agent.load_ini(sess, save_path)
    # agent.memory.clear(limit=int(1e6), action_shape=action_shape,
    #                    observation_shape=observation_shape)
    for epoch in range(nb_epochs):
        print(nb_epochs)
        # obs, env_state = env.reset()
        obs = env.reset()
        agent.save(save_path)
        epoch_episode_rewards = []
        '''Check that the actor initialization policy has been loaded correctly,
        i.e. that it equals the values stored directly in the checkpoint files.'''
        # loaded_weights = tf.get_default_graph().get_tensor_by_name('target_actor/mlp_fc0/w:0')
        # print('loaded_weights:', sess.run(loaded_weights))
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
                print('action:', action)

                new_obs, r, done = env.step(action)
                # time.sleep(0.2)
                t += 1

                episode_reward += r
                episode_step += 1
                # print('episode_re: ', episode_reward)  # [1.]

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                # The batched data will be unrolled in memory.py's append.
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

            epoch_episode_rewards.append(episode_reward)
            episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                # print('Train!')
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()

        # Log stats.
        # XXX shouldn't call np.mean on variable-length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set, mean_epoch_episode_rewards)
        step_set.append(t)
        plt.plot(step_set, mean_epoch_episode_rewards, color='r',
                 label='Initialization')
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean_retrain.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array([np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    print('stepset: ', step_set)
    print('rewards: ', mean_epoch_episode_rewards)

    return agent
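# Hedged usage sketch (not part of the original source): how `retraining` might
# be invoked to fine-tune a policy that a previous run stored via
# `agent.save(save_path)`.  The environment argument and checkpoint directory
# are illustrative assumptions; any env exposing `num_actions`, `reset()` and
# the batched `step()` used above would fit.
def _example_retraining_call(my_env, checkpoint_dir='./model/ddpg_policy'):
    # 'mlp' is the standard baselines network name; with total_timesteps=None,
    # retraining() falls back to its internal default of 500 epochs.
    return retraining(save_path=checkpoint_dir,
                      network='mlp',
                      env=my_env,
                      noise_type='normal_0.2',
                      nb_epoch_cycles=4,
                      nb_rollout_steps=3)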
def testing(save_path,
            network,
            env,
            seed=None,
            total_timesteps=None,
            nb_epochs=None,  # with default settings, perform 1M steps total
            nb_epoch_cycles=50,
            nb_rollout_steps=3,  # 100
            reward_scale=1.0,
            render=False,
            render_eval=False,
            # no noise for test
            # noise_type='adaptive-param_0.2',
            # noise_type='normal_0.9',
            # noise_type='ou_0.9',
            normalize_returns=False,
            normalize_observations=True,
            critic_l2_reg=1e-2,
            actor_lr=1e-4,
            critic_lr=1e-3,
            # actor_lr=1e-6,
            # critic_lr=1e-5,
            popart=False,
            gamma=0.99,
            clip_norm=None,
            nb_train_steps=3,  # per epoch cycle and MPI worker, 50
            nb_eval_steps=1,  # 100
            batch_size=640,  # per MPI worker
            tau=0.01,
            eval_env=None,
            param_noise_adaption_interval=3,  # 50
            **network_kwargs):
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()

    # nb_actions = env.action_space.shape[-1]
    nb_actions = env.num_actions
    # nb_actions = 3
    # print(nb_actions)
    action_shape = np.array(nb_actions * [0]).shape
    nb_features = 2 * (env.num_actions + 1) + env.num_actions
    observation_shape = np.array(nb_features * [0]).shape

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
    #                 observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
    '''no noise for test'''
    # if noise_type is not None:
    #     for current_noise_type in noise_type.split(','):
    #         current_noise_type = current_noise_type.strip()
    #         if current_noise_type == 'none':
    #             pass
    #         elif 'adaptive-param' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
    #         elif 'normal' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         elif 'ou' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         else:
    #             raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor, critic, memory, observation_shape, action_shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    # Prepare everything.
    agent.load(sess, save_path)
    # sess.graph.finalize()  # cannot save sess if its finalized!
    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    for epoch in range(nb_epochs):
        print(nb_epochs)
        # obs, env_state = env.reset()
        obs = env.reset()
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset
                # agent at the end of the episode in each of the environments,
                # so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                '''no noise for test'''
                action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True)
                # print('action:', action)

                # Execute next action.
                # if rank == 0 and render:
                #     env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A);
                # the multiplication gets broadcasted to the batch
                # new_obs, r, done, info = env.step(max_action * action)
                # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # new_obs, r, env_state, done = env.step(action, env_state)
                '''actually no need for env_state: in or out'''
                new_obs, r, done = env.step(action)
                # print('reward:', r)
                # note these outputs are batched from vecenv
                # print('obs: ', obs.shape, obs, 'action: ', action.shape, action)
                '''obs shape: (1,17), action shape: (1,6)'''
                # print('maxaction: ', max_action.shape)
                '''max_action shape: (6,) , max_action*action shape: (1,6)'''
                t += 1
                # if rank == 0 and render:
                #     env.render()
                # print('r:', r)
                episode_reward += r
                episode_step += 1
                # print('episode_re: ', episode_reward)  # [1.]

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                agent.store_transition(obs, action, r, new_obs, done)  # the batched data will be unrolled in memory.py's append.
                # print('r: ', r)
                # '''r shape: (1,)'''
                obs = new_obs

                # for d in range(len(done)):
                #     if done[d]:
                #         print('done')
                #         # Episode done.
                #         epoch_episode_rewards.append(episode_reward[d])
                #         episode_rewards_history.append(episode_reward[d])
                #         epoch_episode_steps.append(episode_step[d])
                #         episode_reward[d] = 0.
                #         episode_step[d] = 0
                #         epoch_episodes += 1
                #         episodes += 1
                #         if nenvs == 1:
                #             agent.reset()

            '''added'''
            epoch_episode_rewards.append(episode_reward)
            '''
            step_set.append(t)
            reward_set = np.concatenate((reward_set, episode_reward))
            # print(step_set, reward_set)
            # print(t, episode_reward)
            plt.plot(step_set, reward_set)
            plt.xlabel('Steps')
            plt.ylabel('Episode Reward')
            plt.savefig('ddpg.png')
            plt.show()
            '''
            episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            '''no training for test'''
            # for t_train in range(nb_train_steps):
            #     # Adapt param noise, if necessary. no noise for test!
            #     if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
            #         distance = agent.adapt_param_noise()
            #         epoch_adaptive_distances.append(distance)
            #     cl, al = agent.train()
            #     epoch_critic_losses.append(cl)
            #     epoch_actor_losses.append(al)
            #     agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set, mean_epoch_episode_rewards)
        step_set.append(t)
        plt.plot(step_set, mean_epoch_episode_rewards)
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean_test.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array([np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    return agent
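# Side note (sketch, not from the original source): the shape-building idiom
# used throughout these functions, `np.array(n * [0]).shape`, is just a
# roundabout way of writing the 1-D shape tuple `(n,)`.  The small check below
# illustrates the equivalence for the feature-count formula used by this
# testing() variant; the default `num_actions=3` is an assumption.
def _example_shape_idiom(num_actions=3):
    import numpy as np  # local import so the sketch is self-contained
    nb_features = 2 * (num_actions + 1) + num_actions
    assert np.array(nb_features * [0]).shape == (nb_features,)
    return (nb_features,)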
def testing(save_path,
            network,
            env,
            seed=None,
            total_timesteps=None,
            nb_epochs=None,  # with default settings, perform 1M steps total
            nb_epoch_cycles=50,
            nb_rollout_steps=3,
            reward_scale=1.0,
            render=False,
            render_eval=False,
            # no noise for test
            # noise_type='adaptive-param_0.2',
            # noise_type='normal_0.9',
            # noise_type='ou_0.9',
            normalize_returns=False,
            normalize_observations=True,
            critic_l2_reg=1e-2,
            actor_lr=1e-4,
            critic_lr=1e-3,
            # actor_lr=1e-6,
            # critic_lr=1e-5,
            popart=False,
            gamma=0.99,
            clip_norm=None,
            nb_train_steps=3,  # per epoch cycle and MPI worker, 50
            nb_eval_steps=1,
            batch_size=64,  # per MPI worker
            tau=0.01,
            eval_env=None,
            param_noise_adaption_interval=3,
            **network_kwargs):
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()

    # nb_actions = env.action_space.shape[-1]
    # nb_actions = 2*env.grid_size
    nb_actions = env.grid_size
    action_shape = np.array(nb_actions * [0]).shape
    nb_features = (4 + 1) * env.grid_size
    observation_shape = np.array(nb_features * [0]).shape

    grid_x = env.grid_x
    grid_y = env.grid_y
    x = []
    y = []
    for i in range(grid_x):
        x.append(i + 1)
    for i in range(grid_y):
        y.append(i + 1)

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
    #                 observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    '''no noise for test'''
    # if noise_type is not None:
    #     for current_noise_type in noise_type.split(','):
    #         current_noise_type = current_noise_type.strip()
    #         if current_noise_type == 'none':
    #             pass
    #         elif 'adaptive-param' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
    #         elif 'normal' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         elif 'ou' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         else:
    #             raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor, critic, memory, observation_shape, action_shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    # Prepare everything.
    # agent.initialize(sess)
    # sess.graph.finalize()
    agent.load(sess, save_path)
    agent.reset()

    obs, env_state = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    average_reward = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_state = []
    epoch_episodes = 0

    # record the car numbers in each step
    car_num_set = {}
    t_set = [i for i in range(total_timesteps)]
    for xx in x:
        for yy in y:
            lab = str(xx) + str(yy)
            car_num_set[lab] = [[0 for i in range(total_timesteps)]
                                for j in range(4)]

    for epoch in range(nb_epochs):
        obs, env_state = env.reset()
        epoch_actions = []
        epoch_state = []
        average_car_num_set = []
        last_action = 1
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True)
            '''random action'''
            # if np.random.rand() > 0.5:
            #     action = [1]
            # else:
            #     action = [0]
            '''cycle light state'''
            # action = [0]
            '''cycle action (should cycle state instead of action)'''
            # if last_action == 1:
            #     action = [0]
            # else:
            #     action = [1]
            # last_action = action[0]
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset
                # agent at the end of the episode in each of the environments,
                # so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                new_obs, r, env_state, done = env.step(action, env_state)
                epoch_state.append(env_state['11'].light_state)
                for xx in x:
                    for yy in y:
                        lab = str(xx) + str(yy)
                        for i in range(4):
                            car_num_set[lab][i][t] = (env_state['11'].car_nums[i])
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                agent.store_transition(obs, action, r, new_obs, done)  # the batched data will be unrolled in memory.py's append.
                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        print('done')
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            epoch_episode_rewards.append(episode_reward)
            average_reward.append(episode_reward / nb_rollout_steps)
            episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            # for t_train in range(nb_train_steps):
            #     # Adapt param noise, if necessary.
            #     if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
            #         distance = agent.adapt_param_noise()
            #         epoch_adaptive_distances.append(distance)
            #     # print('Train!')
            #     cl, al = agent.train()
            #     epoch_critic_losses.append(cl)
            #     epoch_actor_losses.append(al)
            #     agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        step_set.append(t)
        mpi_size = MPI.COMM_WORLD.Get_size()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set, mean_epoch_episode_rewards)

        # plt.figure(figsize=(8,5))
        '''plot rewards-steps'''
        ax1 = plt.subplot(2, 1, 1)
        plt.sca(ax1)
        plt.plot(step_set, average_reward, color='b')
        # plt.xlabel('Steps')
        plt.ylabel('Mean Reward', fontsize=12)
        # plt.ylim(-15000, 0)

        '''plot queueing car numbers-steps'''
        ax2 = plt.subplot(2, 1, 2)
        plt.sca(ax2)
        print(np.shape(t_set), np.shape(car_num_set['11'][i]))
        for i in range(4):
            if i == 0:
                plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='b')
            elif i == 1:
                plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='orange')
            elif i == 2:
                plt.plot(t_set, car_num_set['11'][i], label=i, color='g')
            else:
                plt.plot(t_set, car_num_set['11'][i], label=i, color='r')
        plt.ylim(0, 100)

        # sum among roads
        sum_car_num = np.sum(car_num_set['11'], axis=0)
        # average among time steps
        average_car_num = np.average(sum_car_num)
        average_car_num_set.append(average_car_num)

        plt.xlabel('Steps', fontsize=12)
        plt.ylabel('Cars Numbers', fontsize=12)
        # set legend
        handles, labels = plt.gca().get_legend_handles_labels()
        by_label = OrderedDict(zip(labels, handles))
        leg = plt.legend(by_label.values(), by_label.keys(), loc=1)
        # leg = plt.legend(loc=4)
        legfm = leg.get_frame()
        legfm.set_edgecolor('black')  # set legend frame color
        legfm.set_linewidth(0.5)  # set legend frame linewidth
        plt.savefig('ddpg_mean_test.pdf')
        plt.show()
        print(epoch_state)

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array([np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    print('average queueing car numbers: ', np.average(average_car_num_set))

    return agent
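# Hedged sketch (not part of the original source) of the queue statistic the
# traffic-light testing() computes above: `car_num_set[lab]` holds one series
# of queue lengths per incoming road (4 roads) over time; summing across roads
# and then averaging over time gives the "average queueing car numbers" value
# printed at the end.  The toy numbers below are made up purely for illustration.
def _example_average_queue():
    import numpy as np
    car_nums = np.array([[2, 3, 4],    # road 0 queue length at steps 0..2
                         [1, 1, 0],    # road 1
                         [0, 2, 2],    # road 2
                         [3, 0, 1]])   # road 3
    sum_car_num = np.sum(car_nums, axis=0)      # total queued cars per step -> [6, 6, 7]
    average_car_num = np.average(sum_car_num)   # mean over steps -> about 6.33
    return average_car_num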
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
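# Hedged sketch (not part of the original source) of the `noise_type` string
# format parsed in run() above: a comma-separated list of specs, each of the
# form '<kind>_<stddev>' with kind in {'none', 'adaptive-param', 'normal',
# 'ou'}.  This simplified, dependency-free parser only extracts the pieces;
# the real code builds AdaptiveParamNoiseSpec / NormalActionNoise /
# OrnsteinUhlenbeckActionNoise objects from them.
def _example_parse_noise_spec(noise_type='adaptive-param_0.2,normal_0.1'):
    parsed = []
    for current in noise_type.split(','):
        current = current.strip()
        if current == 'none':
            continue
        kind, stddev = current.split('_')
        parsed.append((kind, float(stddev)))
    return parsed  # e.g. [('adaptive-param', 0.2), ('normal', 0.1)]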
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)

    # ---------- AMEND: specific setting for brsEngine -----------
    print("kwargs", kwargs)
    env.reward_type = kwargs['reward_type']
    env.set_additional_goal = kwargs['set_additional_goal']
    kwargs.pop('reward_type', None)
    kwargs.pop('set_additional_goal', None)

    brsEngine = None
    if env.reward_type == 'ttr':
        if env_id == 'DubinsCarEnv-v0':
            brsEngine = DubinsCar_brs_engine()
            brsEngine.reset_variables()
        elif env_id == 'PlanarQuadEnv-v0':
            brsEngine = Quadrotor_brs_engine()
            brsEngine.reset_variables()
        else:
            raise ValueError("invalid environment name for ttr reward!")
    # You have to assign the engine!
    env.brsEngine = brsEngine
    # -----------------------------------------------------------

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        # ---------- AMEND: specific setting for brsEngine -----------
        eval_env.brsEngine = brsEngine
        # ------------------------------------------------------------
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
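# Hedged sketch (not part of the original source) of the kwargs handling used
# in the amended run() above: the two project-specific keys are consumed
# before the remaining keyword arguments are forwarded to training.train(),
# which would otherwise reject them.  The example values passed in the
# commented call are assumptions for illustration only.
def _example_split_kwargs(**kwargs):
    reward_type = kwargs.pop('reward_type', None)
    set_additional_goal = kwargs.pop('set_additional_goal', None)
    return reward_type, set_additional_goal, kwargs

# _example_split_kwargs(reward_type='ttr', set_additional_goal='angle', nb_epochs=10)
# -> ('ttr', 'angle', {'nb_epochs': 10})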
class DDPG(object): def __init__(self, observation_shape, action_shape, nb_demo_kine, nb_key_states, batch_size=128, noise_type='', actor=None, critic=None, layer_norm=True, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), normalize_returns=False, normalize_observations=True, reward_scale=1., clip_norm=None, demo_l2_reg=0., critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, demo_lr=5e-3, gamma=0.99, tau=0.001, enable_popart=False, save_ckpt=True): # Noise nb_actions = action_shape[-1] param_noise, action_noise = process_noise_type(noise_type, nb_actions) logger.info('param_noise', param_noise) logger.info('action_noise', action_noise) # States recording self.memory = Memory(limit=int(2e5), action_shape=action_shape, observation_shape=observation_shape) # Models self.nb_demo_kine = nb_demo_kine self.actor = actor or Actor( nb_actions, nb_demo_kine, layer_norm=layer_norm) self.nb_key_states = nb_key_states self.critic = critic or Critic(nb_key_states, layer_norm=layer_norm) self.nb_obs_org = nb_key_states # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') # self.critic_target_Q: value assigned by self.target_Q_obs0 self.critic_target_Q = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target_Q') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # change in observations self.obs_delta_kine = (self.obs1 - self.obs0)[:, :self.nb_demo_kine] self.obs_delta_kstates = (self.obs1 - self.obs0)[:, :self.nb_key_states] # Parameters. self.gamma = gamma self.tau = tau self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.actor_lr = actor_lr self.critic_lr = critic_lr self.demo_lr = demo_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.demo_l2_reg = demo_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None self.normalized_obs0 = tf.clip_by_value( obs_norm_partial(self.obs0, self.obs_rms, self.nb_obs_org), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value( obs_norm_partial(self.obs1, self.obs_rms, self.nb_obs_org), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(self.actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(self.critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across set-up parts. 
# the actor output is [0,1], need to normalised to [-1,1] before feeding into critic self.actor_tf, self.demo_aprx = self.actor(self.normalized_obs0) # critic loss # normalized_critic_tf, pred_rwd, pred_obs_delta: critic_loss self.normalized_critic_tf, self.pred_rwd, self.pred_obs_delta = self.critic( self.normalized_obs0, act_norm(self.actions)) # self.critic_tf: only in logging [reference_Q_mean/std] self.critic_tf = ret_denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # actor loss normalized_critic_with_actor_tf = self.critic(self.normalized_obs0, act_norm(self.actor_tf), reuse=True)[0] # self.critic_with_actor_tf: actor loss, and logging [reference_Q_tf_mean/std] self.critic_with_actor_tf = ret_denormalize( tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # target Q self.target_action = tf.clip_by_value( target_actor(normalized_obs1)[0], self.action_range[0], self.action_range[1]) self.target_Q_obs1 = ret_denormalize( target_critic(normalized_obs1, act_norm(self.target_action))[0], self.ret_rms) self.target_Q_obs0 = self.rewards + ( 1. - self.terminals1) * gamma * self.target_Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(self.normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.dbg_vars = self.actor.dbg_vars + self.critic.dbg_vars self.sess = None # Set up checkpoint saver self.save_ckpt = save_ckpt if save_ckpt: self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=20) else: # saver for loading ckpt self.saver = tf.train.Saver() self.main_summaries = tf.summary.merge_all() logdir = logger.get_dir() if logdir: self.train_writer = tf.summary.FileWriter( os.path.join(logdir, 'tb'), tf.get_default_graph()) else: self.train_writer = None def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0)[0] logger.debug('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. 
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)[0] self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') # loss_normed = -tf.reduce_mean(self.normalized_critic_with_actor_tf) self.actor_Q = tf.reduce_mean(self.critic_with_actor_tf) self.actor_loss = -self.actor_Q tf.summary.scalar('actor/Q', self.actor_Q) # setting up actor vars/grads/optimizer self.actor_vars = self.actor.active_vars self.actor_grads = tf_util.flatgrad(self.actor_loss, self.actor_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] self.actor_params = actor_params = [0] * ( len(self.actor.trainable_vars) + 1) for i, shape in enumerate(actor_shapes): actor_params[i + 1] = actor_params[i] + np.prod(shape) n_inact = len(actor_shapes) - len(self.actor_vars) active_params = actor_params[n_inact:] - actor_params[n_inact] logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_params)) logger.info(' actor total: {}'.format(actor_params[-1])) logger.info(' actor active: {}'.format(active_params)) grad = self.actor_grads[active_params[0]:active_params[1]] tf.summary.scalar( 'grads/actor_layer%d_%d' % (n_inact // 2, active_params[1] - active_params[0]), tf.reduce_mean(grad)) grad = self.actor_grads[active_params[-3]:active_params[-2]] tf.summary.scalar( 'grads/actor_layer%d_%d' % (-1, active_params[-2] - active_params[-3]), tf.reduce_mean(grad)) # for train_demo() self.demo_loss = tf.reduce_mean( tf.square(self.obs_delta_kine - self.demo_aprx)) self.demo_max_loss = tf.reduce_max( tf.square(self.obs_delta_kine - self.demo_aprx)) if self.demo_l2_reg > 0.: demo_reg_vars = self.actor.demo_reg_vars for var in demo_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info( ' applying l2 regularization for demo_aprx with {}'.format( self.demo_l2_reg)) self.demo_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.demo_l2_reg), weights_list=demo_reg_vars) self.demo_loss += self.demo_reg else: self.demo_reg = None self.demo_grads = tf_util.flatgrad(self.demo_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.demo_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) # mimic rwd self.mimic_rwd = -self.demo_loss tf.summary.scalar('actor/mimic_rwd', self.mimic_rwd) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') self.normalized_critic_target_tf = tf.clip_by_value( ret_normalize(self.critic_target_Q, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - self.normalized_critic_target_tf)) tf.summary.scalar('critic_loss/Q_diff', self.critic_loss) if self.normalize_returns: tf.summary.scalar('critic_loss/Q_normed_critic', tf.reduce_mean(self.normalized_critic_tf)) tf.summary.scalar('critic_loss/Q_normed_target', tf.reduce_mean(self.normalized_critic_target_tf)) self.critic_loss_step = 0 diff_rwd = tf.reduce_mean(tf.square(self.pred_rwd - self.rewards)) self.critic_loss_step += diff_rwd 
tf.summary.scalar('critic_loss/step_rwd', self.critic_loss_step) critic_kine_factor = 100 diff_obs = tf.reduce_mean(tf.square(self.pred_obs_delta - self.obs_delta_kstates), axis=0) diff_obs_kine = tf.reduce_mean( diff_obs[:self.nb_demo_kine]) * critic_kine_factor diff_obs_rest = tf.reduce_mean(diff_obs[self.nb_demo_kine:]) self.critic_loss_step += (diff_obs_kine + diff_obs_rest) tf.summary.scalar( 'critic_loss/step_kstates_kine_x%d' % critic_kine_factor, diff_obs_kine) tf.summary.scalar('critic_loss/step_kstates_rest', diff_obs_rest) tf.summary.scalar('critic_loss/step_total', self.critic_loss_step) self.critic_loss += self.critic_loss_step if self.critic_l2_reg > 0.: critic_reg_vars = self.critic.reg_vars for var in critic_reg_vars: logger.debug(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg tf.summary.scalar('critic_loss/reg', critic_reg) critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_params = [0] * (len(self.critic.trainable_vars) + 1) for i, shape in enumerate(critic_shapes): critic_params[i + 1] = critic_params[i] + np.prod(shape) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_params)) logger.info(' critic total: {}'.format(critic_params[-1])) self.critic_grads = tf_util.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) # todo: make the following general grad = self.critic_grads[critic_params[0]:critic_params[1]] tf.summary.scalar( 'grads/critic_layer%d_%d' % (0, critic_params[1] - critic_params[0]), tf.reduce_mean(grad)) grad = self.critic_grads[critic_params[-3]:critic_params[-2]] tf.summary.scalar( 'grads/critic_layer%d_rwd_%d' % (-1, critic_params[-2] - critic_params[-3]), tf.reduce_mean(grad)) grad = self.critic_grads[critic_params[-7]:critic_params[-6]] tf.summary.scalar( 'grads/critic_layer%d_q_%d' % (-1, critic_params[-6] - critic_params[-7]), tf.reduce_mean(grad)) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['zrms/ret_mean', 'zrms/ret_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean[:self.nb_demo_kine]), tf.reduce_mean(self.obs_rms.std[:self.nb_demo_kine]) ] names += ['zrms/obs_kine_mean', 'zrms/obs_kine_std'] ops += [ tf.reduce_mean(self.obs_rms.mean[:self.nb_key_states]), tf.reduce_mean(self.obs_rms.std[:self.nb_key_states]) ] names += ['zrms/obs_kstates_mean', 'zrms/obs_kstates_std'] ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['zrms/obs_mean', 'zrms/obs_std'] # for debugging partial normalisation for o_i in [self.nb_obs_org - 1, self.nb_obs_org]: ops += [self.obs0[0, o_i], self.normalized_obs0[0, o_i]] names += ['zobs_dbg_%d' % o_i, 'zobs_dbg_%d_normalized' % o_i] ops += [tf.reduce_mean(self.critic_tf)] names += ['zref/Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['zref/Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['zref/Q_tf_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['zref/Q_tf_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['zref/action_mean'] ops += [reduce_std(self.actor_tf)] names += ['zref/action_std'] ops += [tf.reduce_mean(self.mimic_rwd)] names += ['zref/mimic_rwd'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['zref/action_ptb_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['zref/action_ptb_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, step, apply_param_noise=True, apply_action_noise=True, compute_Q=True, rollout_log=False): if self.param_noise is not None and apply_param_noise: actor_tf = self.perturbed_actor_tf info = 'ptb' else: actor_tf = self.actor_tf info = 'org' feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() # actor output is [0,1], no need to denormalise. 
        # action = act_denorm(action)
        if rollout_log:
            summary_list = [('the_action/%d_rollout_%s' % (i, info), a)
                            for i, a in enumerate(action)]

        if self.action_noise is not None and apply_action_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        else:
            noise = None
        action = np.clip(action, self.action_range[0], self.action_range[1])

        if rollout_log:
            if noise is not None:
                summary_list += [('the_action/%d_rollout_noise' % i, a)
                                 for i, a in enumerate(noise)]
            self.add_list_summary(summary_list, step)

        return action, q

    def store_transition(self, storage, obs0, action, reward, obs1, terminal1):
        '''Store one experience.'''
        reward *= self.reward_scale
        storage.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def store_multrans(self, storage, obs0, action, reward, obs1, terminal1):
        '''Store multiple experiences.'''
        for i in range(len(reward)):
            storage.append(obs0[i], action[i], reward[i] * self.reward_scale,
                           obs1[i], terminal1[i])
        if self.normalize_observations:
            self.obs_rms.update(np.vstack(obs0))

    def train_demo(self, obs0_pos, obs1_pos, obs0_neg, obs1_neg, step,
                   neg_pct=1.0, lr_decay=1.0):
        # Gradients are calculated for the positive and negative data separately,
        # then combined for the gradient update, because only positive data are
        # used in eval mode. The losses evaluated here are those computed before
        # the gradient update.
        ops = [self.demo_grads, self.demo_loss, self.demo_max_loss, self.actor_Q]
        pos_grads, demo_loss, max_loss, actor_Q = self.sess.run(
            ops,
            feed_dict={
                self.obs0: obs0_pos,
                self.obs1: obs1_pos,
            })
        ops = [self.demo_grads, self.demo_loss]
        neg_grads, neg_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: obs0_neg,
                self.obs1: obs1_neg,
            })
        comb_grads = pos_grads - neg_grads * neg_pct
        self.demo_optimizer.update(comb_grads, stepsize=self.demo_lr * lr_decay)

        if self.demo_reg is not None:
            demo_reg = self.sess.run(self.demo_reg)
        else:
            demo_reg = 0

        # Sanity-check the training: log the mean gradients of one actor layer.
        pos_g = pos_grads[self.actor_params[2]:self.actor_params[3]]
        neg_g = neg_grads[self.actor_params[2]:self.actor_params[3]]
        comb_g = comb_grads[self.actor_params[2]:self.actor_params[3]]
        summary_list = [
            ('demo_loss/train_pos', demo_loss),
            ('demo_loss/train_max', max_loss),
            ('demo_loss/train_neg', neg_loss),
            ('grads/demo_pos_layer%d_%d' % (1, len(pos_g)), np.mean(pos_g)),
            ('grads/demo_neg_layer%d_%d' % (1, len(neg_g)), np.mean(neg_g)),
            ('grads/demo_comb_layer%d_%d' % (1, len(comb_g)), np.mean(comb_g)),
            ('actor/Q', actor_Q),
            ('demo_loss/reg', demo_reg)
        ]
        self.add_list_summary(summary_list, step)
        return demo_loss

    def test_demo(self, obs0, obs1):
        loss_mean, loss_max = self.sess.run(
            [self.demo_loss, self.demo_max_loss],
            feed_dict={
                self.obs0: obs0,
                self.obs1: obs1,
            })
        return loss_mean, loss_max

    def eval_demo(self, obs0):
        return self.sess.run(self.demo_aprx, feed_dict={self.obs0: obs0})

    def get_mimic_rwd(self, obs0, obs1):
        mimic_rwd, demo_aprx = self.sess.run(
            [self.mimic_rwd, self.demo_aprx],
            feed_dict={
                self.obs0: obs0,
                self.obs1: obs1
            })
        return mimic_rwd, demo_aprx

    def train_main(self, step):
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            ops = [self.ret_rms.mean, self.ret_rms.std,
                   self.target_Q_obs0, self.target_Q_obs1]
            old_mean, old_std, target_Q_obs0, target_Q_obs1 = self.sess.run(
                ops,
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q_obs0.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })
            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run(
            #     [self.target_Q_obs0, self.ret_rms.mean, self.ret_rms.std],
            #     feed_dict={
            #         self.obs1: batch['obs1'],
            #         self.rewards: batch['rewards'],
            #         self.terminals1: batch['terminals1'].astype('float32'),
            #     })
            # print(target_Q_new, target_Q_obs0, new_mean, new_std)
            # assert (np.abs(target_Q_obs0 - target_Q_new) < 1e-3).all()
        else:
            ops = [self.target_Q_obs0, self.target_Q_obs1]
            target_Q_obs0, target_Q_obs1 = self.sess.run(
                ops,
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32')
                })

        summary_list = [
            ('critic_loss/Q_target_obs1_mean', np.mean(target_Q_obs1)),
            ('critic_loss/Q_target_obs1_std', np.std(target_Q_obs1)),
            ('critic_loss/Q_target_obs0_mean', np.mean(target_Q_obs0)),
            ('critic_loss/Q_target_obs0_std', np.std(target_Q_obs0))
        ]
        self.add_list_summary(summary_list, step)

        # Get all gradients and perform a synced update.
        ops = [self.main_summaries, self.actor_grads, self.actor_loss,
               self.critic_grads, self.critic_loss]
        main_summaries, actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target_Q: target_Q_obs0,
                self.rewards: batch['rewards'],
                self.obs1: batch['obs1']
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)
        if self.train_writer:
            self.train_writer.add_summary(main_summaries, step)
        return critic_loss, actor_loss

    def initialize(self, sess, start_ckpt=None):
        self.sess = sess
        if start_ckpt:
            self.saver.restore(sess, start_ckpt)
        else:
            self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.demo_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def store_ckpt(self, save_path, epoch):
        if self.save_ckpt:
            self.saver.save(self.sess, save_path, global_step=epoch)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self, storage):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = storage.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.obs1: self.stats_sample['obs1'],
                                   self.actions: self.stats_sample['actions'],
                               })
        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))
        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}
        return stats

    def adapt_param_noise(self, step):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
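        # Adaptation heuristic: measure the action-space distance between the
        # unperturbed policy and an adaptively perturbed copy on a sampled batch,
        # then let AdaptiveParamNoiseSpec increase the parameter-noise stddev when
        # that distance is below the desired action stddev and decrease it otherwise.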
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev: self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0: batch['obs0'],
                                     self.param_noise_stddev: self.param_noise.current_stddev,
                                 })

        mean_distance = MPI.COMM_WORLD.allreduce(
            distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        self.add_list_summary([('param_noise/distance', mean_distance)], step)
        self.add_list_summary(
            [('param_noise/std', self.param_noise.current_stddev)], step)
        return mean_distance

    def reset(self):
        '''Reset internal state after an episode is complete.'''
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev: self.param_noise.current_stddev,
                          })

    def add_list_summary(self, summary_raw, step):

        def summary_val(k, v):
            kwargs = {'tag': k, 'simple_value': v}
            return tf.Summary.Value(**kwargs)

        if self.train_writer:
            summary_list = [summary_val(tag, val) for tag, val in summary_raw]
            self.train_writer.add_summary(tf.Summary(value=summary_list), step)
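# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original agent): the positive/negative
# gradient combination performed inside train_demo, written out in plain NumPy
# so it can be inspected in isolation. pos_grads / neg_grads stand in for the
# flattened actor gradients computed from positive and negative demonstration
# batches; neg_pct down-weights the negative contribution, as in the method.
def _combine_demo_grads_sketch(pos_grads, neg_grads, neg_pct=1.0):
    # comb = pos - neg_pct * neg: follow the positive-demo gradient while
    # pushing away from the negative-demo gradient before the optimizer update.
    return np.asarray(pos_grads) - neg_pct * np.asarray(neg_grads)

# Example: _combine_demo_grads_sketch([0.2, -0.1], [0.4, 0.4], neg_pct=0.5)
# returns array([ 0. , -0.3]).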
def learn(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=7,  # 50
        nb_rollout_steps=3,  # 100
        reward_scale=1.0,
        render=False,
        render_eval=False,
        # noise_type='adaptive-param_0.2',
        # noise_type='normal_0.2',  # large noise
        # noise_type='normal_0.02',  # small noise
        noise_type='normal_2.0',  # the action range is 360, so the noise scale should be chosen accordingly
        # noise_type='normal_5',  # large noise
        # noise_type='normal_0.2',  # small noise
        # noise_type='normal_0.00001',  # no noise
        # noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,  # large lr
        critic_lr=1e-3,  # large lr
        # actor_lr=1e-7,  # small lr
        # critic_lr=1e-3,  # small lr
        # actor_lr=1e-10,  # no lr
        # critic_lr=1e-10,  # no lr
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker, 50
        nb_eval_steps=1,  # 100
        batch_size=640,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  # 50
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()

    nb_actions = env.num_actions
    action_shape = np.array(nb_actions * [0]).shape

    # 4 pairs pos + 3 link lengths
    # nb_features = 2 * (env.num_actions + 1) + env.num_actions
    # 4 pairs pos + 1 pair target pos
    nb_features = 2 * (env.num_actions + 2)
    observation_shape = np.array(nb_features * [0]).shape
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
    #                 observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))
    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor, critic, memory, observation_shape, action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    # sess.graph.finalize()
    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    episode_end_distance = []
    epoch_episodes = 0
    SPARSE_REWARD = False

    # Load the initialization policy; this also makes otherwise non-initialized
    # variables initialized.
    agent.load_ini(sess, save_path)

    for epoch in range(nb_epochs):
        print('epochs: ', epoch)
        obs = env.reset()
        agent.save(save_path)
        epoch_episode_rewards = []

        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # When simulating multiple envs in parallel, it is impossible to
                # reset the agent at the end of the episode in each of the
                # environments, so we reset here instead.
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
                # print('action:', action)

                if SPARSE_REWARD:
                    new_obs, r, done, end_distance = env.step(action, SPARSE_REWARD)
                else:
                    new_obs, r, done = env.step(action, SPARSE_REWARD)
                t += 1
                episode_reward += r
                episode_step += 1
                # print('episode_re: ', episode_reward)  # [1.]

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                # The batched data will be unrolled in memory.py's append.
                agent.store_transition(obs, action, r, new_obs, done)
                # print('r: ', r)  # r shape: (1,)
                obs = new_obs

            epoch_episode_rewards.append(episode_reward)
            if cycle == nb_epoch_cycles - 1:
                # Record the distance from the reacher's end position to the goal
                # for the last step of each episode.
                if SPARSE_REWARD:
                    episode_end_distance.append(end_distance)
                else:
                    end_distance = 100.0 / r - 1
                    episode_end_distance.append(end_distance[0])
            episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            # Fill the memory with the noise-perturbed initialization policy and
            # pre-update the critic networks before training the actor.
            preheating_step = 30  # 50 episodes = 600 steps, 12 steps per episode
            if epoch > preheating_step:
                # print('memory_entries: ', memory.nb_entries)
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)
                    # print('Train!')
                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()
            else:
                # Update the two critic networks at the start.
                cl = agent.update_critic()
                epoch_critic_losses.append(cl)
                print('critic loss in initial training: ', cl)

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r
                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()

        # Log stats.
        # XXX shouldn't call np.mean on variable-length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set, mean_epoch_episode_rewards)
        step_set.append(t)
        plt.figure(1)
        plt.plot(step_set, mean_epoch_episode_rewards)
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean.png')

        plt.figure(2)
        plt.plot(step_set, episode_end_distance)
        plt.xlabel('Steps')
        plt.ylabel('Distance to Target')
        plt.savefig('ddpgini_distance.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array([np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    print('stepset: ', step_set)
    print('rewards: ', mean_epoch_episode_rewards)
    print('distances: ', episode_end_distance)
    return agent
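# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original training loop): the bookkeeping
# `end_distance = 100.0 / r - 1` in `learn` implicitly assumes a dense reward of
# the form r = 100 / (1 + distance). The environment code is not shown here, so
# treat that reward shape as an assumption; under it, the inversion recovers the
# end-effector-to-goal distance exactly.
def _check_distance_recovery(distance=3.0):
    assumed_reward = 100.0 / (1.0 + distance)  # assumed dense-reward shape
    recovered = 100.0 / assumed_reward - 1.0   # inversion used in `learn`
    assert abs(recovered - distance) < 1e-9
    return recovered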