def _save_ddpg_checkpoint(mu, reward_name, cumulus_steps, avg_return,
                          time_step_reward, performance,
                          avg_time_step_reward):
    """Write the actor weights and the four metric histories to disk.

    Files go to the fixed model directory for ``reward_name``; every file
    name carries ``cumulus_steps`` as a suffix. The directory (including
    missing parents) is created on demand.

    Args:
        mu: actor network; only ``mu.save_weights(path)`` is called.
        reward_name: sub-directory name for this reward variant.
        cumulus_steps: step counter used as the file-name suffix.
        avg_return, time_step_reward, performance, avg_time_step_reward:
            metric histories, each stored with ``np.save``.
    """
    out_dir = "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}".format(
        reward_name)
    # makedirs instead of mkdir: mkdir raises when a parent is missing.
    os.makedirs(out_dir, exist_ok=True)
    mu.save_weights("{}/mu{}.h5".format(out_dir, cumulus_steps))
    histories = (
        ("avg_return", avg_return),
        ("time_step_reward", time_step_reward),
        ("performance", performance),
        ("avg_time_step_reward", avg_time_step_reward),
    )
    for label, values in histories:
        np.save("{}/{}{}".format(out_dir, label, cumulus_steps), values)


def ddpg(episode, breaking_step, reward_name):
    """Train an actor with two critics on 'AntPyBulletEnv-v0'.

    The loop uses twin critics, a clipped noisy target action and an actor
    update every second critic update. Actor weights and metric histories
    are checkpointed the first time the step counter falls into each of a
    fixed set of milestone windows.

    Changes relative to the previous version of this function:
      * the episode score now includes the reward of the terminal step
        (it used to be skipped because the loop broke out first);
      * checkpoint directories are created with ``os.makedirs`` for every
        milestone, including the last one, which previously never created
        the directory;
      * the five copy-pasted checkpoint blocks share one helper.

    Args:
        episode: maximum number of episodes to run.
        breaking_step: training stops after the first episode that pushes
            the cumulative step counter above this value.
        reward_name: name of the output sub-directory for checkpoints.

    Returns:
        Tuple ``(avg_return, mu, performance, time_step_reward,
        avg_time_step_reward)``: the running 50-episode mean of episode
        scores, the trained actor, the raw episode scores, the per-step
        rewards and their running 50-step mean.
    """
    env = gym.make('AntPyBulletEnv-v0')
    cumulus_steps = 0

    # randomly initialize critics and actor with weights and biases
    q1 = CriticNN()
    q1.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    q2 = CriticNN()
    q2.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    mu = ActorNN(env.action_space.shape[0])
    mu.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    # initialize target networks
    q1_target = CriticNN()
    q1_target.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    q2_target = CriticNN()
    q2_target.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    mu_target = ActorNN(env.action_space.shape[0])
    mu_target.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    # NOTE(review): the initial sync uses the same 0.005 factor as the
    # per-step update; if update_network_parameters is a soft update the
    # targets do not start as exact copies -- confirm this is intended.
    q1_target, q2_target, mu_target = update_network_parameters(
        q1, q1_target, q2, q2_target, mu, mu_target, 0.005)

    # initialize replay buffer
    replay_buffer = ReplayBuffer(1000000, env.observation_space.shape[0],
                                 env.action_space.shape[0])

    performance = []
    avg_return = []
    time_step_reward = []
    avg_time_step_reward = []

    # (low, high) windows of cumulus_steps; each triggers one checkpoint.
    checkpoint_windows = (
        (10000, 11000),
        (150000, 151000),
        (550000, 551000),
        (750000, 751000),
        (1000000, 1001000),
    )
    saved_windows = set()

    # Settings that never change between steps or episodes.
    max_steps = 1000
    min_action = env.action_space.low[0]
    max_action = env.action_space.high[0]
    update_frequency = 2
    batch_size = 100
    noise = NormalActionNoise(0, 0.1)

    for e in range(episode):
        # receive initial observation state
        observation = env.reset()
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        learn_count = 0
        score = 0

        for i in range(max_steps):
            # select an action: random during warm-up, otherwise
            # a_t = mu(state) + noise
            if cumulus_steps < 900:
                action = env.action_space.sample()
            else:
                action = mu(state) + np.random.normal(noise.mean, noise.sigma)
                # Round-trip through a tensor proto to get a numpy array
                # that can be modified in place below.
                proto_tensor = tf.make_tensor_proto(action)
                action = tf.make_ndarray(proto_tensor)
                action = action[0]

            # Action components 2 and 3 are always forced to zero.
            action[2] = 0
            action[3] = 0

            # execute action a_t and observe reward and next state
            next_state, reward, done, _ = env.step(action)
            reward_list = env.env.rewards
            z_pos = env.env.robot.body_xyz[2]
            fwp = reward_list[1]
            if fwp > 0:
                # Shaping: add reward_list[1] scaled by the body z value.
                reward = reward + fwp * z_pos

            # store transition in replay buffer
            replay_buffer.store_transition(state, action, reward, next_state,
                                           done)

            # learn once the buffer holds at least one batch
            if replay_buffer.mem_cntr >= batch_size:
                (buff_state, buff_action, buff_reward, buff_next_state,
                 buff_done) = replay_buffer.sample_buffer(batch_size)

                states = tf.convert_to_tensor(buff_state, dtype=tf.float32)
                next_states = tf.convert_to_tensor(buff_next_state,
                                                   dtype=tf.float32)
                rewards = tf.convert_to_tensor(buff_reward, dtype=tf.float32)
                actions = tf.convert_to_tensor(buff_action, dtype=tf.float32)

                # train critics
                with tf.GradientTape(persistent=True) as tape:
                    # target actor's action plus clipped noise
                    # NOTE(review): np.random.normal(scale=0.2) is a single
                    # scalar, so one noise value is shared by the whole
                    # batch and every action component -- confirm.
                    target_actions = mu_target(next_states) + tf.clip_by_value(
                        np.random.normal(scale=0.2), -0.5, 0.5)
                    target_actions = tf.clip_by_value(target_actions,
                                                      min_action, max_action)

                    # target critics evaluated on next state / target action
                    next_critic_value1 = tf.squeeze(
                        q1_target(next_states, target_actions), 1)
                    next_critic_value2 = tf.squeeze(
                        q2_target(next_states, target_actions), 1)

                    # online critics evaluated on the sampled transition
                    critic_value1 = tf.squeeze(q1(states, actions), 1)
                    critic_value2 = tf.squeeze(q2(states, actions), 1)

                    # use the smaller of the two target estimates
                    next_critic_value = tf.math.minimum(
                        next_critic_value1, next_critic_value2)

                    # y = r + gamma * min_q_target * (1 - done)
                    y = rewards + 0.99 * next_critic_value * (1 - buff_done)

                    critic1_loss = keras.losses.MSE(y, critic_value1)
                    critic2_loss = keras.losses.MSE(y, critic_value2)

                # update both critics by minimising their losses
                critic1_network_gradient = tape.gradient(
                    critic1_loss, q1.trainable_variables)
                critic2_network_gradient = tape.gradient(
                    critic2_loss, q2.trainable_variables)
                # A persistent tape holds its resources until released.
                del tape

                q1.optimizer.apply_gradients(
                    zip(critic1_network_gradient, q1.trainable_variables))
                q2.optimizer.apply_gradients(
                    zip(critic2_network_gradient, q2.trainable_variables))

                learn_count += 1

                # train actor on every `update_frequency`-th critic update
                if learn_count % update_frequency == 0:
                    with tf.GradientTape() as tape:
                        new_policy_actions = mu(states)
                        # negative sign: gradient descent on -Q
                        actor_loss = -q1(states, new_policy_actions)
                        actor_loss = tf.math.reduce_mean(actor_loss)

                    actor_network_gradient = tape.gradient(
                        actor_loss, mu.trainable_variables)
                    mu.optimizer.apply_gradients(
                        zip(actor_network_gradient, mu.trainable_variables))

                    # update the target networks
                    update_network_parameters(q1, q1_target, q2, q2_target,
                                              mu, mu_target, 0.005)

            time_step_reward.append(reward)
            avg_time_step_reward_short = np.mean(time_step_reward[-50:])
            avg_time_step_reward.append(avg_time_step_reward_short)

            # Accumulate before the done check so the terminal step's
            # reward is part of the episode score.
            score += reward

            if done:
                performance.append(score)
                avg_reward = np.mean(performance[-50:])
                avg_return.append(avg_reward)
                # NOTE(review): `i` is the zero-based index of the last
                # step, so this adds one less than the steps executed.
                cumulus_steps += i
                print(
                    "episode: {}/{}, score: {}, avg_score: {}, ep_steps: {}, cumulus_steps: {}"
                    .format(e, episode, score, avg_reward, i, cumulus_steps))

                for window in checkpoint_windows:
                    low, high = window
                    if low < cumulus_steps < high and \
                            window not in saved_windows:
                        saved_windows.add(window)
                        _save_ddpg_checkpoint(mu, reward_name, cumulus_steps,
                                              avg_return, time_step_reward,
                                              performance,
                                              avg_time_step_reward)
                break

            state = tf.convert_to_tensor([next_state], dtype=tf.float32)

        # stop learning after certain time steps
        if cumulus_steps > breaking_step:
            break

    return avg_return, mu, performance, time_step_reward, avg_time_step_reward
def learn(
        env,
        seed=None,
        total_timesteps=1e6,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_rollout_steps=100,
        max_ep_len=250,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        start_steps=10000,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=100,
        nb_log_steps=None,
        nb_save_steps=None,
        batch_size=64,  # per MPI worker
        polyak=0.01,
        action_range=(-250.0, 250.0),
        observation_range=(-5.0, 5.0),
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        eval_env=None,
        load_path=None,
        save_dir=None,
        **network_kwargs):
    """Build a TD3 agent for ``env`` and run its interaction/training loop.

    Changes relative to the previous version of this function:
      * the per-env step-limit check reads ``episode_step[d]`` instead of
        comparing the whole step array (ambiguous for more than one env);
      * logging is skipped when ``nb_log_steps`` is None or 0 instead of
        raising on the modulo;
      * the never-read ``epoch_actions`` / ``epoch_qs`` lists, which grew
        by one entry per step, and other dead locals were removed.

    Args:
        env: vectorised environment; ``num_envs``, dict observations with
            an ``'observation_self'`` entry and dict actions with an
            ``'action_movement'`` entry are read here.
        seed: passed to ``set_global_seeds``.
        total_timesteps: number of loop iterations (cast to int).
        nb_epochs, nb_rollout_steps: forwarded to ``TD3`` as ``epochs`` and
            ``steps_per_epoch``.
        max_ep_len: step count at which an episode is cut and the agent is
            trained.
        noise_type: comma-separated noise specs (``none``,
            ``adaptive-param_<std>``, ``normal_<std>``, ``ou_<std>``).
        normalize_returns, normalize_observations, observation_range,
        action_range, target_noise, noise_clip: forwarded to both
            ``Td3Policy`` instances (the last two also to ``TD3``).
        actor_lr, critic_lr, gamma, polyak, batch_size, start_steps,
        policy_delay: forwarded to ``TD3``; ``start_steps`` also selects
            how long random actions are sampled.
        nb_log_steps: log every this many steps; None/0 disables logging.
        nb_save_steps: save variables every this many steps; None/0
            disables saving.
        eval_env: if given, it is reset once and its state may be pickled.
        load_path: if given, variables are loaded from it before
            ``agent.initialize``.
        save_dir: checkpoint root; defaults to the logger directory.
        reward_scale, render, render_eval, critic_l2_reg, popart,
        clip_norm, nb_train_steps, nb_eval_steps, **network_kwargs:
            accepted for interface compatibility; not read in this function.

    Returns:
        The ``TD3`` agent.

    Raises:
        RuntimeError: if ``noise_type`` contains an unknown spec.
    """
    set_global_seeds(seed)

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    memory = Memory(limit=int(1e6))

    # Actor: two hidden ReLU layers and a single tanh output unit.
    network_spec = [{
        'layer_type': 'dense',
        'units': 256,
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': 128,
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': 1,
        'activation': 'tanh',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    # Critic: action and observation are concatenated before the layers.
    vnetwork_spec = [{
        'layer_type': 'concat',
        'nodes_in': ['action_movement', 'observation_self'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': 256,
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': 128,
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': 1,
        'activation': '',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    # Keyword arguments shared by the online and the target policy.
    policy_kwargs = dict(ob_space=env.observation_space,
                         ac_space=env.action_space,
                         network_spec=network_spec,
                         v_network_spec=vnetwork_spec,
                         stochastic=False,
                         reuse=False,
                         build_act=True,
                         trainable_vars=None,
                         not_trainable_vars=None,
                         gaussian_fixed_var=False,
                         weight_decay=0.0,
                         ema_beta=0.99999,
                         normalize_observations=normalize_observations,
                         normalize_returns=normalize_returns,
                         observation_range=observation_range,
                         action_range=action_range,
                         target_noise=target_noise,
                         noise_clip=noise_clip)
    network = Td3Policy(scope="td3", **policy_kwargs)
    target_network = Td3Policy(scope="target", isTarget=True, **policy_kwargs)

    action_noise = None
    # NOTE(review): param_noise is parsed but never handed to the agent.
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = NormalActionNoise(
                        mu=np.zeros(act_size),
                        sigma=float(stddev) * np.ones(act_size))
            elif 'ou' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(act_size),
                        sigma=float(stddev) * np.ones(act_size))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = action_range[1]
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = TD3(env,
                network,
                target_network,
                memory,
                env.action_space,
                env.observation_space,
                steps_per_epoch=nb_rollout_steps,
                epochs=nb_epochs,
                gamma=gamma,
                polyak=polyak,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                batch_size=batch_size,
                start_steps=start_steps,
                action_noise=action_noise,
                target_noise=target_noise,
                noise_clip=noise_clip,
                policy_delay=policy_delay)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    saver = functools.partial(save_variables, sess=sess)
    loader = functools.partial(load_variables, sess=sess)
    # NOTE(review): loading happens before agent.initialize(sess); if
    # initialize re-initialises variables the loaded values are lost.
    if load_path is not None:
        loader(load_path)

    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_env.reset()
    nenvs = env.num_envs
    n_agents = obs['observation_self'].shape[0]

    episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    epoch = 0  # never advanced; only reported as 'total/epochs'

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_episodes = 0

    for t in range(int(total_timesteps)):
        # Until start_steps have elapsed, randomly sample actions from the
        # action space for better exploration. Afterwards, use the learned
        # policy (with noise).
        nenvs_actions = []
        if t > start_steps:
            action, _, _, _ = agent.step(obs,
                                         apply_noise=True,
                                         compute_Q=True)
            for i in range(nenvs):
                nenv_action = {
                    'action_movement':
                    action['action_movement'][i * n_agents:(i + 1) *
                                              n_agents]
                }
                nenvs_actions.append(nenv_action)
        else:
            action = env.action_space.sample()
            for i in range(nenvs):
                nenv_action = {
                    'action_movement':
                    action['action_movement'][i * n_agents:(i + 1) *
                                              n_agents][0]
                }
                nenvs_actions.append(nenv_action)

        new_obs, r, done, info = env.step(nenvs_actions)

        episode_reward += r
        episode_step += 1

        # Clear the done flag of an env whose step counter just reached
        # max_ep_len, so the stored transition is not marked terminal.
        for d in range(len(done)):
            done[d] = False if episode_step[d] == max_ep_len else done[d]

        # the batched data will be unrolled in memory.py's append.
        agent.store_transition(obs, action, r, new_obs, done)

        obs = new_obs

        for d in range(len(done)):
            if done[d]:
                # Episode done.
                epoch_episode_rewards.append(episode_reward[d])
                episode_rewards_history.append(episode_reward[d])
                epoch_episode_steps.append(episode_step[d])
                episode_reward[d] = 0.
                episode_step[d] = 0
                epoch_episodes += 1
                episodes += 1
                if nenvs == 1:
                    agent.reset()

        episode_actor_losses = []
        episode_critic_losses = []
        episode_critic = []
        episode_critic_twin = []

        # NOTE(review): `d` is the index left over from the loop above
        # (0 for a single env), not a done flag; with more than one env
        # this branch is taken on every step -- confirm the intent.
        if d or (episode_step[0] == max_ep_len):
            # Perform all TD3 updates at the end of the trajectory (in
            # accordance with source code of TD3 published by original
            # authors).
            for j in range(episode_step[0]):
                critic_loss, critic, critic_twin, actor_loss = agent.train(
                    episode_step[0])

                episode_critic_losses.append(critic_loss)
                episode_critic.append(critic)
                episode_critic_twin.append(critic_twin)
                if actor_loss is not None:
                    episode_actor_losses.append(actor_loss)

            obs, r, done, episode_reward, episode_step = env.reset(
            ), 0, False, np.zeros((nenvs, n_agents),
                                  dtype=np.float32), np.zeros(nenvs, dtype=int)

        # Guard against the default nb_log_steps=None (and 0).
        if nb_log_steps and (t + 1) % nb_log_steps == 0:
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_std'] = np.std(
                epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/return_history_std'] = np.std(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['train/loss_actor'] = np.mean(episode_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(
                episode_critic_losses)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])

            if rank == 0:
                logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

        if nb_save_steps and (t + 1) % nb_save_steps == 0:
            if save_dir is None:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
            else:
                checkdir = osp.join(save_dir, 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % t)
            print('Saving to', savepath)
            saver(savepath)

    return agent
def __init__(self, env, args):
    """Set up a DDPG agent from an environment and parsed arguments.

    Copies the hyper-parameters from ``args`` onto ``self``, builds the
    actor/critic networks, optionally restores a checkpoint, builds target
    networks and optimisers unless ``args.test`` is set, selects the
    exploration noise and creates the replay memory. Finishes by calling
    ``self.cuda()``.

    Args:
        env: environment; ``observation_space.shape[0]`` and ``goal_dim``
            are read here.
        args: namespace with the hyper-parameters read below.
    """
    obs_size = env.observation_space.shape[0]
    act_size = 7  # fixed action dimensionality

    # Dimensions and environment handle.
    self.env = env
    self.ob_dim = obs_size
    self.ac_dim = act_size
    self.goal_dim = env.goal_dim

    # Hyper-parameters taken over verbatim from the arguments.
    self.num_iters = args.num_iters
    self.random_prob = args.random_prob
    self.tau = args.tau
    self.reward_scale = args.reward_scale
    self.gamma = args.gamma
    self.log_interval = args.log_interval
    self.save_interval = args.save_interval
    self.rollout_steps = args.rollout_steps
    self.batch_size = args.batch_size
    self.train_steps = args.train_steps
    self.warmup_iter = args.warmup_iter
    self.max_grad_norm = args.max_grad_norm
    self.use_her = args.her
    self.k_future = args.k_future
    self.pretrain_dir = args.pretrain_dir

    # Book-keeping state.
    self.closest_dist = np.inf
    self.global_step = 0
    self.model_dir = os.path.join(args.save_dir, 'model')
    os.makedirs(self.model_dir, exist_ok=True)

    # All four networks are built from the same keyword arguments.
    net_kwargs = {
        'ob_dim': obs_size,
        'act_dim': act_size,
        'hid1_dim': args.hid1_dim,
        'hid2_dim': args.hid2_dim,
        'hid3_dim': args.hid3_dim,
        'init_method': args.init_method,
    }
    self.actor = Actor(**net_kwargs)
    self.critic = Critic(**net_kwargs)

    wants_checkpoint = (args.resume or args.test
                        or args.pretrain_dir is not None)
    if wants_checkpoint:
        self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)

    if not args.test:
        # Targets and optimisers are only needed for training.
        self.actor_target = Actor(**net_kwargs)
        self.critic_target = Critic(**net_kwargs)
        self.actor_optim = self.construct_optim(self.actor,
                                                lr=args.actor_lr)
        self.critic_optim = self.construct_optim(
            self.critic,
            lr=args.critic_lr,
            weight_decay=args.critic_weight_decay)
        for target_net, online_net in ((self.actor_target, self.actor),
                                       (self.critic_target, self.critic)):
            self.hard_update(target_net, online_net)
        self.actor_target.eval()
        self.critic_target.eval()

    # Exploration noise; any other noise_type leaves action_noise unset.
    noise_kind = args.noise_type
    if noise_kind == 'ou_noise':
        self.action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(act_size),
            sigma=float(args.ou_noise_std) * np.ones(act_size))
    elif noise_kind == 'uniform':
        self.action_noise = UniformNoise(
            low_limit=args.uniform_noise_low,
            high_limit=args.uniform_noise_high,
            dec_step=args.max_noise_dec_step)
    elif noise_kind == 'gaussian':
        self.action_noise = NormalActionNoise(
            mu=np.zeros(act_size),
            sigma=args.normal_noise_std * np.ones(act_size))

    self.memory = Memory(limit=int(args.memory_limit),
                         action_shape=(int(act_size), ),
                         observation_shape=(int(obs_size), ))
    self.critic_loss = nn.MSELoss()

    # Optional running observation statistics.
    self.ob_norm = args.ob_norm
    self.obs_oms = OnlineMeanStd(shape=(1, obs_size)) \
        if self.ob_norm else None

    self.cuda()
class DDPG:
    """DDPG agent (actor, critic and their target copies) for a goal env.

    Changes relative to the previous version of this class:
      * gradient clipping uses ``torch.nn.utils.clip_grad_norm_`` instead
        of the deprecated ``clip_grad_norm``;
      * ``load_model`` builds the pre-train path from its ``pretrain_dir``
        argument rather than from ``self.pretrain_dir``;
      * ``policy`` runs the forward pass without autograd and restores the
        actor's previous train/eval mode instead of always forcing train.
    """

    def __init__(self, env, args):
        """Build networks, optimisers, noise and replay memory.

        Args:
            env: environment; ``observation_space.shape[0]`` and
                ``goal_dim`` are read here.
            args: namespace holding the hyper-parameters read below.
        """
        ob_space = env.observation_space
        goal_dim = env.goal_dim
        ob_dim = ob_space.shape[0]
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim = 7
        self.goal_dim = goal_dim
        self.num_iters = args.num_iters
        self.random_prob = args.random_prob
        self.tau = args.tau
        self.reward_scale = args.reward_scale
        self.gamma = args.gamma
        self.log_interval = args.log_interval
        self.save_interval = args.save_interval
        self.rollout_steps = args.rollout_steps
        self.env = env
        self.batch_size = args.batch_size
        self.train_steps = args.train_steps
        self.closest_dist = np.inf
        self.warmup_iter = args.warmup_iter
        self.max_grad_norm = args.max_grad_norm
        self.use_her = args.her
        self.k_future = args.k_future
        self.model_dir = os.path.join(args.save_dir, 'model')
        self.pretrain_dir = args.pretrain_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.global_step = 0

        # Online and target networks share the same constructor arguments.
        net_kwargs = dict(ob_dim=ob_dim,
                          act_dim=ac_dim,
                          hid1_dim=args.hid1_dim,
                          hid2_dim=args.hid2_dim,
                          hid3_dim=args.hid3_dim,
                          init_method=args.init_method)
        self.actor = Actor(**net_kwargs)
        self.critic = Critic(**net_kwargs)
        if args.resume or args.test or args.pretrain_dir is not None:
            self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)
        if not args.test:
            self.actor_target = Actor(**net_kwargs)
            self.critic_target = Critic(**net_kwargs)
            self.actor_optim = self.construct_optim(self.actor,
                                                    lr=args.actor_lr)
            cri_w_decay = args.critic_weight_decay
            self.critic_optim = self.construct_optim(self.critic,
                                                     lr=args.critic_lr,
                                                     weight_decay=cri_w_decay)
            self.hard_update(self.actor_target, self.actor)
            self.hard_update(self.critic_target, self.critic)

            self.actor_target.eval()
            self.critic_target.eval()

        # Any other noise_type leaves self.action_noise unset.
        if args.noise_type == 'ou_noise':
            mu = np.zeros(ac_dim)
            sigma = float(args.ou_noise_std) * np.ones(ac_dim)
            self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu,
                                                             sigma=sigma)
        elif args.noise_type == 'uniform':
            low_limit = args.uniform_noise_low
            high_limit = args.uniform_noise_high
            dec_step = args.max_noise_dec_step
            self.action_noise = UniformNoise(low_limit=low_limit,
                                             high_limit=high_limit,
                                             dec_step=dec_step)
        elif args.noise_type == 'gaussian':
            mu = np.zeros(ac_dim)
            sigma = args.normal_noise_std * np.ones(ac_dim)
            self.action_noise = NormalActionNoise(mu=mu, sigma=sigma)

        self.memory = Memory(limit=int(args.memory_limit),
                             action_shape=(int(ac_dim), ),
                             observation_shape=(int(ob_dim), ))
        self.critic_loss = nn.MSELoss()
        self.ob_norm = args.ob_norm
        if self.ob_norm:
            self.obs_oms = OnlineMeanStd(shape=(1, ob_dim))
        else:
            self.obs_oms = None

        self.cuda()

    def test(self, render=False, record=True, slow_t=0):
        """Run one evaluation rollout and print the mean final distance."""
        dist, succ_rate = self.rollout(render=render,
                                       record=record,
                                       slow_t=slow_t)
        print('Final step distance: ', dist)

    def train(self):
        """Run the training loop from ``global_step`` up to ``num_iters``.

        Every iteration collects ``rollout_steps`` transitions, optionally
        adds relabelled (HER) transitions, trains for ``train_steps``
        updates once past the warm-up, and periodically logs, evaluates
        and saves a checkpoint.
        """
        self.net_mode(train=True)
        tfirststart = time.time()
        epoch_episode_rewards = deque(maxlen=1)
        epoch_episode_steps = deque(maxlen=1)
        total_rollout_steps = 0
        for epoch in range(self.global_step, self.num_iters):
            episode_reward = 0
            episode_step = 0
            self.action_noise.reset()
            obs = self.env.reset()
            obs = obs[0]
            epoch_actor_losses = []
            epoch_critic_losses = []
            if self.use_her:
                ep_experi = {
                    'obs': [],
                    'act': [],
                    'reward': [],
                    'new_obs': [],
                    'ach_goals': [],
                    'done': []
                }
            for t_rollout in range(self.rollout_steps):
                total_rollout_steps += 1
                ran = np.random.random(1)[0]
                # Random action during warm-up (unless pre-trained) or with
                # probability random_prob; otherwise the noisy policy.
                if self.pretrain_dir is None and epoch < self.warmup_iter or \
                        ran < self.random_prob:
                    act = self.random_action().flatten()
                else:
                    act = self.policy(obs).flatten()
                new_obs, r, done, info = self.env.step(act)
                ach_goals = new_obs[1].copy()
                new_obs = new_obs[0].copy()
                episode_reward += r
                episode_step += 1
                self.memory.append(obs, act, r * self.reward_scale, new_obs,
                                   ach_goals, done)
                if self.use_her:
                    ep_experi['obs'].append(obs)
                    ep_experi['act'].append(act)
                    ep_experi['reward'].append(r * self.reward_scale)
                    ep_experi['new_obs'].append(new_obs)
                    ep_experi['ach_goals'].append(ach_goals)
                    ep_experi['done'].append(done)
                if self.ob_norm:
                    self.obs_oms.update(new_obs)
                obs = new_obs
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            if self.use_her:
                for t in range(episode_step - self.k_future):
                    ob = ep_experi['obs'][t]
                    act = ep_experi['act'][t]
                    new_ob = ep_experi['new_obs'][t]
                    ach_goal = ep_experi['ach_goals'][t]
                    # The step itself plus k_future - 1 distinct later steps
                    # provide the substitute goals.
                    k_futures = np.random.choice(np.arange(
                        t + 1, episode_step),
                                                 self.k_future - 1,
                                                 replace=False)
                    k_futures = np.concatenate((np.array([t]), k_futures))
                    for future in k_futures:
                        new_goal = ep_experi['ach_goals'][future]
                        # The last goal_dim observation entries are replaced
                        # by the substitute goal.
                        her_ob = np.concatenate(
                            (ob[:-self.goal_dim], new_goal), axis=0)
                        her_new_ob = np.concatenate(
                            (new_ob[:-self.goal_dim], new_goal), axis=0)
                        res = self.env.cal_reward(ach_goal.copy(), new_goal,
                                                  act)
                        her_reward, _, done = res
                        self.memory.append(her_ob, act,
                                           her_reward * self.reward_scale,
                                           her_new_ob, ach_goal.copy(), done)
            self.global_step += 1
            if epoch >= self.warmup_iter:
                for t_train in range(self.train_steps):
                    act_loss, cri_loss = self.train_net()
                    epoch_critic_losses.append(cri_loss)
                    epoch_actor_losses.append(act_loss)

            if epoch % self.log_interval == 0:
                tnow = time.time()
                stats = {}
                if self.ob_norm:
                    stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy())
                    stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy())
                stats['total_rollout_steps'] = total_rollout_steps
                stats['rollout/return'] = safemean(
                    [rew for rew in epoch_episode_rewards])
                stats['rollout/ep_steps'] = safemean(
                    [l for l in epoch_episode_steps])
                if epoch >= self.warmup_iter:
                    stats['actor_loss'] = np.mean(epoch_actor_losses)
                    stats['critic_loss'] = np.mean(epoch_critic_losses)
                stats['epoch'] = epoch
                stats['actor_lr'] = self.actor_optim.param_groups[0]['lr']
                stats['critic_lr'] = self.critic_optim.param_groups[0]['lr']
                stats['time_elapsed'] = tnow - tfirststart
                for name, value in stats.items():
                    logger.logkv(name, value)
                logger.dumpkvs()
            if (epoch == 0 or epoch >= self.warmup_iter) and \
                    self.save_interval and\
                    epoch % self.save_interval == 0 and \
                    logger.get_dir():
                mean_final_dist, succ_rate = self.rollout()
                logger.logkv('epoch', epoch)
                logger.logkv('test/total_rollout_steps', total_rollout_steps)
                logger.logkv('test/mean_final_dist', mean_final_dist)
                logger.logkv('test/succ_rate', succ_rate)

                tra_mean_dist, tra_succ_rate = self.rollout(train_test=True)
                logger.logkv('train/mean_final_dist', tra_mean_dist)
                logger.logkv('train/succ_rate', tra_succ_rate)

                # self.log_model_weights()
                logger.dumpkvs()
                # A checkpoint is "best" when the test distance improves.
                if mean_final_dist < self.closest_dist:
                    self.closest_dist = mean_final_dist
                    is_best = True
                else:
                    is_best = False
                self.save_model(is_best=is_best, step=self.global_step)

    def train_net(self):
        """Run one critic update, one actor update and soft target updates.

        Returns:
            Tuple ``(actor_loss, critic_loss)`` as numpy values.
        """
        batch_data = self.memory.sample(batch_size=self.batch_size)
        for key, value in batch_data.items():
            batch_data[key] = torch.from_numpy(value)
        obs0_t = batch_data['obs0']
        obs1_t = batch_data['obs1']
        obs0_t = self.normalize(obs0_t, self.obs_oms)
        obs1_t = self.normalize(obs1_t, self.obs_oms)
        obs0 = Variable(obs0_t).float().cuda()
        with torch.no_grad():
            vol_obs1 = Variable(obs1_t).float().cuda()

        rewards = Variable(batch_data['rewards']).float().cuda()
        actions = Variable(batch_data['actions']).float().cuda()
        terminals = Variable(batch_data['terminals1']).float().cuda()

        cri_q_val = self.critic(obs0, actions)
        with torch.no_grad():
            target_net_act = self.actor_target(vol_obs1)
            target_net_q_val = self.critic_target(vol_obs1, target_net_act)
            # target_net_q_val.volatile = False
            # Bellman target: r + gamma * Q'(s', mu'(s')) * (1 - terminal)
            target_q_label = rewards
            target_q_label += self.gamma * target_net_q_val * (1 - terminals)
            target_q_label = target_q_label.detach()

        self.actor.zero_grad()
        self.critic.zero_grad()
        cri_loss = self.critic_loss(cri_q_val, target_q_label)
        cri_loss.backward()
        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                           self.max_grad_norm)
        self.critic_optim.step()

        # Clear gradients again so the actor step starts from zero.
        self.critic.zero_grad()
        self.actor.zero_grad()
        net_act = self.actor(obs0)
        net_q_val = self.critic(obs0, net_act)
        act_loss = -net_q_val.mean()
        act_loss.backward()

        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           self.max_grad_norm)
        self.actor_optim.step()

        self.soft_update(self.actor_target, self.actor, self.tau)
        self.soft_update(self.critic_target, self.critic, self.tau)
        return act_loss.cpu().data.numpy(), cri_loss.cpu().data.numpy()

    def normalize(self, x, stats):
        """Return ``(x - mean) / std``, or ``x`` unchanged if no stats."""
        if stats is None:
            return x
        return (x - stats.mean) / stats.std

    def denormalize(self, x, stats):
        """Return ``x * std + mean``, or ``x`` unchanged if no stats."""
        if stats is None:
            return x
        return x * stats.std + stats.mean

    def net_mode(self, train=True):
        """Put the actor and critic into train or eval mode."""
        if train:
            self.actor.train()
            self.critic.train()
        else:
            self.actor.eval()
            self.critic.eval()

    def load_model(self, step=None, pretrain_dir=None):
        """Load actor/critic weights from a checkpoint file.

        Args:
            step: checkpoint step inside ``model_dir``; None selects
                ``model_best.pth``. Ignored when ``pretrain_dir`` is given.
            pretrain_dir: if given, ``model_best.pth`` from this directory
                is loaded, only matching keys are copied and
                ``global_step`` restarts at 0.

        Raises:
            ValueError: if the checkpoint file does not exist.
        """
        model_dir = self.model_dir
        if pretrain_dir is not None:
            # Use the argument, not self.pretrain_dir, so explicit calls
            # load from the directory they asked for.
            ckpt_file = os.path.join(pretrain_dir, 'model_best.pth')
        else:
            if step is None:
                ckpt_file = os.path.join(model_dir, 'model_best.pth')
            else:
                ckpt_file = os.path.join(model_dir,
                                         'ckpt_{:08d}.pth'.format(step))
        if not os.path.isfile(ckpt_file):
            raise ValueError("No checkpoint found at '{}'".format(ckpt_file))
        mutils.print_yellow('Loading checkpoint {}'.format(ckpt_file))
        checkpoint = torch.load(ckpt_file)
        if pretrain_dir is not None:
            actor_dict = self.actor.state_dict()
            critic_dict = self.critic.state_dict()
            # Keep only the entries whose keys exist in the current nets.
            actor_pretrained_dict = {
                k: v
                for k, v in checkpoint['actor_state_dict'].items()
                if k in actor_dict
            }
            critic_pretrained_dict = {
                k: v
                for k, v in checkpoint['critic_state_dict'].items()
                if k in critic_dict
            }
            actor_dict.update(actor_pretrained_dict)
            critic_dict.update(critic_pretrained_dict)
            self.actor.load_state_dict(actor_dict)
            self.critic.load_state_dict(critic_dict)
            self.global_step = 0
        else:
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.global_step = checkpoint['global_step']
        if step is None:
            mutils.print_yellow('Checkpoint step: {}'
                                ''.format(checkpoint['ckpt_step']))

        # Shift the warm-up by the number of iterations already done.
        self.warmup_iter += self.global_step
        mutils.print_yellow('Checkpoint loaded...')

    def save_model(self, is_best, step=None):
        """Save networks and optimiser states; also as best if flagged."""
        if step is None:
            step = self.global_step
        ckpt_file = os.path.join(self.model_dir,
                                 'ckpt_{:08d}.pth'.format(step))
        data_to_save = {
            'ckpt_step': step,
            'global_step': self.global_step,
            'actor_state_dict': self.actor.state_dict(),
            'actor_optimizer': self.actor_optim.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'critic_optimizer': self.critic_optim.state_dict()
        }

        mutils.print_yellow('Saving checkpoint: %s' % ckpt_file)
        torch.save(data_to_save, ckpt_file)
        if is_best:
            torch.save(data_to_save,
                       os.path.join(self.model_dir, 'model_best.pth'))

    def rollout(self, train_test=False, render=False, record=False, slow_t=0):
        """Evaluate the noise-free policy on every test condition.

        Args:
            train_test: use the env's train-test conditions/reset instead
                of the test ones.
            render: render every step; after the evaluation the method
                keeps rendering forever and never returns.
            record: print per-condition distances and summary statistics
                and dump the distances to ``./test_data.json``.
            slow_t: seconds to sleep after each rendered step.

        Returns:
            Tuple ``(mean_final_dist, succ_rate)``.
        """
        test_conditions = self.env.train_test_conditions \
            if train_test else self.env.test_conditions
        done_num = 0
        final_dist = []
        episode_length = []
        for idx in range(test_conditions):
            if train_test:
                obs = self.env.train_test_reset(cond=idx)
            else:
                obs = self.env.test_reset(cond=idx)
            for t_rollout in range(self.rollout_steps):
                obs = obs[0].copy()
                act = self.policy(obs, stochastic=False).flatten()
                obs, r, done, info = self.env.step(act)
                if render:
                    self.env.render()
                    if slow_t > 0:
                        time.sleep(slow_t)
                if done:
                    done_num += 1
                    break
            if record:
                print('dist: ', info['dist'])
            final_dist.append(info['dist'])
            episode_length.append(t_rollout)
        final_dist = np.array(final_dist)
        mean_final_dist = np.mean(final_dist)
        succ_rate = done_num / float(test_conditions)
        if record:
            with open('./test_data.json', 'w') as f:
                json.dump(final_dist.tolist(), f)

            print('\nDist statistics:')
            print("Minimum: {0:9.4f} Maximum: {1:9.4f}"
                  "".format(np.min(final_dist), np.max(final_dist)))
            print("Mean: {0:9.4f}".format(mean_final_dist))
            print("Standard Deviation: {0:9.4f}".format(np.std(final_dist)))
            print("Median: {0:9.4f}".format(np.median(final_dist)))
            print("First quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 25)))
            print("Third quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 75)))
            print('Success rate:', succ_rate)
        if render:
            # Keep rendering so the last frame stays visible.
            while True:
                self.env.render()
        return mean_final_dist, succ_rate

    def log_model_weights(self):
        """Log every parameter of the four networks through the logger."""
        for name, param in self.actor.named_parameters():
            logger.logkv('actor/' + name, param.clone().cpu().data.numpy())
        for name, param in self.actor_target.named_parameters():
            logger.logkv('actor_target/' + name,
                         param.clone().cpu().data.numpy())
        for name, param in self.critic.named_parameters():
            logger.logkv('critic/' + name, param.clone().cpu().data.numpy())
        for name, param in self.critic_target.named_parameters():
            logger.logkv('critic_target/' + name,
                         param.clone().cpu().data.numpy())

    def random_action(self):
        """Return ``ac_dim`` values drawn uniformly from [-1, 1)."""
        act = np.random.uniform(-1., 1., self.ac_dim)
        return act

    def policy(self, obs, stochastic=True):
        """Return the actor's action for one observation.

        Args:
            obs: numpy observation; it is viewed as a single-row batch.
            stochastic: if True the result is passed through
                ``self.action_noise``.

        Returns:
            Numpy array with the (optionally perturbed) action.
        """
        # Remember the current mode so evaluation callers are not flipped
        # into train mode as a side effect.
        was_training = self.actor.training
        self.actor.eval()
        ob = Variable(torch.from_numpy(obs)).float().cuda().view(1, -1)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            act = self.actor(ob)
        act = act.cpu().data.numpy()
        if stochastic:
            act = self.action_noise(act)
        self.actor.train(was_training)
        return act

    def cuda(self):
        """Move the networks (and targets, if present) to the GPU."""
        self.critic.cuda()
        self.actor.cuda()
        if hasattr(self, 'critic_target'):
            self.critic_target.cuda()
            self.actor_target.cuda()
        self.critic_loss.cuda()

    def construct_optim(self, net, lr, weight_decay=None):
        """Create an Adam optimiser for ``net``; None decay means 0."""
        if weight_decay is None:
            weight_decay = 0
        params = mutils.add_weight_decay([net], weight_decay=weight_decay)
        optimizer = optim.Adam(params, lr=lr, weight_decay=weight_decay)
        return optimizer

    def soft_update(self, target, source, tau):
        """Blend in place: target = (1 - tau) * target + tau * source."""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        """Copy every parameter of ``source`` into ``target`` in place."""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
class DDPG:
    """DDPG agent: actor/critic networks, target copies, noise and replay.

    Construct it with either ``config=<dict>`` for a fresh agent or
    ``filename=<path>`` for a file previously written by :meth:`save`.
    """

    def __init__(self, actor_state_size, actor_action_size,
                 critic_state_size, critic_action_size, **kwargs):
        """Build networks, optimizers, exploration noise and replay memory.

        Args:
            actor_state_size: input size passed to the actor networks.
            actor_action_size: output size passed to the actor networks.
            critic_state_size: state input size passed to the critics.
            critic_action_size: action input size passed to the critics.
            **kwargs: must contain ``filename`` (checkpoint written by
                :meth:`save`) or ``config`` (dict of hyper-parameters;
                ``actor_nodes`` and ``critic_nodes`` are required keys).

        Raises:
            OSError: if neither ``filename`` nor ``config`` is supplied.
        """
        if 'filename' in kwargs:
            data = torch.load(kwargs['filename'])
            self.config = data["config"]
            self.scores = data["scores"]
        elif 'config' in kwargs:
            self.config = kwargs['config']
            data = {}
            self.scores = []
        else:
            raise OSError('DDPG: no configuration parameter in class init')

        self.actor_state_size = actor_state_size
        self.actor_action_size = actor_action_size
        self.critic_state_size = critic_state_size
        self.critic_action_size = critic_action_size

        memory_size = self.config.get("memory_size", 100000)
        actor_lr = self.config.get("actor_lr", 1e-3)
        critic_lr = self.config.get("critic_lr", 1e-3)
        self.batch_size = self.config.get("batch_size", 256)
        self.discount = self.config.get("discount", 0.9)
        sigma = self.config.get("sigma", 0.2)
        self.tau = self.config.get("tau", 0.001)
        self.seed = self.config.get("seed", 0)
        self.action_noise = self.config.get("action_noise", "No")
        self.critic_l2_reg = self.config.get("critic_l2_reg", 0.0)

        random.seed(self.seed)
        torch.manual_seed(self.seed)

        param_noise = self.action_noise == "Param"

        self.actor = Actor(actor_state_size, actor_action_size,
                           nodes=self.config["actor_nodes"],
                           seed=self.seed,
                           param_noise=param_noise).to(device)
        self.critic = Critic(critic_state_size, critic_action_size,
                             nodes=self.config["critic_nodes"],
                             seed=self.seed).to(device)
        self.targetActor = Actor(actor_state_size, actor_action_size,
                                 nodes=self.config["actor_nodes"],
                                 seed=self.seed,
                                 param_noise=param_noise).to(device)
        self.targetCritic = Critic(critic_state_size, critic_action_size,
                                   nodes=self.config["critic_nodes"],
                                   seed=self.seed).to(device)

        # save() stores the actor weights under "actor"; restore them when
        # loading from a file so the checkpoint is not silently discarded.
        # Done before the hard update so the target actor matches.
        if "actor" in data:
            self.actor.load_state_dict(data["actor"])

        # Start the target networks as exact copies of the online networks.
        self.hard_update(self.actor, self.targetActor)
        self.hard_update(self.critic, self.targetCritic)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr,
                                           weight_decay=self.critic_l2_reg)
        self.criticLoss = nn.MSELoss()

        # Any value other than "OU" / "Normal" (including "No" and "Param")
        # leaves action-space noise disabled.
        self.noise = NoNoise()
        if self.action_noise == "OU":
            self.noise = OUNoise(np.zeros(actor_action_size), sigma=sigma)
        elif self.action_noise == "Normal":
            self.noise = NormalActionNoise(np.zeros(actor_action_size),
                                           sigma=sigma)

        self.memory = Memory(memory_size, self.batch_size, self.seed)

    def hard_update(self, source, target):
        """Copy every parameter of ``source`` into ``target``."""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        theta_target = tau * theta_local + (1 - tau) * theta_target

        Args:
            local_model: PyTorch model the weights are copied from.
            target_model: PyTorch model the weights are copied to.
            tau (float): interpolation parameter.
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def act(self, state, add_noise=True):
        """Return actions for the given state under the current policy.

        A 1-D state is treated as a batch of one. The result is clipped to
        [-1, 1].
        """
        self.actor.resample()
        state = torch.FloatTensor(state).to(device)
        if len(state.size()) == 1:
            state = state.unsqueeze(0)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        if add_noise and self.noise:
            action += self.noise()
        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Store a transition and learn once a full batch is available."""
        self.memory.add((state, action, reward, next_state, done))
        if len(self.memory) >= self.batch_size:
            self.learn()

    def learn_critic(self, states, actions, rewards, next_states, dones,
                     actions_next):
        """Update the critic towards the one-step TD target.

        ``actions_next`` are the target-policy actions for ``next_states``;
        the target critic is soft-updated afterwards.
        """
        Q_targets_next = self.targetCritic(next_states, actions_next)
        # y_i = r + discount * Q'(s', a') * (1 - done)
        Q_targets = rewards + (self.discount * Q_targets_next * (1 - dones))
        # Targets are constants for the regression: cut them from the graph.
        Q_targets = Q_targets.detach()

        Q_expected = self.critic(states, actions)
        critic_loss = self.criticLoss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.soft_update(self.critic, self.targetCritic, self.tau)

    def learn_actor(self, states, actions_pred):
        """Update the actor to maximize the critic's value of its actions.

        ``actions_pred`` must be the actor's (differentiable) output for
        ``states``; the target actor is soft-updated afterwards.
        """
        actor_loss = -self.critic(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.actor, self.targetActor, self.tau)

    def learn(self):
        """Sample a batch from memory and update the critic, then the actor."""
        states, actions, rewards, next_states, dones = self.memory.sample()
        # The TD target is detached inside learn_critic, so no gradient is
        # needed through the target actor.
        with torch.no_grad():
            actions_next = self.targetActor(next_states)
        self.learn_critic(states, actions, rewards, next_states, dones,
                          actions_next)
        self.learn_actor(states, self.actor(states))

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def update(self, score=None):
        """Append ``score`` to the score history unless it is None.

        A score of 0 is a valid result and is recorded.
        """
        if score is not None:
            self.scores.append(score)

    def save(self, filename=None):
        """Save config, actor weights and scores.

        Args:
            filename: destination path; when empty, a name built from the
                class name and the current timestamp is used.

        The actor weights are additionally written to ``last_actor.pth`` in
        the current working directory.
        """
        data = {"config": self.config,
                "actor": self.actor.state_dict(),
                "scores": self.scores,}
        if not filename:
            filename = self.__class__.__name__ + '_' + datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + '.data'
        torch.save(data, filename)
        torch.save(self.actor.state_dict(), "last_actor.pth")
eval_env = gym.make(args.env) #eval_env.seed(args.seed+100) if logger.get_dir(): eval_env = bench.Monitor( eval_env, os.path.join(logger.get_dir(), "eval.monitor.json")) max_timesteps = train_env.spec.timestep_limit # set noise type current_noise_type = args.noise_type.strip() nb_actions = train_env.action_space.shape[0] if 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) action_noise.reset() if 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) action_noise.reset() else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) episode_rewards = [] if 'Sparse' in train_env.spec.id: sparse = True