def setup(self, nb_states, nb_actions):
    super(action_noise_DDPG, self).setup(nb_states, nb_actions)
    exploration_args = Singleton_arger()['exploration']
    self.noise_decay = exploration_args['noise_decay']
    self.noise_coef = 1
    self.rollout_actor = copy.deepcopy(self.actor)
    self.action_noise = OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(nb_actions),
        sigma=float(exploration_args['stddev']) * np.ones(nb_actions))
    if self.with_cuda:
        for net in (self.rollout_actor, ):
            if net is not None:
                net.cuda()
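# The OrnsteinUhlenbeckActionNoise class itself is never defined in these snippets, and its
# interface varies slightly between them (mu/sigma arrays with __call__() and reset() in most,
# .noise()/.sample()/.get_noise() or a mean/size constructor elsewhere). A minimal sketch of the
# most common variant, modelled on the OpenAI baselines implementation, could look like this:
import numpy as np


class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated noise: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)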
class action_noise_DDPG(DDPG):
    def __init__(self):
        super(action_noise_DDPG, self).__init__()

    def setup(self, nb_pos, nb_laser, nb_actions):
        super(action_noise_DDPG, self).setup(nb_pos, nb_laser, nb_actions)
        self.nb_pos = nb_pos
        self.nb_laser = nb_laser
        exploration_args = Singleton_arger()['exploration']
        self.noise_decay = exploration_args['noise_decay']
        self.noise_coef = 1
        self.rollout_actor = copy.deepcopy(self.actor)
        self.action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(nb_actions),
            sigma=float(exploration_args['stddev']) * np.ones(nb_actions))
        if self.with_cuda:
            for net in (self.rollout_actor, ):
                if net is not None:
                    net.cuda()

    def reset_noise(self):
        self.action_noise.reset()

    def before_epoch(self):
        self.apply_noise_decay()

    def apply_noise_decay(self):
        if self.noise_decay > 0:
            self.noise_coef = self.noise_decay * self.noise_coef / (
                self.noise_coef + self.noise_decay)

    def select_action(self, s_t, apply_noise):
        s_t = torch.tensor(np.vstack(s_t), dtype=torch.float32,
                           requires_grad=False).cuda()
        s_t = s_t.split([self.nb_pos, self.nb_laser], dim=1)
        #s_t = torch.tensor(s_t,dtype = torch.float32,requires_grad = False)
        #if self.with_cuda:
        #    s_t = s_t.cuda()
        with torch.no_grad():
            action = self.actor(s_t).cpu().numpy()
        if apply_noise:
            action += max(self.noise_coef, 0) * self.action_noise()
        action = np.clip(action, -1., 1.)
        return action
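# Quick sanity check of the decay rule in apply_noise_decay() above: the update
# coef <- decay * coef / (coef + decay) shrinks the noise scale harmonically, e.g. with
# noise_decay == 1 the coefficient after n epochs is 1 / (n + 1).
coef, decay = 1.0, 1.0
for epoch in range(3):
    coef = decay * coef / (coef + decay)
    print(round(coef, 3))  # 0.5, 0.333, 0.25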
def __init__(self, session, state_shape, action_shape, action_bound, learning_rate, tau, batch_size): """ Initialize actor and target networks and update methods. """ self.session = session self.state_shape = state_shape self.action_shape = action_shape self.action_bound = action_bound self.learning_rate = learning_rate self.tau = tau self.batch_size = batch_size # Initialize addititve Ornstein Uhlenbeck noise self.OU_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(self.action_shape)) # Initialize actor network self.phase = tf.placeholder(tf.bool, name='phase_act') self.inputs, self.out, self.scaled_out = \ self.create_actor_network(self.phase) self.network_params = tf.trainable_variables() # Initialize target actor network self.target_inputs, self.target_out, self.target_scaled_out = \ self.create_actor_network(self.phase, prefix='tar_') self.target_network_params = \ tf.trainable_variables()[len(self.network_params):] # Define target update op self.update_target_network_params = \ [self.target_network_params[i].assign( tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1.0 - self.tau)) for i in range(len(self.target_network_params))] # Define ops for getting necessary gradients self.action_gradient = \ tf.placeholder(tf.float32, [None, self.action_shape]) self.unnormalized_actor_gradients = tf.gradients( self.scaled_out, self.network_params, -self.action_gradient) self.actor_gradients = list( map(lambda x: tf.div(x, self.batch_size), self.unnormalized_actor_gradients)) # Define optimization op # update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) # with tf.control_dependencies(update_ops): self.optimize = tf.train.AdamOptimizer(self.learning_rate).\ apply_gradients(zip(self.actor_gradients, self.network_params)) self.num_trainable_vars = \ len(self.network_params) + len(self.target_network_params)
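# Hypothetical helper (not part of the snippet above) showing how the ops it defines are
# typically driven: feed a state batch plus the critic's dQ/da gradients of shape
# [batch, action_shape]; 'phase' switches batch-norm layers into training mode.
def train_actor(actor, state_batch, action_grads):
    actor.session.run(actor.optimize, feed_dict={
        actor.inputs: state_batch,
        actor.action_gradient: action_grads,
        actor.phase: True,
    })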
def __init__(self, experiment, batch_size): self._dummy_env = gym.make(experiment) self._sess = tf.Session() self._sum_writer = tf.summary.FileWriter('logs/', self._sess.graph) # Hardcoded for now self._dim_state = 25 self._dim_goal = 3 self._dim_action = self._dummy_env.action_space.shape[0] self._dim_env = 1 self._batch_size = batch_size # agent noise self._action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(self._dim_action)) self._actor = Actor(self._sess, self._dim_state, self._dim_goal, self._dim_action, self._dummy_env, TAU, LEARNING_RATE, self._batch_size) self._critic = Critic(self._sess, self._dim_state, self._dim_goal, self._dim_action, self._dim_env, self._dummy_env, TAU, LEARNING_RATE, self._actor.get_num_trainable_vars(), self._sum_writer) self._saver = tf.train.Saver(max_to_keep=None) self._sess.run(tf.global_variables_initializer()) self._actor.initialize_target_network() self._critic.initialize_target_network() # training monitoring self._success_rate = tf.Variable(0., name="success_rate") self._python_success_rate = tf.placeholder("float32", []) self._update_success_rate = self._success_rate.assign( self._python_success_rate) self._merged = tf.summary.scalar("successrate", self._update_success_rate)
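# Hypothetical helper (only the agent's attributes are taken from the snippet above) showing how
# the success-rate summary ops are usually driven after an evaluation round:
def log_success_rate(agent, success_rate, step):
    summary = agent._sess.run(
        agent._merged, feed_dict={agent._python_success_rate: success_rate})
    agent._sum_writer.add_summary(summary, step)
    agent._sum_writer.flush()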
max_timesteps = train_env.spec.timestep_limit

# set noise type
current_noise_type = args.noise_type.strip()
nb_actions = train_env.action_space.shape[0]
if 'normal' in current_noise_type:
    _, stddev = current_noise_type.split('_')
    action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                     sigma=float(stddev) * np.ones(nb_actions))
    action_noise.reset()
elif 'ou' in current_noise_type:
    _, stddev = current_noise_type.split('_')
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=float(stddev) * np.ones(nb_actions))
    action_noise.reset()
else:
    raise RuntimeError(
        'unknown noise type "{}"'.format(current_noise_type))

episode_rewards = []
if 'Sparse' in train_env.spec.id:
    sparse = True
    episode_successes = []
else:
    sparse = False

state_start, paths, neigh = None, None, None
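# One possible way to factor the 'name_stddev' parsing convention above into a helper;
# make_action_noise is a hypothetical name, not part of the original code.
import numpy as np


def make_action_noise(noise_spec, nb_actions):
    kind, stddev = noise_spec.strip().split('_')
    sigma = float(stddev) * np.ones(nb_actions)
    if kind == 'normal':
        return NormalActionNoise(mu=np.zeros(nb_actions), sigma=sigma)
    if kind == 'ou':
        return OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=sigma)
    raise RuntimeError('unknown noise type "{}"'.format(noise_spec))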
def learn( env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, nb_save_epochs=None, batch_size=64, # per MPI worker tau=0.01, action_range=(-250.0, 250.0), observation_range=(-5.0, 5.0), eval_env=None, load_path=None, save_dir=None, param_noise_adaption_interval=50, **network_kwargs): set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 memory = Memory(limit=int(1e6)) network_spec = [{ 'layer_type': 'dense', 'units': int(256), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int(128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int(1), 'activation': 'tanh', 'nodes_in': ['main'], 'nodes_out': ['main'] }] vnetwork_spec = [{ 'layer_type': 'concat', 'nodes_in': ['action_movement', 'observation_self'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int(256), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int(128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int(1), 'activation': '', 'nodes_in': ['main'], 'nodes_out': ['main'] }] network = DdpgPolicy(scope="ddpg", ob_space=env.observation_space, ac_space=env.action_space, network_spec=network_spec, v_network_spec=vnetwork_spec, stochastic=False, reuse=False, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns, observation_range=observation_range) target_network = DdpgPolicy(scope="target", ob_space=env.observation_space, ac_space=env.action_space, network_spec=network_spec, v_network_spec=vnetwork_spec, stochastic=False, reuse=False, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns, observation_range=observation_range) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: action_noise = dict() for k, v in env.action_space.spaces.items(): act_size = v.spaces[0].shape[-1] _, stddev = current_noise_type.split('_') action_noise[k] = NormalActionNoise(mu=np.zeros(act_size), sigma=float(stddev) * np.ones(act_size)) elif 'ou' in current_noise_type: action_noise = dict() for k, v in env.action_space.spaces.items(): act_size = v.spaces[0].shape[-1] _, stddev = current_noise_type.split('_') action_noise[k] = OrnsteinUhlenbeckActionNoise( mu=np.zeros(act_size), 
sigma=float(stddev) * np.ones(act_size)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) max_action = action_range[1] logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(network, target_network, memory, env.observation_space, env.action_space, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() saver = functools.partial(save_variables, sess=sess) loader = functools.partial(load_variables, sess=sess) if load_path != None: loader(load_path) # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = env.num_envs n_agents = obs['observation_self'].shape[0] episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch for k, v in action.items(): action[k] *= max_action nenvs_actions = [] for i in range(nenvs): nenv_action = { 'action_movement': action['action_movement'][i * n_agents:(i + 1) * n_agents] } nenvs_actions.append(nenv_action) new_obs, r, done, info = env.step( nenvs_actions ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. 
eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/return_history_std'] = np.std( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = np.array( [np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) if nb_save_epochs != None and (epoch + 1) % nb_save_epochs == 0: if save_dir == None: checkdir = osp.join(logger.get_dir(), 'checkpoints') else: checkdir = osp.join(save_dir, 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % epoch) print('Saving to', savepath) saver(savepath) return agent
class Model: def __init__(self, FLAGS): """ This class build the model that implements the deterministic gradient descent algorithm. :param FLAGS: TensorFlow flags which contain the values for hyperparameters """ self.FLAGS=FLAGS self.env = gym.make('Pendulum-v0') self.state_size = len(self.env.observation_space.sample()) self.num_episodes=1000 self.batch_size=64 self.exp_replay=ExperienceReplay(50000,1500, FLAGS) self.action_noise=OrnsteinUhlenbeckActionNoise(self.env,mu= 0.0, sigma=0.2, theta=.15, dt=1e-2, x0=None) self.actor_target=Actor(scope='target',target_network=None,env=self.env, flags=FLAGS) self.actor=Actor(scope='actor',target_network=self.actor_target,env=self.env, flags=FLAGS) self.critic_target=Critic(scope='target',target_network=None,env=self.env, flags=FLAGS) self.critic=Critic(scope='critic',target_network=self.critic_target,env=self.env, flags=FLAGS) init = tf.global_variables_initializer() self.session = tf.InteractiveSession() self.session.run(init) self.critic.set_session(self.session) self.actor.set_session(self.session) self.actor_target.set_session(self.session) self.critic_target.set_session(self.session) self.critic.init_target_network() self.actor.init_target_network() def train_networks(self): '''Training of the actor and critic networks ''' if len(self.exp_replay.experience['state']) < self.exp_replay.min_experience: return # pick random experience tupels from the expererience replay idx = np.random.choice(len(self.exp_replay.experience['state']), size=self.FLAGS.batch_size, replace=False) state=np.array([self.exp_replay.experience['state'][i] for i in idx]).reshape(self.FLAGS.batch_size,self.state_size) action=np.array([self.exp_replay.experience['action'][i] for i in idx]).reshape(self.FLAGS.batch_size,1) reward=[self.exp_replay.experience['reward'][i] for i in idx] next_state=np.array([self.exp_replay.experience['next_state'][i] for i in idx]).reshape(self.FLAGS.batch_size,self.state_size) dones=[self.exp_replay.experience['done'][i] for i in idx] #Train critic network next_actions=self.actor_target.get_action(next_state) q_next=self.critic.target_network.calculate_Q(next_state, next_actions) targets=np.array([r+self.FLAGS.gamma*q if not done else r for r, q, done in zip(reward,q_next,dones)]) self.critic.train(state, targets, action) #Train actor network current_actions=self.actor.get_action(state) q_gradient=self.critic.compute_gradients(state, current_actions) self.actor.train(state, q_gradient) self.actor.update_target_parameter() self.critic.update_target_parameter() def playEpisode(self,episode): '''Play an episode in the environment ''' #get initial state from the environment state=self.env.reset() state=state.reshape(1,self.state_size) done=False total_reward=0 while not done: #get action for an environment state action=self.actor.get_action(state)+self.action_noise.get_noise(episode) prev_state=state # get new-state, reward, done tuple state, reward, done, _ = self.env.step(action) state=state.reshape(1,self.state_size) #self.env.render(mode='rgb_array') total_reward=total_reward+reward # add <state, action, reward, next-state, done > tuple into the experience replay self.exp_replay.addExperience(prev_state, action, reward, state, done) # start the training self.train_networks() return total_reward def run_model(self): '''Main loop. 
        Runs the environment and trains the networks
        '''
        totalrewards = np.empty(self.num_episodes + 1)
        n_steps = 10

        for n in range(0, self.num_episodes + 1):
            total_reward = self.playEpisode(n)
            totalrewards[n] = total_reward

            if n > 0 and n % n_steps == 0:
                print("episodes: %i, avg_reward (last: %i episodes): %.2f"
                      % (n, n_steps, totalrewards[max(0, n - n_steps):(n + 1)].mean()))
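# Hypothetical entry point for the Model class above, assuming FLAGS is the usual
# tf.app.flags.FLAGS object carrying batch_size, gamma and the other referenced hyperparameters:
if __name__ == '__main__':
    model = Model(FLAGS)
    model.run_model()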
# Game environment
env = gym.make('MountainCarContinuous-v0')

# Actor
actorNet = Actor.Actor(env.observation_space.shape, env.action_space.shape, lr=wandb.config.lr_A)
actorNet_target = Actor.Actor(env.observation_space.shape, env.action_space.shape, lr=wandb.config.lr_A)

# Critic
criticNet = Critic.Critic(env.observation_space.shape, env.action_space.shape, lr=wandb.config.lr_C)
criticNet_target = Critic.Critic(env.observation_space.shape, env.action_space.shape, lr=wandb.config.lr_C)

# replay buffer
rpm = ReplayBuffer.ReplayBuffer(1000000)  # 1M history

noise = OrnsteinUhlenbeckActionNoise(mean=0.0, sigma=0.5, size=env.action_space.shape)


# (gradually) replace target network weights with online network weights
def replace_weights(tau=wandb.config.tau):
    theta_a, theta_c = actorNet.model.get_weights(), criticNet.model.get_weights()
    theta_a_targ, theta_c_targ = actorNet_target.model.get_weights(), criticNet_target.model.get_weights()

    # mixing factor tau: we gradually shift the weights...
    theta_a_targ = [theta_a[i] * tau + theta_a_targ[i] * (1 - tau) for i in range(len(theta_a))]
    theta_c_targ = [theta_c[i] * tau + theta_c_targ[i] * (1 - tau) for i in range(len(theta_c))]

    actorNet_target.model.set_weights(theta_a_targ)
    criticNet_target.model.set_weights(theta_c_targ)


def train(verbose=1, batch_size=wandb.config.batch_size, gamma=wandb.config.gamma):
    # if there are enough samples to learn from
def train(self): config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) config.gpu_options.allow_growth = True # Create global step and increment operation global_step_tensor = tf.Variable(0, trainable=False, name='global_step') increment_global_step = tf.assign_add(global_step_tensor, 1) # Create model saver saver = tf.train.Saver() sess = tf.Session(config=config) if not self.parameters['restore']: sess.run(tf.global_variables_initializer()) else: saver.restore(sess, tf.train.latest_checkpoint('./saves')) self.actor_critic.set_moving_to_target(sess) run_id = np.random.randint(10000) trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id), graph=sess.graph) # Get action noise action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(self.nA), sigma=float(self.parameters['sigma']) * np.ones(self.nA)) # Fill Replay Memory state = self.env.reset() fill_amount = 0 while fill_amount < self.parameters['replay_init_size']: action = self.env.action_space.sample() next_state, reward, done, _ = self.env.step(action) if done: state = self.env.reset() else: fill_amount += 1 self.memory.add(state, action, reward, done, next_state) state = next_state # Main Loop steps = 0 for i in range(self.parameters['num_epochs']): avg_epoch_rewards = 0 num_epochs = 1 for e in range(self.parameters['num_episodes']): state = self.env.reset() ep_reward = 0 # Perform rollout while True: noise = action_noise() action = self.actor_critic.pi(sess, state[None, ...]) action += noise action = np.clip(action, self.env.action_space.low[0], self.env.action_space.high[0]) assert action.shape == self.env.action_space.shape """ # UNCOMMENT TO PRINT ACTIONS a0 = tf.Summary(value=[tf.Summary.Value(tag="action_0", simple_value=action[0,0])]) trainwriter.add_summary(a0,steps) a1 = tf.Summary(value=[tf.Summary.Value(tag="action_1", simple_value=action[0,1])]) trainwriter.add_summary(a1,steps) a2 = tf.Summary(value=[tf.Summary.Value(tag="action_2", simple_value=action[0,2])]) trainwriter.add_summary(a2,steps) steps += 1 """ next_state, reward, done, _ = self.env.step(action) self.memory.add(state, action, reward, done, next_state) if self.parameters['render_train']: self.env.render() ep_reward += reward if done: reward_summary = tf.Summary(value=[ tf.Summary.Value(tag="ep_rewards", simple_value=ep_reward) ]) trainwriter.add_summary( reward_summary, i * self.parameters['num_episodes'] + e) action_noise.reset() break state = next_state avg_epoch_rewards = avg_epoch_rewards + ( ep_reward - avg_epoch_rewards) / num_epochs num_epochs += 1 # Perform train for t in range(self.parameters['num_train_steps']): s_state, s_action, s_reward, s_next_state, s_terminal = self.memory.sample( ) # Train actor critic model self.actor_critic.update(sess=sess, filewriter=trainwriter, state_batch=s_state, next_state_batch=s_next_state, action_batch=s_action, reward_batch=s_reward, done_batch=s_terminal) sess.run(increment_global_step) # Print out epoch stats here table_data = [['Epoch', 'Average Reward'], [ str(i) + "/" + str(self.parameters['num_epochs']), str(avg_epoch_rewards) ]] table = AsciiTable(table_data, "Training Run: " + str(run_id)) save_path = saver.save(sess, "./saves/model.ckpt") os.system('clear') print("Model saved in path: %s" % save_path + "\n" + table.table)
def train(self): config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) config.gpu_options.allow_growth = True # Create global step and increment operation global_step_tensor = tf.Variable(0, trainable=False, name='global_step') increment_global_step = tf.assign_add(global_step_tensor, 1) # Create model saver saver = tf.train.Saver(max_to_keep=None) sess = tf.Session(config=config) if not self.parameters['restore']: sess.run(tf.global_variables_initializer()) else: saver.restore(sess, tf.train.latest_checkpoint('./saves')) self.actor_critic.set_moving_to_target(sess) run_id = np.random.randint(10000) trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id), graph=sess.graph) # Get action noise action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(self.nA), sigma=float(self.parameters['sigma']) * np.ones(self.nA)) # Fill Replay Memory state = self.env.reset() fill_amount = 0 while fill_amount < self.parameters['replay_init_size']: action = self.env.action_space.sample() next_state, reward, done, _ = self.env.step(action) if done: state = self.env.reset() else: fill_amount += 1 self.memory.add(state, action, reward, done, next_state) state = next_state # Main Loop plots = {'critic_loss': [], 'actor_loss': [], 'episode_reward': []} plots_dir = './plots/' weights_dir = './weights/' graph_dir = './graph/' if not os.path.exists(plots_dir): os.makedirs(plots_dir) if not os.path.exists(weights_dir): os.makedirs(weights_dir) if not os.path.exists(graph_dir): os.makedirs(graph_dir) saver.export_meta_graph(graph_dir + self.parameters['env'] + '/graph.meta') #cumulative step counter cumu_step = 0 for i in range(self.parameters['num_epochs']): avg_epoch_rewards = 0 n_epochs = 1 for e in range(self.parameters['num_episodes']): state = self.env.reset() ep_reward = 0 ep_n_action = 0 # Perform rollout for _ in range(500): noise = action_noise() action = self.actor_critic.pi(sess, state[None, ...]) action += noise action = np.clip(action, self.env.action_space.low[0], self.env.action_space.high[0]) assert action.shape == self.env.action_space.shape next_state, reward, done, _ = self.env.step(action) # print(action) # print(next_state) # print(reward) self.memory.add(state, action, reward, done, next_state) if self.parameters['render_train']: self.env.render() ep_reward += reward ep_n_action += 1 cumu_step += 1 state = next_state # Perform train avg_critic_loss = 0.0 avg_actor_loss = 0.0 for t in range(self.parameters['num_train_steps']): s_state, s_action, s_reward, s_next_state, s_terminal = self.memory.sample( ) # Train actor critic model _, _, critic_loss, actor_loss = self.actor_critic.update( sess=sess, filewriter=trainwriter, state_batch=s_state, next_state_batch=s_next_state, action_batch=s_action, reward_batch=s_reward, done_batch=s_terminal) avg_critic_loss += critic_loss avg_actor_loss += actor_loss sess.run(increment_global_step) avg_critic_loss /= self.parameters['num_train_steps'] avg_actor_loss /= self.parameters['num_train_steps'] if done: reward_summary = tf.Summary(value=[ tf.Summary.Value(tag="ep_rewards", simple_value=ep_reward) ]) trainwriter.add_summary( reward_summary, i * self.parameters['num_episodes'] + e) action_noise.reset() break avg_epoch_rewards = avg_epoch_rewards + ( ep_reward - avg_epoch_rewards) / n_epochs n_epochs += 1 print('Epoch: {:d} | Reward: {:d} | Avg_Q_loss: {:.4f} | Avg_a_loss: {:.4f} | Episode: {:d} | Step: {:d} | Cumu Step: {:d}'\ .format(i+1, int(ep_reward), avg_critic_loss, avg_actor_loss, e+1, ep_n_action, cumu_step)) if e 
% 19 == 0:
                    save_path = saver.save(
                        sess,
                        weights_dir + self.parameters['env'] + '/model.ckpt',
                        global_step=i * e + 1)

                plots['episode_reward'].append(ep_reward)
                plots['critic_loss'].append(critic_loss)
                plots['actor_loss'].append(actor_loss)
                pickle.dump(
                    plots,
                    open(plots_dir + self.parameters['env'] + '_plot.pickle', 'wb'))
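# Hypothetical: reading back the training curves pickled above for offline inspection
# (the file name depends on whatever self.parameters['env'] was set to).
import pickle

with open('./plots/' + 'Pendulum-v0' + '_plot.pickle', 'rb') as f:  # env name is an assumption
    plots = pickle.load(f)
print(len(plots['episode_reward']), plots['episode_reward'][-5:])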
def main(args): with tf.Session() as sess: env = gym.make(args['env']) np.random.seed(int(args['random_seed'])) tf.set_random_seed(int(args['random_seed'])) env.seed(int(args['random_seed'])) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] action_bound = env.action_space.high # Ensure action bound is symmetric # assert (env.action_space.high == -env.action_space.low) actor = ActorNetwork(sess, state_dim, action_dim, action_bound, float(args['actor_lr']), float(args['tau']), int(args['minibatch_size'])) critic = CriticNetwork(sess, state_dim, action_dim, float(args['critic_lr']), float(args['tau']), float(args['gamma']), actor.get_num_trainable_vars()) actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim)) if args['train']: if not os.path.exists(args['save_dir']): os.makedirs(args['save_dir']) with open(os.path.join(args['save_dir'], 'config.json'), 'w') as f: json.dump(args, f, indent=2) train(sess, env, args, actor, critic, actor_noise) else: # ddpg = [] # indexes = [e for e in range(400) if e % 10 == 9] # indexes = [0] + indexes indexes = [399] num_test_tasks = 100 buckets = 1 successes = [] directory = args['to_pickle'] for index in indexes: # times = [] task_success = [] saver = tf.train.Saver() saver.restore( sess, "../final_models/multitask/fixed/{0}/model-{1}.ckpt". format(directory, index)) for _ in range(buckets): tasks = env.unwrapped.sample_tasks(num_test_tasks) # tasks = [{'goal': np.array([0., 0.])} for e in range(num_test_tasks)] success = 0 for task in tasks: s = env.reset_task(task) step = 0 d = False while not d: # env.render() action = actor.predict_target( np.reshape(s, (1, actor.s_dim)))[0] step += 1 s, r, d, _ = env.step(action) if r == 1: success += 1 # times.append(step) env.close() task_success.append(success / num_test_tasks) successes.append(task_success) # ddpg.append(times) # out = [successes, ddpg] env.close() if not os.path.exists('./pkls'): os.makedirs('./pkls') with open('./pkls/{0}.pkl'.format(args['save_dir']), 'wb') as f: pickle.dump(successes, f)
def main(args): params = '_delta_'+str(args['delta'])+\ '_wrapper_'+str(args['wrapper'])+\ '_hindsight_'+str(args['with_hindsight']) logdir = args['summary_dir'] final_dir = logdir+'/'+params+'/'+datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") logger_step = Logger(dir=final_dir+'/log_step',format_strs=['json', 'tensorboard']) logger_episode = Logger(dir=final_dir+'/log_episodes', format_strs=['stdout', 'json', 'tensorboard']) actor_lr = float(args['actor_lr']) tau = float(args['tau']) critic_lr = float(args['critic_lr']) gamma = float(args['gamma']) batch_size = int(args['minibatch_size']) eval_episodes = int(args['eval_episodes']) max_episode_steps = int(args['max_episode_steps']) max_steps = int(args['max_steps']) eval_freq = int(args['eval_freq']) train_env = gym.make(args['env']) test_env = gym.make(args['env']) if args['wrapper'] == 'NoGoal': env_wrapper = NoGoal() elif args['wrapper'] == 'RandomGoal': env_wrapper = RandomGoal() elif args['wrapper'] == 'HandCurri': env_wrapper = HandmadeCurriculum() else: print("Nooooooooooooooooooooo") state_dim = env_wrapper.state_shape[0] action_dim = env_wrapper.action_shape[0] action_bound = train_env.action_space.high # Ensure action bound is symmetric assert (train_env.action_space.high == -train_env.action_space.low) actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim)) # Initialize replay memory if args['with_hindsight']: memory = HerMemory(env_wrapper, with_reward=True, limit=int(1e6), strategy='last') else: memory = Memory(env_wrapper, with_reward=True, limit=int(1e6)) with tf.Session() as sess: if args['random_seed'] is not None: np.random.seed(int(args['random_seed'])) tf.set_random_seed(int(args['random_seed'])) train_env.seed(int(args['random_seed'])) test_env.seed(int(args['random_seed'])) actor = ActorNetwork(sess, state_dim, action_dim, action_bound, tau, actor_lr) critic = CriticNetwork(sess, state_dim, action_dim, gamma, tau, critic_lr) agent = DDPG_agent(sess, actor, actor_noise, critic, train_env, test_env, env_wrapper, memory, logger_step, logger_episode, batch_size, eval_episodes, max_episode_steps, max_steps, eval_freq) agent.run()
def train(self): with tf.Session(graph=self.graph) as sess: self._load_model(sess, self.params.load_model) self.total_episodes = self.params.total_episodes # Obtain an initial observation of the environment state = self.env.reset() state_input = state.reshape([1, self.params.input_dim]) for episode_number in xrange(self.params.total_episodes): done = False score = 0 while not done: if self.global_step > self.params.preTrainStep: # Value network update trainBatch = self.myBuffer.sample( self.params.batch_size) batch_state = np.array(trainBatch[0]).reshape( [self.params.batch_size, self.params.input_dim]) batch_actions = np.array(trainBatch[1]).reshape( [self.params.batch_size, self.params.num_actions]) batch_rewards = np.array(trainBatch[2]) batch_next_state = np.array(trainBatch[3]).reshape( [self.params.batch_size, self.params.input_dim]) batch_done = np.array(trainBatch[4]) end_multiplier = -(batch_done - 1) target_action = sess.run(self.target_actor.det_prob, feed_dict={ self.target_actor.input_x: batch_next_state }) target_action = np.array([[1, 0] if i == 0 else [0, 1] for i in target_action]) targetQ_all = sess.run(self.target_critic.Qout, feed_dict={ self.target_critic.input_x: batch_next_state, self.target_critic.actions: target_action }) nextQ = np.sum(np.multiply(targetQ_all, target_action), axis=-1) targetQ = batch_rewards + (self.params.gamma * nextQ * end_multiplier) pred_actions = sess.run( self.main_actor.det_prob, feed_dict={self.main_actor.input_x: batch_state}) pred_actions = np.array([[1, 0] if i == 0 else [0, 1] for i in pred_actions]) # Update the network with our target values. sess.run(self.main_critic.update_value_model, feed_dict={ self.main_critic.input_x: batch_state, self.main_critic.target_Q: targetQ, self.main_critic.actions: batch_actions }) self.update_Target(self.critic_targetOps, sess) gradients = sess.run(self.main_critic.action_grads, feed_dict={ self.main_critic.input_x: batch_state, self.main_critic.actions: pred_actions }) gradients = np.array(gradients).reshape( self.params.batch_size, self.params.num_actions) sess.run(self.main_actor.optimize, feed_dict={ self.main_actor.input_x: batch_state, self.main_actor.action_gradient: gradients }) self.update_Target(self.actor_targetOps, sess) # Make sure the observation is in a shape the network can handle. state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], [] actor_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(self.params.num_actions)) action = sess.run(self.main_actor.logits, feed_dict={ self.main_actor.input_x: state_input }) + actor_noise() action = np.argmax(action) # step the environment and get new measurements next_state, reward, done, _ = self.env.step(action) next_state = next_state.reshape([1, self.params.input_dim]) state_buffer.append(state_input) action_buffer.append([1, 0] if action == 0 else [0, 1]) reward_buffer.append( reward if not done or score == 299 else -100) #reward_buffer.append(reward) next_state_buffer.append(next_state) done_buffer.append(done) # move to next state state_input = next_state # add up reward self.reward_sum += reward score += reward self.global_step += 1 self.myBuffer.append(state_buffer, action_buffer, reward_buffer, next_state_buffer, done_buffer) if episode_number % self.params.update_freq == 0: self.running_reward = self.reward_sum if self.running_reward is None else self.running_reward * 0.99 + self.reward_sum * 0.01 print( 'Current Episode {} Average reward for episode {:.2f}. Total average reward {:.2f}.' 
.format(episode_number, self.reward_sum // self.params.update_freq, self.running_reward // self.params.update_freq)) self.reward_sum = 0 time.sleep(0.5) self.state = self.env.reset() state_input = self.state.reshape([1, self.params.input_dim]) self.global_step += 1
def run(env_id, seed, noise_type, layer_norm, evaluation, memory_limit, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    print("rank: %d" % (rank))

    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), "gym_eval"))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(","):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == "none":
            pass
        elif "adaptive-param" in current_noise_type:
            _, stddev = current_noise_type.split("_")
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif "normal" in current_noise_type:
            _, stddev = current_noise_type.split("_")
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif "ou" in current_noise_type.split("_"):
            _, stddev = current_noise_type.split("_")
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    print(type(memory_limit), memory_limit)
    memory = Memory(limit=int(memory_limit),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    seed = seed + 1000000 * rank
    logger.info("rank {} : seed={}, logdir={}".format(rank, seed,
                                                      logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()

    if option == 1:
        training.train(env=env,
                       eval_env=eval_env,
                       param_noise=param_noise,
                       action_noise=action_noise,
                       actor=actor,
                       critic=critic,
                       memory=memory,
                       **kwargs)
    elif option == 2:
        training_reward_shaping.train(env=env,
                                      eval_env=eval_env,
                                      param_noise=param_noise,
                                      action_noise=action_noise,
                                      actor=actor,
                                      critic=critic,
                                      memory=memory,
                                      **kwargs)

    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info("total runtime: {}s".format(time.time() - start_time))
class DDPGAgent(object): def __init__(self, state_size, action_size, action_bound_high, action_bound_low, imitation_data_path): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.state_size = state_size self.action_bound_high = torch.Tensor([action_bound_high]).to(device) self.action_bound_low = torch.Tensor([action_bound_low]).to(device) self.action_size = action_size self.buffer = EfficientReplayMemory(Parameters.BUFFER_SIZE, self.state_size, self.action_size) self.imitation_buffer = EfficientReplayMemory( Parameters.IMITATION_BUFFER_SIZE, self.state_size, self.action_size) self.imitation_buffer.load_memory(imitation_data_path) self.imitation_lambda = Parameters.IMITATION_LAMBDA # Actor self.policy_function = Policy(self.state_size, self.action_size, self.action_bound_high) self.policy_function_target = Policy(self.state_size, self.action_size, self.action_bound_high) self.policy_function_noisy = Policy(self.state_size, self.action_size, self.action_bound_high) self.policy_function_optim = Adam(self.policy_function.parameters(), lr=Parameters.ACTOR_LEARNING_RATE) self.imitation_optimizer = Adam(self.policy_function.parameters(), lr=self.imitation_lambda) # critic 1 (q-value) self.q_function = QFunction(self.state_size, self.action_size) self.q_function_target = QFunction(self.state_size, self.action_size) self.q_function_optim = Adam(self.q_function.parameters(), lr=Parameters.CRITIC_LEARNING_RATE) # Noise parameters self.action_noise = OrnsteinUhlenbeckActionNoise(self.action_size) self.desired_action_std = Parameters.DESIRED_ACTION_STD self.current_noise_std = Parameters.INITIAL_NOISE_STD self.coefficient = Parameters.ADAPT_COEFFICIENT # hyperparameters self.gamma = Parameters.GAMMA self.tau = Parameters.TAU self.hard_update_network(self.policy_function_target, self.policy_function) self.hard_update_network(self.q_function_target, self.q_function) def soft_update_network(self, target, source): for target_parameters, source_parameters in zip( target.parameters(), source.parameters()): target_parameters.data.copy_(target_parameters.data * (1.0 - self.tau) + source_parameters.data * self.tau) def hard_update_network(self, target, source): target.load_state_dict(source.state_dict()) def chose_action(self, state, exploration=True): self.policy_function.eval() if exploration and Parameters.PARAMETER_NOISE: action = self.policy_function_noisy((Variable(state))) else: action = self.policy_function((Variable(state))) self.policy_function.train() action = action.data if self.action_noise is not None and exploration: action += torch.Tensor(self.action_noise.sample()) return action.clamp(-1, 1) def store_buffer_transition(self, state, action, mask, next_state, reward): self.buffer.push(state, action, reward, next_state, mask) def smooth_l1_loss(self, input, target, beta=1, size_average=True): """ very similar to the smooth_l1_loss from pytorch because current pytorch variant is buggy """ n = torch.abs(input - target) cond = n < beta loss = torch.where(cond, 0.5 * n**2 / beta, n - 0.5 * beta) if size_average: return loss.mean() return loss.sum() def train(self): # sample batch and train state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.buffer.sample( Parameters.BATCH_SIZE) loss_imitation = 0 state_batch = Variable(state_batch) action_batch = Variable(action_batch) reward_batch = Variable(reward_batch) mask_batch = Variable(mask_batch) next_state_batch = Variable(next_state_batch) # train the critic (Q-function) next_actions = 
self.policy_function_target(next_state_batch) next_q_values = self.q_function_target(next_state_batch, next_actions) expected_q_values = reward_batch + (self.gamma * mask_batch * next_q_values) self.q_function_optim.zero_grad() predicted_q_values = self.q_function(state_batch, action_batch) #q_value_loss = F.smooth_l1_loss(predicted_q_values, expected_q_values) #q_value_loss = F.smooth_l1_loss(expected_q_values, predicted_q_values) q_value_loss = self.smooth_l1_loss(expected_q_values, predicted_q_values) #q_value_loss = (predicted_q_values - expected_q_values).pow(2).mean() q_value_loss.backward() self.q_function_optim.step() # train the policy self.policy_function_optim.zero_grad() q_value_prediction = self.q_function(state_batch, self.policy_function(state_batch)) # maximize the Q value for the chosen action policy_loss = -q_value_prediction policy_loss = policy_loss.mean() policy_loss.backward() self.policy_function_optim.step() if Parameters.USE_IMITATION_LEARNING: state_batch_imitation, action_batch_imitation, _, _, _ = self.imitation_buffer.sample( Parameters.IMITATION_BATCH_SIZE) action_batch_imitation = Variable(action_batch_imitation, requires_grad=True) state_batch_imitation = Variable(state_batch_imitation, requires_grad=True) predicted_actions = self.chose_action(state_batch_imitation, False) q_value_prediction = self.q_function(state_batch_imitation, predicted_actions) q_value_imitation = self.q_function(state_batch_imitation, action_batch_imitation) # Only try to learn the actions that were actually better than the current policy imitation_mask = (q_value_imitation > q_value_prediction) self.imitation_optimizer.zero_grad() loss_imitation = ((predicted_actions - action_batch_imitation) * imitation_mask.float()).pow(2).mean() loss_imitation.backward() self.imitation_optimizer.step() # update the target networks self.update_networks() return q_value_loss.item(), policy_loss.item() def update_networks(self): self.soft_update_network(self.policy_function_target, self.policy_function) self.soft_update_network(self.q_function_target, self.q_function) def noise_actor_parameters(self): """ Apply dynamic noise to the actor network PARAMETERS for better exploration. See: https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py https://blog.openai.com/better-exploration-with-parameter-noise/ """ self.hard_update_network(self.policy_function_noisy, self.policy_function) params = self.policy_function_noisy.state_dict() for key in params: if 'ln' in key: pass param = params[key] param += (torch.randn(param.shape) * self.current_noise_std).to( self.policy_function_noisy.device) def adapt_parameter_noise(self, states, actions): """ Adapt the rate of noise dynamically according to a specified target. 
        See:
        https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py
        https://blog.openai.com/better-exploration-with-parameter-noise/
        """
        states = torch.cat(states, 0)
        unperturbed_actions = self.chose_action(states, False)
        perturbed_actions = torch.cat(actions, 0)

        # calculate the euclidean distance between both sets of actions
        mean_diff = np.mean(np.square(
            (perturbed_actions - unperturbed_actions).numpy()), axis=0)
        distance = sqrt(np.mean(mean_diff))

        # adapt the standard deviation of the parameter noise
        if distance > self.desired_action_std:
            self.current_noise_std /= self.coefficient
        else:
            self.current_noise_std *= self.coefficient

    def save_models(self, path="./"):
        torch.save(self.policy_function.state_dict(), path + "actor.pt")
        torch.save(self.q_function.state_dict(), path + "critic.pt")
        print("Models saved successfully")

    def load_models(self, path="./"):
        if isfile(path + "actor.pt"):
            self.policy_function.load_state_dict(torch.load(path + "actor.pt"))
            self.q_function.load_state_dict(torch.load(path + "critic.pt"))
            self.policy_function_target.load_state_dict(
                self.policy_function.state_dict())
            self.q_function_target.load_state_dict(
                self.q_function.state_dict())
            print("Models loaded successfully")
        else:
            print("No model to load")
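# Hypothetical interaction loop for the DDPGAgent above (Gym-style env assumed; tensor shapes
# and the len(agent.buffer) check are assumptions, not part of the original class):
state = torch.Tensor([env.reset()])
for step in range(1000):
    action = agent.chose_action(state)                      # OU / parameter noise applied
    next_obs, reward, done, _ = env.step(action.numpy()[0])
    next_state = torch.Tensor([next_obs])
    mask = torch.Tensor([[0.0 if done else 1.0]])           # 0 stops bootstrapping in train()
    agent.store_buffer_transition(state, action, mask, next_state, torch.Tensor([[reward]]))
    if len(agent.buffer) > Parameters.BATCH_SIZE:
        q_loss, policy_loss = agent.train()
    state = torch.Tensor([env.reset()]) if done else next_state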
def __init__(self, session, state_shape, action_shape, action_bound, learning_rate, tau, loss_mask=True): """ Initialize actor and target networks and update methods. """ self.session = session self.state_shape = state_shape self.action_shape = action_shape self.action_bound = action_bound self.learning_rate = learning_rate self.tau = tau self.hidden_1_size = 400 self.hidden_2_size = 300 self.batch_size = tf.placeholder(tf.int32, shape=[], name='batch_act') self.trace_length = tf.placeholder(tf.int32, name='trace_act') self.phase = tf.placeholder(tf.bool, name='phase_act') # Initialize addititve Ornstein Uhlenbeck noise self.OU_noise = \ OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_shape)) # Initialize actor network self.inputs, self.out, self.scaled_out, self.lstm_state, \ self.lstm_init_state = self.create_actor_network() self.network_params = tf.trainable_variables() # Initialize target actor network self.target_inputs, self.target_out, self.target_scaled_out, \ self.target_lstm_state, self.target_lstm_init_state = \ self.create_actor_network(prefix='tar_') self.target_network_params = \ tf.trainable_variables()[len(self.network_params):] # Define target update op self.update_target_network_params = \ [self.target_network_params[i].assign( tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1.0 - self.tau)) for i in range(len(self.target_network_params))] # Define ops for getting necessary gradients self.action_gradient = \ tf.placeholder(tf.float32, [None, self.action_shape]) if loss_mask: # Mask first half of losses for each trace per Lample & Charlot 2016 self.maskA = tf.zeros([self.batch_size, self.trace_length // 2]) self.maskB = tf.ones([self.batch_size, self.trace_length // 2]) self.mask = tf.concat([self.maskA, self.maskB], 1) self.mask = tf.reshape(self.mask, [-1]) self.action_gradient_adjusted = self.action_gradient * self.mask else: self.action_gradient_adjusted = self.action_gradient self.unnormalized_actor_gradients = tf.gradients( self.scaled_out, self.network_params, -self.action_gradient_adjusted) self.actor_gradients = list( map(lambda x: tf.div(x, tf.cast(self.batch_size, tf.float32)), self.unnormalized_actor_gradients)) # Define optimization op # TODO: Only update BN params when needed instead of all the time! # update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) # with tf.control_dependencies(update_ops): self.optimize = tf.train.AdamOptimizer(self.learning_rate).\ apply_gradients(zip(self.actor_gradients, self.network_params)) self.num_trainable_vars = \ len(self.network_params) + len(self.target_network_params)
class DDPG(object): def __init__(self, gamma, tau,num_inputs, env,device, results_path=None): self.gamma = gamma self.tau = tau self.min_action,self.max_action = env.action_range() self.device = device self.num_actions = env.action_space() self.noise_stddev = 0.3 self.results_path = results_path self.checkpoint_path = os.path.join(self.results_path, 'checkpoint/') os.makedirs(self.checkpoint_path, exist_ok=True) # Define the actor self.actor = Actor(num_inputs, self.num_actions).to(device) self.actor_target = Actor(num_inputs, self.num_actions).to(device) # Define the critic self.critic = Critic(num_inputs, self.num_actions).to(device) self.critic_target = Critic(num_inputs, self.num_actions).to(device) # Define the optimizers for both networks self.actor_optimizer = Adam(self.actor.parameters(), lr=1e-4 ) # optimizer for the actor network self.critic_optimizer = Adam(self.critic.parameters(), lr=1e-4, weight_decay=0.002) # optimizer for the critic network self.hard_swap() self.ou_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.num_actions), sigma=float(self.noise_stddev) * np.ones(self.num_actions)) self.ou_noise.reset() def eval_mode(self): self.actor.eval() self.actor_target.eval() self.critic_target.eval() self.critic.eval() def train_mode(self): self.actor.train() self.actor_target.train() self.critic_target.train() self.critic.train() def get_action(self, state, episode, action_noise=True): x = state.to(self.device) # Get the continous action value to perform in the env self.actor.eval() # Sets the actor in evaluation mode mu = self.actor(x) self.actor.train() # Sets the actor in training mode mu = mu.data # During training we add noise for exploration if action_noise: noise = torch.Tensor(self.ou_noise.noise()).to(self.device) * 1.0/(1.0 + 0.1*episode) noise = noise.clamp(0,0.1) mu = mu + noise # Add exploration noise ε ~ p(ε) to the action. 
            # Do not use OU noise
            # (https://spinningup.openai.com/en/latest/algorithms/ddpg.html)

        # Clip the output according to the action space of the env
        mu = mu.clamp(self.min_action, self.max_action)
        return mu

    def update_params(self, batch):
        # Get tensors from the batch
        state_batch = torch.cat(batch.state).to(self.device)
        action_batch = torch.cat(batch.action).to(self.device)
        reward_batch = torch.cat(batch.reward).to(self.device)
        done_batch = torch.cat(batch.done).to(self.device)
        next_state_batch = torch.cat(batch.next_state).to(self.device)

        # Get the actions and the state values to compute the targets
        next_action_batch = self.actor_target(next_state_batch)
        next_state_action_values = self.critic_target(next_state_batch,
                                                      next_action_batch.detach())

        # Compute the target
        reward_batch = reward_batch.unsqueeze(1)
        done_batch = done_batch.unsqueeze(1)
        expected_values = reward_batch + (1.0 - done_batch) * self.gamma * next_state_action_values

        # Update the critic network
        self.critic_optimizer.zero_grad()
        state_action_batch = self.critic(state_batch, action_batch)
        value_loss = F.mse_loss(state_action_batch, expected_values.detach())
        value_loss.backward()
        self.critic_optimizer.step()

        # Update the actor network
        self.actor_optimizer.zero_grad()
        policy_loss = -self.critic(state_batch, self.actor(state_batch))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        for param in self.actor.parameters():
            param.grad.data.clamp_(-1, 1)
        self.actor_optimizer.step()

        # Update the target networks
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.item(), policy_loss.item()

    def hard_swap(self):
        # Make sure both targets start with the same weights as their online networks
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

    def store_model(self):
        print("Storing model at: ", self.checkpoint_path)
        checkpoint = {
            'actor': self.actor.state_dict(),
            'actor_optim': self.actor_optimizer.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_optim': self.critic_optimizer.state_dict()
        }
        torch.save(checkpoint,
                   os.path.join(self.checkpoint_path, 'checkpoint.pth'))

    def load_model(self):
        files = os.listdir(self.checkpoint_path)
        if files:
            print("Loading models checkpoints!")
            model_dicts = torch.load(
                os.path.join(self.checkpoint_path, 'checkpoint.pth'),
                map_location=self.device)
            self.actor.load_state_dict(model_dicts['actor'])
            self.actor_optimizer.load_state_dict(model_dicts['actor_optim'])
            self.critic.load_state_dict(model_dicts['critic'])
            self.critic_optimizer.load_state_dict(model_dicts['critic_optim'])
        else:
            print("Checkpoints not found!")
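# The soft_update / hard_update helpers called in update_params() and hard_swap() are not shown
# in this snippet; a minimal sketch consistent with the (target, source) argument order used
# above and with soft_update_network() elsewhere in this collection:
import torch


def hard_update(target, source):
    """Copy the source parameters into the target verbatim."""
    target.load_state_dict(source.state_dict())


def soft_update(target, source, tau):
    """Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)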
def standalone_headless_isolated(pq, cq, plock):
    # Locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', pq, cq)
    try:
        import traceback
        from osim.env import RunEnv
        e = RunEnv(visualize=False, max_obstacles=0)
        # bind_alternative_pelvis_judgement(e)
        # use_alternative_episode_length(e)
    except Exception:
        print('error on start of standalone')
        traceback.print_exc()
        plock.release()
        return
    else:
        plock.release()

    def report(err):
        # A way to report errors (since you can't just throw them over a pipe).
        # err should be a string.
        print('(standalone) got error!!!')
        cq.put(('error', err))

    def floatify(n_p):
        return [float(n_p[i]) for i in range(len(n_p))]

    try:
        previous_o = None
        nb_actions = e.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                    sigma=0.3 * np.ones(nb_actions))
        while True:
            msg = pq.get()
            # Messages should be tuples; msg[0] should be a command string.
            # isinstance is dangerous, commented out:
            # if not isinstance(msg, tuple):
            #     raise Exception('pipe message received by headless is not a tuple')
            if msg[0] == 'reset':  # or (previous_o is None and msg[0] == 'step')
                o = e.reset(difficulty=0)
                o = floatify(o)
                # generate_observation() and get_observation_space() are
                # project-local helpers that are not shown in this listing.
                o_processed = generate_observation(o, o)
                previous_o = o
                cq.put(o_processed)
            elif msg[0] == 'step':
                actions = msg[1]
                noisy_action = np.array(actions) + action_noise()
                o, r, d, i = e.step(noisy_action)
                o = floatify(o)  # floatify the observation
                o_processed = generate_observation(o, previous_o)
                previous_o = o
                cq.put((o_processed, r, d, i))
            elif msg[0] == 'action_space':
                a_s = e.action_space
                r_a_s = (a_s.low.tolist(), a_s.high.tolist(), a_s.shape)
                cq.put(r_a_s)
            elif msg[0] == 'observation_space':
                o_s = get_observation_space()
                r_o_s = (o_s['low'].tolist(), o_s['high'].tolist(), o_s['shape'])
                cq.put(r_o_s)
            else:
                cq.close()
                pq.close()
                del e
                break
    except Exception as exc:
        traceback.print_exc()
        report(str(exc))
        return
    # end process
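# Every fragment in this listing draws exploration noise from
# OrnsteinUhlenbeckActionNoise, but the class itself is never reproduced.
# Below is a minimal sketch consistent with the constructor calls above
# (mu, sigma) and with both call styles used here (noise() and __call__());
# the theta/dt defaults are assumptions and may differ in the original code.
import numpy as np


class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated noise: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # One Euler step of the OU process; the state is kept between calls.
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    # Some of the agents above call .noise() instead of calling the instance.
    noise = __call__

    def reset(self):
        # Restart the process at x0 (or at zero if no x0 is given).
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)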
def learn(
        env,
        seed=None,
        total_timesteps=1e6,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_rollout_steps=100,
        max_ep_len=250,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        start_steps=10000,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        nb_eval_steps=100,
        nb_log_steps=None,
        nb_save_steps=None,
        batch_size=64,  # per MPI worker
        polyak=0.01,
        action_range=(-250.0, 250.0),
        observation_range=(-5.0, 5.0),
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        eval_env=None,
        load_path=None,
        save_dir=None,
        **network_kwargs):

    set_global_seeds(seed)

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    memory = Memory(limit=int(1e6))

    network_spec = [
        {'layer_type': 'dense', 'units': 256, 'activation': 'relu',
         'nodes_in': ['main'], 'nodes_out': ['main']},
        {'layer_type': 'dense', 'units': 128, 'activation': 'relu',
         'nodes_in': ['main'], 'nodes_out': ['main']},
        {'layer_type': 'dense', 'units': 1, 'activation': 'tanh',
         'nodes_in': ['main'], 'nodes_out': ['main']},
    ]

    vnetwork_spec = [
        {'layer_type': 'concat',
         'nodes_in': ['action_movement', 'observation_self'], 'nodes_out': ['main']},
        {'layer_type': 'dense', 'units': 256, 'activation': 'relu',
         'nodes_in': ['main'], 'nodes_out': ['main']},
        {'layer_type': 'dense', 'units': 128, 'activation': 'relu',
         'nodes_in': ['main'], 'nodes_out': ['main']},
        {'layer_type': 'dense', 'units': 1, 'activation': '',
         'nodes_in': ['main'], 'nodes_out': ['main']},
    ]

    network = Td3Policy(scope="td3",
                        ob_space=env.observation_space,
                        ac_space=env.action_space,
                        network_spec=network_spec,
                        v_network_spec=vnetwork_spec,
                        stochastic=False,
                        reuse=False,
                        build_act=True,
                        trainable_vars=None,
                        not_trainable_vars=None,
                        gaussian_fixed_var=False,
                        weight_decay=0.0,
                        ema_beta=0.99999,
                        normalize_observations=normalize_observations,
                        normalize_returns=normalize_returns,
                        observation_range=observation_range,
                        action_range=action_range,
                        target_noise=target_noise,
                        noise_clip=noise_clip)

    target_network = Td3Policy(scope="target",
                               ob_space=env.observation_space,
                               ac_space=env.action_space,
                               network_spec=network_spec,
                               v_network_spec=vnetwork_spec,
                               stochastic=False,
                               reuse=False,
                               build_act=True,
                               trainable_vars=None,
                               not_trainable_vars=None,
                               gaussian_fixed_var=False,
                               weight_decay=0.0,
                               ema_beta=0.99999,
                               normalize_observations=normalize_observations,
                               normalize_returns=normalize_returns,
                               observation_range=observation_range,
                               action_range=action_range,
                               target_noise=target_noise,
                               noise_clip=noise_clip,
                               isTarget=True)

    # Parse the noise_type string, e.g. 'adaptive-param_0.2' or 'ou_0.3,normal_0.1'.
    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = NormalActionNoise(
                        mu=np.zeros(act_size),
                        sigma=float(stddev) * np.ones(act_size))
            elif 'ou' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(act_size),
                        sigma=float(stddev) * np.ones(act_size))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = action_range[1]
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = TD3(env, network, target_network, memory, env.action_space,
                env.observation_space,
                steps_per_epoch=nb_rollout_steps,
                epochs=nb_epochs,
                gamma=gamma,
                polyak=polyak,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                batch_size=batch_size,
                start_steps=start_steps,
                action_noise=action_noise,
                target_noise=target_noise,
                noise_clip=noise_clip,
                policy_delay=policy_delay)

    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    sess = U.get_session()
    saver = functools.partial(save_variables, sess=sess)
    loader = functools.partial(load_variables, sess=sess)
    if load_path is not None:
        loader(load_path)

    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()
    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = env.num_envs
    n_agents = obs['observation_self'].shape[0]

    episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar
    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    for t in range(int(total_timesteps)):
        # Until start_steps have elapsed, randomly sample actions from a uniform
        # distribution for better exploration. Afterwards, use the learned policy
        # (with some noise, via act_noise).
        if t > start_steps:
            action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
            nenvs_actions = []
            for i in range(nenvs):
                nenv_action = {
                    'action_movement':
                    action['action_movement'][i * n_agents:(i + 1) * n_agents]
                }
                nenvs_actions.append(nenv_action)
        else:
            action, q = env.action_space.sample(), None
            nenvs_actions = []
            for i in range(nenvs):
                nenv_action = {
                    'action_movement':
                    action['action_movement'][i * n_agents:(i + 1) * n_agents][0]
                }
                nenvs_actions.append(nenv_action)

        new_obs, r, done, info = env.step(nenvs_actions)
        episode_reward += r
        episode_step += 1

        # Ignore the done signal if it comes from hitting the time horizon.
        for d in range(len(done)):
            done[d] = False if episode_step[d] == max_ep_len else done[d]

        epoch_actions.append(action)
        epoch_qs.append(q)
        # The batched data will be unrolled in memory.py's append.
        agent.store_transition(obs, action, r, new_obs, done)

        obs = new_obs

        for d in range(len(done)):
            if done[d]:
                # Episode done.
                epoch_episode_rewards.append(episode_reward[d])
                episode_rewards_history.append(episode_reward[d])
                epoch_episode_steps.append(episode_step[d])
                episode_reward[d] = 0.
                episode_step[d] = 0
                epoch_episodes += 1
                episodes += 1
                if nenvs == 1:
                    agent.reset()

        episode_actor_losses = []
        episode_critic_losses = []
        episode_critic = []
        episode_critic_twin = []

        # NOTE: `d` here is the last index left over from the loop above, not a done flag.
        if d or (episode_step[0] == max_ep_len):
            # Perform all TD3 updates at the end of the trajectory (in accordance
            # with the source code of TD3 published by the original authors).
            for j in range(episode_step[0]):
                critic_loss, critic, critic_twin, actor_loss = agent.train(episode_step[0])
                episode_critic_losses.append(critic_loss)
                episode_critic.append(critic)
                episode_critic_twin.append(critic_twin)
                if actor_loss is not None:
                    episode_actor_losses.append(actor_loss)

            obs, r, done = env.reset(), 0, False
            episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32)
            episode_step = np.zeros(nenvs, dtype=int)

        if nb_log_steps is not None and (t + 1) % nb_log_steps == 0:
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/return_history_std'] = np.std(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['train/loss_actor'] = np.mean(episode_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(episode_critic_losses)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()])

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])

            if rank == 0:
                logger.dump_tabular()
            logger.info('')

            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

        if nb_save_steps is not None and (t + 1) % nb_save_steps == 0:
            if save_dir is None:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
            else:
                checkdir = osp.join(save_dir, 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % t)
            print('Saving to', savepath)
            saver(savepath)

    return agent
def __init__(self, env, args):
    ob_space = env.observation_space
    goal_dim = env.goal_dim
    ob_dim = ob_space.shape[0]
    self.ob_dim = ob_dim
    self.ac_dim = ac_dim = 7
    self.goal_dim = goal_dim
    self.num_iters = args.num_iters
    self.random_prob = args.random_prob
    self.tau = args.tau
    self.reward_scale = args.reward_scale
    self.gamma = args.gamma

    self.log_interval = args.log_interval
    self.save_interval = args.save_interval
    self.rollout_steps = args.rollout_steps

    self.env = env
    self.batch_size = args.batch_size
    self.train_steps = args.train_steps
    self.closest_dist = np.inf
    self.warmup_iter = args.warmup_iter
    self.max_grad_norm = args.max_grad_norm
    self.use_her = args.her
    self.k_future = args.k_future

    self.model_dir = os.path.join(args.save_dir, 'model')
    self.pretrain_dir = args.pretrain_dir
    os.makedirs(self.model_dir, exist_ok=True)
    self.global_step = 0

    self.actor = Actor(ob_dim=ob_dim,
                       act_dim=ac_dim,
                       hid1_dim=args.hid1_dim,
                       hid2_dim=args.hid2_dim,
                       hid3_dim=args.hid3_dim,
                       init_method=args.init_method)
    self.critic = Critic(ob_dim=ob_dim,
                         act_dim=ac_dim,
                         hid1_dim=args.hid1_dim,
                         hid2_dim=args.hid2_dim,
                         hid3_dim=args.hid3_dim,
                         init_method=args.init_method)

    if args.resume or args.test or args.pretrain_dir is not None:
        self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)

    if not args.test:
        self.actor_target = Actor(ob_dim=ob_dim,
                                  act_dim=ac_dim,
                                  hid1_dim=args.hid1_dim,
                                  hid2_dim=args.hid2_dim,
                                  hid3_dim=args.hid3_dim,
                                  init_method=args.init_method)
        self.critic_target = Critic(ob_dim=ob_dim,
                                    act_dim=ac_dim,
                                    hid1_dim=args.hid1_dim,
                                    hid2_dim=args.hid2_dim,
                                    hid3_dim=args.hid3_dim,
                                    init_method=args.init_method)
        self.actor_optim = self.construct_optim(self.actor, lr=args.actor_lr)
        cri_w_decay = args.critic_weight_decay
        self.critic_optim = self.construct_optim(self.critic,
                                                 lr=args.critic_lr,
                                                 weight_decay=cri_w_decay)
        self.hard_update(self.actor_target, self.actor)
        self.hard_update(self.critic_target, self.critic)
        self.actor_target.eval()
        self.critic_target.eval()

    # Exploration noise: OU, uniform, or Gaussian, selected via args.noise_type.
    if args.noise_type == 'ou_noise':
        mu = np.zeros(ac_dim)
        sigma = float(args.ou_noise_std) * np.ones(ac_dim)
        self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu, sigma=sigma)
    elif args.noise_type == 'uniform':
        low_limit = args.uniform_noise_low
        high_limit = args.uniform_noise_high
        dec_step = args.max_noise_dec_step
        self.action_noise = UniformNoise(low_limit=low_limit,
                                         high_limit=high_limit,
                                         dec_step=dec_step)
    elif args.noise_type == 'gaussian':
        mu = np.zeros(ac_dim)
        sigma = args.normal_noise_std * np.ones(ac_dim)
        self.action_noise = NormalActionNoise(mu=mu, sigma=sigma)

    self.memory = Memory(limit=int(args.memory_limit),
                         action_shape=(int(ac_dim), ),
                         observation_shape=(int(ob_dim), ))
    self.critic_loss = nn.MSELoss()

    self.ob_norm = args.ob_norm
    if self.ob_norm:
        self.obs_oms = OnlineMeanStd(shape=(1, ob_dim))
    else:
        self.obs_oms = None
    self.cuda()
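# None of the fragments above show how this last agent turns self.actor plus
# self.action_noise into an executed action. The method below is a hedged sketch
# consistent with the attributes defined in the __init__ above; the method name,
# the obs_oms.normalize() call, and the [-1, 1] clipping range are assumptions,
# not taken from the original source.
import numpy as np
import torch


def select_action(self, ob, add_noise=True):
    # Normalize the observation if an online mean/std tracker is configured.
    ob = np.asarray(ob, dtype=np.float32).reshape(1, -1)
    if self.ob_norm and self.obs_oms is not None:
        ob = self.obs_oms.normalize(ob)  # assumed OnlineMeanStd API

    self.actor.eval()
    with torch.no_grad():
        action = self.actor(torch.from_numpy(ob).float().cuda()).cpu().numpy().flatten()
    self.actor.train()

    if add_noise and self.action_noise is not None:
        # OU, uniform, or Gaussian noise, depending on args.noise_type.
        action = action + self.action_noise()

    return np.clip(action, -1.0, 1.0)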