Example #1
class I3QLearner():
    def __init__(self, num_features, num_actions, timestep, action_space, scope):
        self.scope = scope
        self._lr = 0.5
        self.discount = 1.
        self.replay_buffer = ReplayBuffer(int(1e4))

        with tf.variable_scope(self.scope):
            self.act_trajectory = tf.placeholder(tf.float32, shape=(None, timestep, action_space))
            self.target = tf.placeholder(tf.float32, shape=(None,))
            self.act = tf.placeholder(tf.int32, shape=(None,))

            self.tau = lstm_model(self.act_trajectory, num_actions, scope="tau_model_{}".format(scope))
            self.q_input = self.tau
            # train network
            self.q = mlp_model(self.q_input, num_actions, scope="q_model_{}".format(scope))
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_model_{}".format(scope)))
            # target network
            self.target_q = mlp_model(self.q_input, num_actions, scope="target_q_model_{}".format(scope))
            target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_model_{}".format(scope)))

            # take action
            self.softmax = tf.nn.softmax(self.target_q)
            self.pred = tf.argmax(self.softmax, axis = 1)

            # calculate the TD loss
            self.q_t_selected = tf.reduce_sum(self.q * tf.one_hot(self.act, num_actions), 1)
            # NOTE: the bootstrapped target is supplied externally through the `target`
            # placeholder, so these two values are currently unused.
            q_tp1_best = tf.reduce_max(self.q, 1)
            q_tp1_best_masked = q_tp1_best
            td_error = self.q_t_selected - tf.stop_gradient(self.target)
            self.errors = U.huber_loss(td_error)
            self.q_opt_op = tf.train.AdamOptimizer(self._lr).minimize(self.errors, var_list=q_func_vars)

            self.tau_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.tau, labels=self.act))
            self.tau_opt_op = tf.train.AdamOptimizer(self._lr).minimize(self.tau_loss)

            self.get_pred = U.function(inputs=[self.act_trajectory], outputs=[self.softmax])
            self.train_q = U.function(inputs=[self.act_trajectory, self.target, self.act], outputs=[self.errors, self.q], updates=[self.q_opt_op])
            self.train_tau = U.function(inputs=[self.act, self.act_trajectory], outputs=[self.tau_loss], updates=[self.tau_opt_op])
            self.update_model = make_update_exp(q_func_vars, target_q_func_vars)

    def experience(self, action1, act_tra1, reward1):
        self.replay_buffer.add(action1, act_tra1, reward1)

    # Exploration policy: epsilon-greedy over the softmax action distribution
    # (Boltzmann sampling with temperature 1 is left commented out below).
    def get_act(self, act_trajectory):

        acpd = self.get_pred(act_trajectory)[0][0]  # action probability distribution from softmax(target Q)
        # action = np.random.choice([0,1], p = acpd)
        action = epsilon_greedy(acpd, 0.1)
        return action

    def supervise_tau(self, a_next, action_trajectory):

        loss = self.train_tau(a_next, action_trajectory)[0]
        return loss

    def update_target(self):
        self.update_model()

    def learn(self, batch_size):

        replay_sample_index = self.replay_buffer.make_index(batch_size)
        act, act_tra, reward = self.replay_buffer.sample_index(replay_sample_index)
        loss, q = self.train_q(act_tra, reward, act)
        return loss, q
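
The snippet below is a minimal usage sketch for `I3QLearner`, not part of the original example. It assumes the helpers the class depends on (`lstm_model`, `mlp_model`, `ReplayBuffer`, `epsilon_greedy`, `make_update_exp`, and a baselines-style `U` providing `single_threaded_session` and `initialize`) are importable from the surrounding project; the reward signal and the trajectory bookkeeping are illustrative placeholders.

```python
# Hypothetical wiring around I3QLearner; the session helpers and the dummy
# reward below are assumptions, not part of the example above.
import numpy as np

learner = I3QLearner(num_features=2, num_actions=2, timestep=5,
                     action_space=2, scope="agent_0")

with U.single_threaded_session():
    U.initialize()
    act_traj = np.zeros((1, 5, 2), dtype=np.float32)    # rolling one-hot action history
    for step in range(10000):
        action = learner.get_act(act_traj)              # epsilon-greedy over softmax(target Q)
        reward = float(action == 0)                     # dummy reward, for illustration only
        learner.experience(action, act_traj[0], reward)
        act_traj = np.roll(act_traj, -1, axis=1)
        act_traj[0, -1] = np.eye(2)[action]             # append the latest action to the trajectory
        if step > 1000 and step % 100 == 0:
            loss, q = learner.learn(batch_size=32)      # TD update on the train Q network
            learner.update_target()                     # sync the target Q network
```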
Example #2
class Agent:

    def __init__(self, pos, actor, critic, actor_target, critic_target, train_mode, discrete_action, args,
                 alg_mode='MADDPG'):

        self.pos = pos
        self.BATCH_SIZE = args.batch_size
        self.GAMMA = args.GAMMA
        self.args = args
        self.train_mode = train_mode
        self.discrete_action = discrete_action
        self.algorithm = alg_mode

        self.critic = critic
        self.critic_target = critic_target

        self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(5))  # exploration noise over a 5-dimensional action space

        self.actor = actor
        self.actor_target = actor_target

        self.actor_target.hard_copy(actor)
        self.critic_target.hard_copy(critic)

        self.replay_buffer = ReplayBuffer(int(1e6))
        self.max_replay_buffer_len = self.BATCH_SIZE * 25

    def preupdate(self):
        self.replay_sample_index = None

    def step(self, agents, t, terminal):

        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if t % 100 != 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.BATCH_SIZE)

        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for agent in agents:
            obs, act, rew, obs_next, done = agent.replay_buffer.sample_index(index)
            obs_n.append(torch.FloatTensor(obs).to(device))
            obs_next_n.append(torch.FloatTensor(obs_next).to(device))
            act_n.append(torch.FloatTensor(act).to(device))

        state_batch, action_batch, reward_batch, state_next_batch, t_batch = self.replay_buffer.sample_index(index)

        state_batch = torch.FloatTensor(state_batch).to(device)
        action_batch = torch.FloatTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        t_batch = torch.FloatTensor(t_batch).to(device)
        state_next_batch = torch.FloatTensor(state_next_batch).to(device)
        reward_batch = torch.reshape(reward_batch, (-1, 1))  # reshape to column vectors (batch_size, 1)
        t_batch = torch.reshape(t_batch, (-1, 1))

        # Train the critic network.
        if self.algorithm == 'MADDPG':
            if self.discrete_action:
                target_actions = [onehot_from_logits(agent.actor_target(nobs)) for agent, nobs in
                                  zip(agents, obs_next_n)]
            else:
                target_actions = [agent.actor_target(nobs) for agent, nobs in zip(agents, obs_next_n)]

            obs_next_concat = torch.cat(obs_next_n, dim=-1)
            target_actions = torch.cat(target_actions, dim=-1)
        else:  # Get actions in DDPG mode.
            if self.discrete_action:
                target_actions = onehot_from_logits(self.actor_target(state_next_batch))
            else:
                target_actions = self.actor_target(state_next_batch)
            obs_next_concat = state_next_batch

        predicted_q_value = self.critic_target(obs_next_concat, target_actions)
        Q_targets = reward_batch + ((1 - t_batch) * self.GAMMA * predicted_q_value).detach()

        if self.algorithm == 'MADDPG':
            obs_concat = torch.cat(obs_n, dim=-1)
            action_concat = torch.cat(act_n, dim=-1)
        else:
            obs_concat = state_batch
            action_concat = action_batch

        self.critic.train_step(obs_concat, action_concat, Q_targets)

        all_actions = []
        if self.discrete_action:
            curr_pol_out = self.actor(state_batch)
            curr_pol_vf_in = gumbel_softmax(curr_pol_out, hard=True)
        else:
            curr_pol_out = self.actor(state_batch)
            curr_pol_vf_in = curr_pol_out

        if self.algorithm == 'MADDPG':  # Get the actions of all actors in MADDPG mode.
            for i, (agent, obs) in enumerate(zip(agents, obs_n)):
                if i == self.pos:
                    all_actions.append(curr_pol_vf_in)
                elif self.discrete_action:
                    all_actions.append(onehot_from_logits(agent.actor(obs)))
                else:
                    all_actions.append(agent.actor(obs))
            actions_concatenated = torch.cat(all_actions, dim=-1)
        else:  # Get ONLY the action of the current actor in DDPG.
            actions_concatenated = curr_pol_vf_in

        self.actor.train_step(self.critic, obs_concat, actions_concatenated, curr_pol_out)

        self.soft_update(self.actor, self.actor_target, tau=self.args.tau)
        self.soft_update(self.critic, self.critic_target, tau=self.args.tau)

    def experience(self, obs, act, rew, new_obs, done):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, done)

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy."""
        state = torch.FloatTensor(state).unsqueeze(0).to(device)

        noise = self.noise()
        noise = torch.FloatTensor(noise).unsqueeze(0).to(device)
        action = self.actor(state)

        if self.discrete_action:
            if add_noise:
                action = gumbel_softmax(action, hard=True)
            else:
                action = onehot_from_logits(action)
        else:
            if add_noise:
                action = action + noise
            action = action.clamp(-1, 1)

        action = action.cpu().detach().numpy()[0]
        return action

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
        local_model: PyTorch model (weights will be copied from)
        target_model: PyTorch model (weights will be copied to)
        tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
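
Below is a minimal training-loop sketch showing how this `Agent` might be driven, not part of the original example. The `Actor`/`Critic` network wrappers (which must expose `train_step` and `hard_copy`), the multi-agent environment `env`, the dimensions `obs_dim`/`act_dim`, and the hyper-parameter values are assumptions about the surrounding project.

```python
# Hypothetical multi-agent training loop; `Actor`, `Critic`, and `env` are
# placeholders for project-specific classes and are not defined above.
import argparse

args = argparse.Namespace(batch_size=1024, GAMMA=0.95, tau=0.01)
obs_dim, act_dim = 10, 5                                 # hypothetical dimensions (act_dim matches the OU noise size)

agents = []
for i in range(env.n):                                   # one Agent per environment agent
    actor, critic = Actor(obs_dim, act_dim), Critic(obs_dim, act_dim)
    actor_t, critic_t = Actor(obs_dim, act_dim), Critic(obs_dim, act_dim)
    agents.append(Agent(i, actor, critic, actor_t, critic_t,
                        train_mode=True, discrete_action=True, args=args))

obs_n = env.reset()
for t in range(100000):
    act_n = [agent.act(obs, add_noise=True) for agent, obs in zip(agents, obs_n)]
    new_obs_n, rew_n, done_n, _ = env.step(act_n)
    for i, agent in enumerate(agents):
        agent.experience(obs_n[i], act_n[i], rew_n[i], new_obs_n[i], done_n[i])
    for i, agent in enumerate(agents):
        agent.preupdate()
        agent.step(agents, t, done_n[i])                 # trains critic/actor, then soft-updates targets
    obs_n = new_obs_n
```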