def __init__(self, eps, lr, gamma, batch_size, tau, max_memory, lambda_1,
             lambda_2, lambda_3, n_steps, l_margin):
    """Store the hyper-parameters and build replay, loss, policy and optimizer.

    Args:
        eps: epsilon used for eps-greedy action selection.
        lr: learning rate handed to the Adam optimizer.
        gamma: discount factor.
        batch_size: minibatch size.
        tau: frequency of target replacement.
        max_memory: capacity passed to the replay ``Memory``.
        lambda_1: weight of the n-step return term.
        lambda_2: weight of the supervised loss term.
        lambda_3: L2 weight; also used as Adam's ``weight_decay``.
        n_steps: number of steps for the n-step return.
        l_margin: margin value stored as ``self.l_margin``.
    """
    # --- exploration / discounting / batching ---
    self.eps = eps
    self.gamma = gamma
    self.batch_size = batch_size
    self.tau = tau

    # --- priority bonuses (TODO: currently unused) ---
    self.ed = 0.005  # bonus for demonstration transitions
    self.ea = 0.001

    # --- loss shaping ---
    self.l_margin = l_margin
    self.n_steps = n_steps
    self.lambda1 = lambda_1
    self.lambda2 = lambda_2
    self.lambda3 = lambda_3

    # Target replacement counter (TODO: rename to iter_counter).
    self.counter = 0

    # --- replay storage ---
    self.replay = Memory(capacity=max_memory)
    self.replay.e = 0
    self.demoReplay = ddict(list)

    # --- learner ---
    self.loss = nn.MSELoss()
    self.policy = Policy()  # TODO: avoid having to pass the architecture
    predict_net = self.policy.predictNet
    self.opt = optim.Adam(predict_net.parameters(), lr=lr,
                          weight_decay=lambda_3)
    # True when the prediction network exposes a ``sample`` attribute.
    self.noisy = hasattr(predict_net, "sample")
def __init__(self):
    """Create the online/target networks, replay memory, optimizer and loss."""
    # Online network first, then the target network (same architecture).
    self.network = AtariNet(ACTIONS_SIZE)
    self.target_network = AtariNet(ACTIONS_SIZE)

    self.memory = Memory(MEMORY_SIZE)

    # Number of learning updates performed so far.
    self.learning_count = 0

    # Only the online network's parameters are optimized.
    online_params = self.network.parameters()
    self.optimizer = torch.optim.Adam(online_params, lr=LR)
    self.loss_func = nn.MSELoss()
class ReplayBuffer:
    """Buffer of experience tuples stored in a prioritized ``Memory``."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = Memory(capacity=buffer_size,
                             replay_beta=REPLAY_BETA,
                             replay_alpha=REPLAY_ALPHA,
                             replay_beta_increment=REPLAY_BETA_INCREMENT)
        self.batch_size = batch_size
        # random.seed() returns None, so seed the generator and keep the
        # actual seed value on the instance instead of its return value.
        random.seed(seed)
        self.seed = seed
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory.

        While the memory holds at most ``batch_size`` entries the priority is
        a uniform random number; afterwards it is ``memory.max_prio``.
        """
        if len(self.memory) <= self.batch_size:
            error = random.random()
        else:
            error = self.memory.max_prio
        e = self.experience(state, action, reward, next_state, done)
        self.memory.add(error, e)

    def sample(self):
        """Sample a batch of experiences from memory.

        Returns:
            ``((states, actions, rewards, next_states, dones), idxs, ws)``
            where the five tensors live on ``device`` and ``idxs`` / ``ws``
            are passed through unchanged from ``Memory.sample``.
        """
        experiences, idxs, ws = self.memory.sample(n=self.batch_size)
        # Drop empty slots once instead of re-filtering for every field.
        valid = [e for e in experiences if e is not None]

        def stack(field):
            return np.vstack([getattr(e, field) for e in valid])

        states = torch.from_numpy(stack("state")).float().to(device)
        actions = torch.from_numpy(stack("action")).long().to(device)
        rewards = torch.from_numpy(stack("reward")).float().to(device)
        next_states = torch.from_numpy(stack("next_state")).float().to(device)
        # Booleans are converted to uint8 first so from_numpy accepts them.
        dones = torch.from_numpy(
            stack("done").astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones), idxs, ws

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
def __init__(self, replay_size, memory_size=10000, prioritized=False):
    """Set up the replay queue, the model and (optionally) prioritized memory.

    Args:
        replay_size: maximum length of the plain ``deque`` replay queue.
        memory_size: capacity of the prioritized ``Memory``.
        prioritized: when true, a ``Memory`` is created as ``self.memory``;
            otherwise ``self.memory`` is not set at all.
    """
    self.step = 0

    # Plain FIFO replay, bounded by replay_size.
    self.replay_size = replay_size
    self.replay_queue = deque(maxlen=replay_size)
    self.memory_size = memory_size

    self.model = self.create_model()

    self.prioritized = prioritized
    if prioritized:
        self.memory = Memory(capacity=memory_size)
class Agent(object):
    """DQN agent that learns from a prioritized replay ``Memory``."""

    def __init__(self):
        """Create the online/target networks, memory, optimizer and loss."""
        self.network, self.target_network = AtariNet(ACTIONS_SIZE), AtariNet(
            ACTIONS_SIZE)
        self.memory = Memory(MEMORY_SIZE)
        self.learning_count = 0
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

    def action(self, state, israndom):
        """Return an action index for ``state``.

        With ``israndom`` set, a uniformly random action is returned with
        probability ``EPSILON``; otherwise the arg-max of the online
        network's output is returned.
        """
        if israndom and random.random() < EPSILON:
            return np.random.randint(0, ACTIONS_SIZE)
        state = torch.unsqueeze(torch.FloatTensor(state), 0)
        actions_value = self.network.forward(state)
        return torch.max(actions_value, 1)[1].data.numpy()[0]

    def learn(self, state, action, reward, next_state, done):
        """Store one transition with its TD-error priority, then train.

        The transition is always added to memory. A minibatch update only
        runs once the memory holds at least ``MEMORY_THRESHOLD`` entries.
        """
        old_val = self.network.forward(torch.FloatTensor([state])).gather(
            1, torch.LongTensor([[action]]))[0]

        # The stored flag is a "not done" mask (0 for terminal transitions)
        # so it can multiply the bootstrap term in the batch update below.
        if done:
            not_done = 0
            target = reward
        else:
            not_done = 1
            # Bootstrap from the NEXT state through the target network, the
            # same rule the minibatch update uses. The previous code
            # evaluated the current state here, which is not a TD target.
            next_val = self.target_network.forward(
                torch.FloatTensor([next_state])).detach()
            target = reward + GAMMA * torch.max(next_val)

        error = abs(old_val[0] - target)
        self.memory.add(error.data,
                        (state, action, reward, next_state, not_done))

        if self.memory.tree.n_entries < MEMORY_THRESHOLD:
            return

        # Hard-sync the target network every UPDATE_TIME learning steps.
        if self.learning_count % UPDATE_TIME == 0:
            self.target_network.load_state_dict(self.network.state_dict())
        self.learning_count += 1

        batch, idxs, is_weights = self.memory.sample(BATCH_SIZE)
        state = torch.FloatTensor([x[0] for x in batch])
        action = torch.LongTensor([[x[1]] for x in batch])
        reward = torch.FloatTensor([[x[2]] for x in batch])
        next_state = torch.FloatTensor([x[3] for x in batch])
        not_done_mask = torch.FloatTensor([[x[4]] for x in batch])

        eval_q = self.network.forward(state).gather(1, action)
        next_q = self.target_network(next_state).detach()
        target_q = (reward
                    + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1)
                    * not_done_mask)

        errors = torch.abs(eval_q - target_q).data.numpy().flatten()
        # NOTE(review): is_weights is sampled but not applied to the loss.
        loss = self.loss_func(eval_q, target_q)

        # Refresh the priority of every sampled transition.
        for idx, err in zip(idxs, errors):
            self.memory.update(idx, err)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
def reset(self, seed=0):
    """Restart the agent: zero the clock, reseed NumPy, clear replay, rebuild model.

    Args:
        seed: value passed to ``np.random.seed``.
    """
    self.t = 0

    # Seed before anything random is constructed below.
    np.random.seed(seed)

    # A brand-new buffer discards all previously stored experience.
    self.replay_buffer = Memory(self.n_replay)

    # Model input size is the product of the two env.size entries; 1 output.
    rows, cols = self.env.size[0], self.env.size[1]
    self.build_model(rows * cols, 1)
def reset(self, seed):
    """Restart the agent: zero the clock, reseed NumPy, clear replay, rebuild model.

    Args:
        seed: value passed to ``np.random.seed``.
    """
    self.t = 0

    # Seed before anything random is constructed below.
    np.random.seed(seed)

    # A brand-new buffer discards all previously stored experience.
    self.replay_buffer = Memory(self.n_replay)

    # One input per feature, one output per environment action.
    n_actions = self.env.nA
    self.build_model(self.n_features, n_actions)
def __init__(self, replay_size, memory_size=10000, prioritized=False):
    """Set up replay storage plus an online model and a weight-synced target.

    Args:
        replay_size: maximum length of the plain ``deque`` replay queue.
        memory_size: capacity of the prioritized ``Memory``.
        prioritized: when true, a ``Memory`` is created as ``self.memory``;
            otherwise ``self.memory`` is not set at all.
    """
    self.step = 0

    # Plain FIFO replay, bounded by replay_size.
    self.replay_size = replay_size
    self.replay_queue = deque(maxlen=replay_size)
    self.memory_size = memory_size

    self.tau = 1e-2  # value chosen for MountainCar-v0

    self.model = self.create_model()
    self.prioritized = prioritized

    # The target starts as an exact copy of the online model's weights.
    self.target_model = self.create_model()
    online_weights = self.model.get_weights()
    self.target_model.set_weights(online_weights)

    if prioritized:
        self.memory = Memory(capacity=memory_size)
def __init__(self):
    """Build both networks on the GPU plus memory, optimizer, loss and plot."""
    # Online network first, then the target network; both moved to CUDA.
    self.eval_net = Net()
    self.target_net = Net()
    self.eval_net.cuda()
    self.target_net.cuda()

    # Prioritized replay memory (SumTree-backed, per the original note).
    self.memory = Memory(Train_Configs.MEMORY_CAPACITY)
    self.learn_counter = 0

    # Only the online network is optimized.
    self.optimizer = optim.Adam(
        self.eval_net.parameters(),
        lr=Train_Configs.LR,
        betas=(0.9, 0.99),
        eps=1e-08,
        weight_decay=2e-5,
    )
    # NOTE(review): reduce/size_average are legacy flags; presumably this
    # yields an unreduced per-element loss - confirm on the installed torch.
    self.loss = nn.MSELoss(reduce=False, size_average=False)

    figure, axes = plt.subplots()
    self.fig = figure
    self.ax = axes

    self.discount_factor = Train_Configs.GAMMA
def __init__(self, action_size, buffer_size, batch_size, seed):
    """Initialize a ReplayBuffer object.

    Params
    ======
        action_size (int): dimension of each action
        buffer_size (int): maximum size of buffer
        batch_size (int): size of each training batch
        seed (int): random seed
    """
    self.action_size = action_size
    self.memory = Memory(capacity=buffer_size,
                         replay_beta=REPLAY_BETA,
                         replay_alpha=REPLAY_ALPHA,
                         replay_beta_increment=REPLAY_BETA_INCREMENT)
    self.batch_size = batch_size
    # random.seed() returns None, so seed the generator and keep the actual
    # seed value on the instance instead of its return value.
    random.seed(seed)
    self.seed = seed
    self.experience = namedtuple(
        "Experience",
        field_names=["state", "action", "reward", "next_state", "done"])
def __init__(self, model_params, env_params):
    """Keep both parameter objects, build the model and the replay memory.

    Args:
        model_params: model parameters; ``max_memory`` is read from it.
        env_params: environment parameters.
    """
    self.mp = model_params  # model parameters
    self.ep = env_params    # environment parameters

    self.model = self._build_model()

    # Prioritized memory (sum-tree based, per the original note); it
    # replaced an earlier plain deque-backed buffer.
    capacity = model_params.max_memory
    self.memory = Memory(capacity)
def __init__(self,
             state_size,
             action_size,
             seed,
             buffer_size=int(1e6),
             batch_size=64,
             gamma=0.99,
             tau=1e-3,
             update_every=3,
             num_mc_steps=5,
             num_agents=2):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        buffer_size (int): capacity of the shared replay ``Memory``
        batch_size (int): minibatch size; also used as ``train_start``
        gamma (float): discount factor
        tau (float): stored as ``self.TAU``
        update_every (int): stored as ``self.UPDATE_EVERY``
        num_mc_steps (int): length of each per-agent experience queue
        num_agents (int): number of D4PG learners and experience queues
    """
    self.BATCH_SIZE = batch_size
    self.GAMMA = gamma
    self.TAU = tau
    self.UPDATE_EVERY = update_every
    self.num_mc_steps = num_mc_steps

    # One n-step experience queue per agent.
    self.experiences = [
        ExperienceQueue(num_mc_steps) for _ in range(num_agents)
    ]
    self.memory = Memory(buffer_size)
    self.t_step = 0
    self.train_start = batch_size

    # One learner per agent. This used to be a hard-coded pair, which fell
    # out of step with ``self.experiences`` whenever num_agents != 2.
    self.mad4pg_agent = [
        D4PG(state_size,
             action_size,
             seed,
             device,
             num_atoms=N_ATOMS,
             q_min=Vmin,
             q_max=Vmax) for _ in range(num_agents)
    ]
def __init__(self, state_size, action_size, config):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        config (mapping): options read with ``.get``; keys and defaults are
            nodes [128, 64], seed 0, lr 1e-4, memory_size 100000,
            batch_size 256, discount 0.9, tau 0.001, epsilon 0.1,
            epsilon_end 0.0001, epsilon_decay 0.995, learn_every 4,
            dqn "simple", per False
    """
    self.config = config
    self.state_size = state_size
    self.action_size = action_size

    cfg = self.config
    nodes = cfg.get("nodes", [128, 64])
    self.seed = cfg.get("seed", 0)
    lr = cfg.get("lr", 1e-4)
    memory_size = cfg.get("memory_size", 100000)
    self.batch_size = cfg.get("batch_size", 256)
    self.discount = cfg.get("discount", 0.9)
    self.tau = cfg.get("tau", 0.001)
    self.epsilon = cfg.get("epsilon", 0.1)
    self.epsilon_end = cfg.get("epsilon_end", 0.0001)
    self.epsilon_decay = cfg.get("epsilon_decay", 0.995)
    self.learn_every = cfg.get("learn_every", 4)
    self.dqn = cfg.get("dqn", "simple")
    self.per = cfg.get("per", False)

    # Seed every RNG in use before any network is constructed.
    np.random.seed(self.seed)
    random.seed(self.seed)
    torch.manual_seed(self.seed)

    # Q-networks: the "dueling" variant does not take the ``nodes`` option.
    if self.dqn == "dueling":
        def build_network():
            return Dueling_QNetwork(state_size, action_size, self.seed)
    else:
        def build_network():
            return QNetwork(state_size, action_size, self.seed, nodes=nodes)

    self.qnetwork_local = build_network().to(device)
    self.qnetwork_target = build_network().to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

    # Replay memory: prioritized when "per" is set, plain buffer otherwise.
    if self.per:
        self.memory = Memory(memory_size)
    else:
        self.memory = ReplayBuffer(memory_size, self.batch_size, self.seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.scores = []
def __init__(self,
             state_size,
             action_size,
             random_seed,
             buffer_size=BUFFER_SIZE,
             batch_size=BATCH_SIZE):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): seed for ``random`` and for the noise process
        buffer_size (int): maximum size of buffer
        batch_size (int): size of each training batch
    """
    self.state_size = state_size
    self.action_size = action_size
    # random.seed() returns None, so seed the generator and keep the actual
    # seed value on the instance instead of its return value.
    random.seed(random_seed)
    self.seed = random_seed

    self.buffer_size = buffer_size
    # Internal memory using SumTree (per the original note).
    self.memory = Memory(capacity=self.buffer_size)
    self.batch_size = batch_size

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size).to(device)
    self.actor_target = Actor(state_size, action_size).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size).to(device)
    self.critic_target = Critic(state_size, action_size).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    # random.seed() returns None, so seed the generator and keep the actual
    # seed value on the instance instead of its return value.
    random.seed(seed)
    self.seed = seed

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = Memory(BUFFER_SIZE)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size):
    """Set hyper-parameters and build memory, networks, optimizer and scheduler.

    Args:
        state_size: size of the state, passed to ``DQN``.
        action_size: size of the action space, passed to ``DQN``.
    """
    self.render = False
    self.load_model = False

    # get size of state and action
    self.state_size = state_size
    self.action_size = action_size

    # Hyper-parameters
    self.discount_factor = 0.99
    self.learning_rate = 0.001
    self.lr_step_size = 10
    self.lr_gamma = 0.9
    self.memory_size = 2**15
    self.epsilon = 1.0
    self.epsilon_min = 0.05
    self.explore_step = 1000
    self.epsilon_decay = 0.99995
    self.batch_size = 64
    self.train_start = 10000

    # create prioritized replay memory using SumTree
    self.memory = Memory(self.memory_size)

    # create main model and target model
    self.model = DQN(state_size, action_size)
    self.model.apply(self.weights_init)
    self.target_model = DQN(state_size, action_size)

    # Load the checkpoint BEFORE the optimizer, scheduler and target sync
    # are created. Loading afterwards left the optimizer bound to the
    # parameters of the discarded network.
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    if self.load_model:
        self.model = torch.load('save_model/per_dqn')
        self.model.train()

    self.optimizer = optim.Adam(self.model.parameters(),
                                lr=self.learning_rate)
    self.scheduler = StepLR(self.optimizer,
                            step_size=self.lr_step_size,
                            gamma=self.lr_gamma)

    # initialize target model
    self.update_target_model()
def __init__(self, state_size, action_size):
    """Set hyper-parameters and build memory, networks and optimizer.

    Args:
        state_size: size of the state, passed to ``DQN``.
        action_size: size of the action space, passed to ``DQN``.
    """
    # if you want to see Cartpole learning, then change to True
    self.render = False
    self.load_model = False

    # get size of state and action
    self.state_size = state_size
    self.action_size = action_size

    # These are hyper parameters for the DQN
    self.discount_factor = 0.99
    self.learning_rate = 0.001
    self.memory_size = 20000
    self.epsilon = 1.0
    self.epsilon_min = 0.01
    self.explore_step = 5000
    # Step size that takes epsilon from its start value down to
    # epsilon_min in explore_step equal decrements.
    self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
    self.batch_size = 64
    self.train_start = 1000

    # create prioritized replay memory using SumTree
    self.memory = Memory(self.memory_size)

    # create main model and target model
    self.model = DQN(state_size, action_size)
    self.model.apply(self.weights_init)
    self.target_model = DQN(state_size, action_size)

    # Load the checkpoint BEFORE the optimizer and target sync are created.
    # Loading afterwards left the optimizer bound to the parameters of the
    # discarded network.
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    if self.load_model:
        self.model = torch.load('save_model/cartpole_dqn')

    self.optimizer = optim.Adam(self.model.parameters(),
                                lr=self.learning_rate)

    # initialize target model
    self.update_target_model()
def __init__(self,
             state_size: int = 37,
             action_size: int = 4,
             seed: int = 44,
             gamma: float = 0.99,
             tau: float = 1e-3):
    """
    Initialize an Agent object.

    :param state_size: dimension of each state
    :param action_size: dimension of each action
    :param seed: random seed for network initialisation
    :param gamma: discount factor
    :param tau: lag for soft update of target network parameters
    """
    self.state_size = state_size
    self.action_size = action_size
    # random.seed() returns None, so seed the generator and keep the actual
    # seed value on the instance instead of its return value.
    random.seed(seed)
    self.seed = seed
    self.gamma = gamma
    self.tau = tau
    self.max_w = 0

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size,
                                   seed).to(self.device)
    self.qnetwork_target = QNetwork(state_size, action_size,
                                    seed).to(self.device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.LR)

    # Prioritised Experience Replay memory
    self.memory = Memory(self.BUFFER_SIZE)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, replay_size, memory_size=10000, prioritized=False,
             load_models=False, actor_model_file='', critic_model_file='',
             is_eval=False):
    """Set up replay storage, hyper-parameters and the actor/critic networks.

    Args:
        replay_size: maximum length of the plain ``deque`` replay queue.
        memory_size: capacity of the prioritized ``Memory``.
        prioritized: when true, a ``Memory`` is created as ``self.memory``;
            otherwise ``self.memory`` is not set at all.
        load_models: when true, weights are loaded from any non-empty file
            name given below.
        actor_model_file: weights file for the actor (skipped when empty).
        critic_model_file: weights file for the critic (skipped when empty).
        is_eval: stored as ``self.is_eval``.
    """
    self.state_size = 2
    self.action_size = 3
    self.step = 0

    # Plain FIFO replay, bounded by replay_size.
    self.replay_size = replay_size
    self.replay_queue = deque(maxlen=replay_size)
    self.memory_size = memory_size

    self.prioritized = prioritized
    if prioritized:
        self.memory = Memory(capacity=memory_size)

    # Hyper parameters for learning
    self.value_size = 1
    self.layer_size = 16
    self.discount_factor = 0.99
    self.actor_learning_rate = 0.0005
    self.critic_learning_rate = 0.005
    self.is_eval = is_eval

    # Create actor and critic neural networks
    self.actor = self.build_actor()
    self.critic = self.build_critic()

    # Restore saved weights only when asked to and a file name was given.
    if load_models and actor_model_file:
        self.actor.load_weights(actor_model_file)
    if load_models and critic_model_file:
        self.critic.load_weights(critic_model_file)
def __init__(self, learn_rate=0.001, img_size=(84, 84), num_frames=3,
             action_size=6, replay_size=64, max_memory=20000, is_test=False,
             num_episodes=10):
    """Coerce the settings to plain types, create the memory and the Q-network.

    Args:
        learn_rate: optimizer learning rate.
        img_size: downsampled image size.
        num_frames: deterministic frame-skip count.
        action_size: number of possible actions in the environment.
        replay_size: size of the minibatch sampled from memory.
        max_memory: capacity of the replay ``Memory``.
        is_test: stored as ``self.is_test``.
        num_episodes: stored as ``self.num_episodes``.
    """
    # --- inputs, normalised to built-in types ---
    self.img_size = tuple(img_size)
    self.num_frames = int(num_frames)
    self.learn_rate = float(learn_rate)
    self.action_size = int(action_size)

    # --- fixed settings ---
    self.num_epochs = 1  # epoch count used for training
    self.tau = 0.01

    self.is_test = bool(is_test)

    # --- memory (prioritized Memory; a plain deque was used previously) ---
    self.replay_size = int(replay_size)
    capacity = int(max_memory)
    self.memory = Memory(capacity)
    self.num_episodes = int(num_episodes)

    # --- agent ---
    self._construct_q_network()
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator and keep the
        # actual seed value on the instance instead of its return value.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = Memory(BUFFER_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition with its TD-error priority; learn periodically.

        Params
        ======
            state (np.ndarray): current state
            action (int): index of the action taken
            reward (float): reward received
            next_state (np.ndarray): resulting state
            done (bool): whether the episode ended
        """
        state_t = torch.from_numpy(state).float().unsqueeze(0).to(device)
        next_state_t = torch.from_numpy(next_state).float().unsqueeze(0).to(device)

        self.qnetwork_local.eval()
        self.qnetwork_target.eval()
        with torch.no_grad():
            target_action_values = self.qnetwork_target(next_state_t)
            expected_action_values = self.qnetwork_local(state_t)
        self.qnetwork_local.train()
        self.qnetwork_target.train()

        old_val = expected_action_values[0][action]
        new_val = reward
        if not done:
            new_val += GAMMA * torch.max(target_action_values)
        # Hand the memory a plain float rather than a (possibly CUDA) tensor.
        error = float(abs(old_val - new_val))

        # Save experience in replay memory. Batched states are stored as CPU
        # NumPy arrays so learn() can np.vstack them on any device; storing
        # device tensors only worked when running on the CPU.
        self.memory.add(error, (state_t.cpu().numpy(), action, reward,
                                next_state_t.cpu().numpy(), done))

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, sample and learn
            if self.memory.tree.n_entries > BATCH_SIZE:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(int)
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences: ``(mini_batches, idxs, is_weights)`` as returned by
                ``Memory.sample``; each mini-batch entry is an
                (s, a, r, s', done) tuple
            gamma (float): discount factor
        """
        mini_batches, idxs, is_weights = experiences

        states = torch.from_numpy(np.vstack([t[0] for t in mini_batches])).float().to(device)
        actions = torch.from_numpy(np.vstack([t[1] for t in mini_batches])).long().to(device)
        rewards = torch.from_numpy(np.vstack([t[2] for t in mini_batches])).float().to(device)
        next_states = torch.from_numpy(np.vstack([t[3] for t in mini_batches])).float().to(device)
        dones = torch.from_numpy(np.vstack([int(t[4]) for t in mini_batches])).float().to(device)

        # Double DQN: the local network picks the next action, the target
        # network evaluates it. gather() keeps this on-device, replacing a
        # Python loop that rebuilt the values as a CPU tensor.
        best_next_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        Q_double_target = self.qnetwork_target(next_states).detach().gather(1, best_next_actions)

        Q_observed = rewards + (gamma * Q_double_target * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # One scalar TD error per sampled transition; .cpu() makes this
        # valid on CUDA too.
        errors = torch.abs(Q_expected - Q_observed).detach().cpu().numpy().flatten()

        # update priority
        for idx, err in zip(idxs, errors):
            self.memory.update(idx, err)

        # Importance-sampling weights must scale each sample's squared error.
        # Previously they multiplied the already-averaged scalar loss, which
        # only rescaled it by mean(is_weights).
        weights = torch.FloatTensor(is_weights).view(-1, 1).to(device)
        loss = (weights * (Q_expected - Q_observed) ** 2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class PERAgent(OffPolicyAgent):
    """Off-policy agent whose replay buffer is prioritized by TD error."""

    # construct agent's model separately, so it can be sized according to problem
    def __init__(self, n_replay, env, target_policy, behavior_policy, lr, discount, type = 'BC'):
        """Forward every argument unchanged to ``OffPolicyAgent.__init__``."""
        super().__init__(n_replay, env, target_policy, behavior_policy, lr, discount, type)

    # reseed numpy, reset weights of network
    # Reset must be performed before every episode
    def reset(self, seed):
        """Zero the clock, reseed NumPy, recreate the buffer and the model."""
        # Reset time
        self.t = 0
        # Set seed value
        np.random.seed(seed)
        # Reset replay buffer
        self.replay_buffer = Memory(self.n_replay)
        # Rebuild model
        self.build_model(self.n_features, self.env.nA)

    def generate_action(self, s, target_policy_sel = True):
        """Sample an action for state ``s`` from the target or behavior policy."""
        pval = self.target_policy[s] if target_policy_sel else self.behavior_policy[s]
        return np.random.choice(a=self.actions, p=pval)

    def generate_all_actions(self, target_policy_sel = True):
        """Sample one action for every state row of the target policy."""
        return np.array([self.generate_action(item, target_policy_sel)
                         for item in range(self.target_policy.shape[0])])

    # Generate steps of experience
    def generate_experience(self, k=16):
        """Run ``k`` behavior-policy steps and store them with TD-error priority.

        Returns:
            1 if a TD error exceeds 1e5 (treated as divergence, the run is
            aborted), otherwise 0.
        """
        # Initialize environment
        s = self.env.reset()

        for _ in range(k):
            # choose action according to behavior policy
            a = self.generate_action(s, False)

            # Take a step in environment based on chosen action
            (s2, r, done, _info) = self.env.step(a)

            # states and target action for computing TD error
            current_state = self.construct_features([s])
            next_state = self.construct_features([s2])
            # NOTE(review): the target-policy action is sampled for s, not
            # s2, matching train_batch - confirm this is intended.
            target_policy_action = self.generate_action(s, True)

            # Get bootstrap estimate of next state action values
            value_s = self.model.predict([current_state, np.zeros(current_state.shape[0])])
            value_next_s = self.model.predict([next_state, np.zeros(next_state.shape[0])])
            updated_val = r if done else (r + self.discount*value_next_s[0][target_policy_action])

            # Compute TD error
            td_error = np.abs(updated_val - value_s[0][a])

            # Stop execution if weights blow up - not converged
            if td_error > 10**5:
                return 1

            # Add experience to the prioritized replay buffer
            self.replay_buffer.add_per(td_error, (s, a, r, s2))

            # Set for next step
            s = s2
            self.t += 1

            # If episode ends, reset environment
            if done:
                s = self.env.reset()
        return 0

    # do batch of training using replay buffer
    def train_batch(self, n_samples, batch_size):
        """Sample ``n_samples`` transitions, refresh priorities, fit the model.

        Args:
            n_samples: number of transitions drawn from the replay buffer.
            batch_size: minibatch size passed to ``model.fit``.
        """
        # Sample a minibatch from replay buffer
        data_samples, idxs, ratios, _ = self.replay_buffer.sample(n_samples)

        # Extract rewards, states, next states, actions from samples
        rewards = extract_transition_components(data_samples, TransitionComponent.reward)
        next_states = extract_transition_components(data_samples, TransitionComponent.next_state)
        next_state_features = self.construct_features(next_states)
        states = extract_transition_components(data_samples, TransitionComponent.state)
        state_features = self.construct_features(states)
        actions = extract_transition_components(data_samples, TransitionComponent.action)

        # Calculate Target policy actions
        target_policy_actions = np.array([self.generate_action(state, True) for state in states])

        # Calculate state values for TD error
        next_values_sa = self.model.predict([next_state_features, np.zeros(next_state_features.shape[0])])
        next_values = np.choose(target_policy_actions, next_values_sa.T)

        # v(s') is zero for terminal state, so need to fix model prediction
        for i in range(n_samples):
            # TODO: the terminal ids -1 and 10 only fit a random walk of size 10
            if next_states[i] == -1 or next_states[i] == 10:
                next_values[i] = 0.0

        # Compute targets by bootstrap estimates
        targets = (rewards + self.discount*next_values)

        # Compute error for updating priorities
        pred_values = self.model.predict([state_features, np.zeros(state_features.shape[0])])
        # Only the taken action's output is replaced by the target.
        final_targets = np.copy(pred_values)
        np.put_along_axis(final_targets, np.expand_dims(actions, axis = 1), targets[:, np.newaxis], axis = 1)
        pred = np.choose(actions, pred_values.T)
        error = np.abs(pred - targets)

        # Priority update for every sampled transition. The loop used to run
        # over range(batch_size), which skipped samples when
        # batch_size < n_samples and raised IndexError when it was larger.
        for idx, err in zip(idxs, error):
            self.replay_buffer.update(idx, err)

        # train on samples
        self.model.fit([state_features, ratios], final_targets, batch_size=batch_size, verbose=0)
class Dqn():
    """DQN over RGB + depth images with a prioritized replay memory."""

    def __init__(self):
        """Build both networks on the GPU plus memory, optimizer, loss and plot."""
        self.eval_net, self.target_net = Net(), Net()
        self.eval_net.cuda()
        self.target_net.cuda()

        # create prioritized replay memory using SumTree
        self.memory = Memory(Train_Configs.MEMORY_CAPACITY)
        self.learn_counter = 0
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=Train_Configs.LR,
                                    betas=(0.9, 0.99), eps=1e-08, weight_decay=2e-5)
        # Unreduced loss: learn() applies the importance weights per sample
        # and averages afterwards.
        self.loss = nn.MSELoss(reduce=False, size_average=False)

        self.fig, self.ax = plt.subplots()
        self.discount_factor = Train_Configs.GAMMA

    @staticmethod
    def _rgb_path(depth_path):
        """Map a depth-state path to the path of its RGB image."""
        return depth_path.replace('npy', 'png').replace('state_depth', 'state_image')

    def store_trans(self, state_path, action, reward, next_state_path, done):
        """Store one transition, prioritized by its current TD error.

        The transition is stored as the string
        ``state_path#action#reward#next_state_path``. The TD error is
        computed only on the rotation channel selected by ``action``.
        """
        # action type: id
        x, y, c = my_utils.translate_actionID_to_XY_and_channel(action)
        trans = state_path + '#' + str(action) + '#' + str(reward) + '#' + next_state_path

        shape = (1, 3, DIM_STATES[0], DIM_STATES[1])
        state_rgb_path = self._rgb_path(state_path)
        next_state_rgb_path = self._rgb_path(next_state_path)

        # Depth input: the raw path for channel 0, the rotated image otherwise.
        state_d = state_path
        next_state_d = next_state_path
        if c > 0:
            state_d = my_utils.get_rotate_depth(c, state_d)
            next_state_d = my_utils.get_rotate_depth(c, next_state_d)
        state_depth = my_utils.copy_depth_to_3_channel(state_d).reshape(*shape)
        next_state_depth = my_utils.copy_depth_to_3_channel(next_state_d).reshape(*shape)

        # RGB input: read from disk for channel 0, rotated otherwise.
        if c == 0:
            state_rgb = my_utils.trans_HWC_to_CHW(cv2.imread(state_rgb_path)).reshape(*shape)
            next_state_rgb = my_utils.trans_HWC_to_CHW(cv2.imread(next_state_rgb_path)).reshape(*shape)
        else:
            state_rgb = my_utils.get_rotate_rgb(c, state_rgb_path).reshape(*shape)
            next_state_rgb = my_utils.get_rotate_rgb(c, next_state_rgb_path).reshape(*shape)

        # numpy to tensor
        state_depth = torch.cuda.FloatTensor(state_depth)
        next_state_depth = torch.cuda.FloatTensor(next_state_depth)
        state_rgb = torch.cuda.FloatTensor(state_rgb)
        next_state_rgb = torch.cuda.FloatTensor(next_state_rgb)

        # Single-channel q-map ([1, 1, 224, 224] per the original note).
        q_map = self.eval_net.forward(state_rgb, state_depth)
        old_val = q_map[0][0][x][y]

        next_q_map = self.target_net.forward(next_state_rgb, next_state_depth)
        if done == 1:
            target_q = reward
        else:
            target_q = reward + self.discount_factor * torch.max(next_q_map)

        error = abs(old_val - target_q)
        self.memory.add(float(error), trans)

    def choose_action(self, state_path, EPSILON):
        """Pick an action id for the state stored at ``state_path``.

        Returns:
            ``(ac_ty, action)`` where ``ac_ty`` is '0' for the network's
            arg-max action, '1' for an action sampled from the depth mask
            and '2' for a uniformly random action id.
        """
        rgb_path = self._rgb_path(state_path)

        # Stack the unrotated view followed by every rotated view.
        state_rgb = [my_utils.trans_HWC_to_CHW(cv2.imread(rgb_path))]
        state_depth = [my_utils.copy_depth_to_3_channel(state_path)]
        for i in range(1, Train_Configs.ROTATION_BINS):
            state_rgb.append(my_utils.get_rotate_rgb(i, rgb_path))
            state_rotate_depth = my_utils.get_rotate_depth(i, state_path)
            state_depth.append(my_utils.copy_depth_to_3_channel(state_rotate_depth))

        # numpy to tensor
        state_rgb = torch.cuda.FloatTensor(np.array(state_rgb))
        state_depth = torch.cuda.FloatTensor(np.array(state_depth))

        # EPSILON (capped at 1) is the probability of taking the network's
        # own prediction; the remainder is spent on exploration.
        prob = np.min((EPSILON, 1))
        p_select = np.array([prob, 1 - prob])
        selected_ac_type = np.random.choice([0, 1], p=p_select.ravel())

        if selected_ac_type == 0:  # network-predicted action
            q_map = self.eval_net.forward(state_rgb, state_depth)
            action = my_utils.find_maxQ_in_qmap(q_map.cpu().detach().numpy())
            ac_ty = '0'
        elif np.random.rand() <= 0.5:  # sample action according to depth image
            # Fair coin: rand() is uniform on [0, 1). The previous randn()
            # drew from a standard normal, so this branch fired ~69% of the
            # time.
            action = my_utils.select_randpID_from_mask(state_path)
            ac_ty = '1'
        else:  # uniformly random action id
            action = np.random.randint(0, DIM_ACTIONS)
            ac_ty = '2'

        return ac_ty, action  # the id of action

    def plot(self, ax, x):
        """Redraw the total-reward-per-episode curve on ``ax``."""
        ax.cla()
        ax.set_xlabel("episode")
        ax.set_ylabel("total reward")
        ax.plot(x, 'b-')
        plt.pause(0.000000000000001)

    def load_batch_data(self, batch_list):
        """Decode stored transition strings into six CUDA batch tensors.

        Returns:
            ``(state_rgb, state_depth, action, reward, next_state_rgb,
            next_state_depth)``.
        """
        batch_state_rgb = []
        batch_state_depth = []
        batch_action = []
        batch_reward = []
        batch_next_state_rgb = []
        batch_next_state_depth = []

        depth_shape = (3, DIM_STATES[0], DIM_STATES[1])
        for item in batch_list:
            # state + '#' + str(action) + '#' + str(reward) + '#' + next_state
            data = item.split('#')
            action_id = int(data[1])
            # NOTE(review): the raw action id is passed where store_trans
            # passes the rotation channel c - confirm the helpers accept both.
            batch_state_rgb.append(
                my_utils.get_rotate_rgb(action_id, self._rgb_path(data[0])))
            batch_state_depth.append(my_utils.copy_depth_to_3_channel(
                my_utils.get_rotate_depth(action_id, data[0])).reshape(depth_shape))
            batch_action.append([int(data[1])])
            batch_reward.append([float(data[2])])
            batch_next_state_rgb.append(
                my_utils.get_rotate_rgb(action_id, self._rgb_path(data[3])))
            batch_next_state_depth.append(my_utils.copy_depth_to_3_channel(
                my_utils.get_rotate_depth(action_id, data[3])).reshape(depth_shape))

        batch_state_depth = np.array(batch_state_depth)
        batch_next_state_depth = np.array(batch_next_state_depth)

        return (torch.cuda.FloatTensor(batch_state_rgb),
                torch.cuda.FloatTensor(batch_state_depth),
                torch.cuda.LongTensor(batch_action),
                torch.cuda.FloatTensor(batch_reward),
                torch.cuda.FloatTensor(batch_next_state_rgb),
                torch.cuda.FloatTensor(batch_next_state_depth))

    def learn(self):
        """Run one prioritized-replay update; return ``(loss, mean q_eval)``."""
        # Hard-sync the target network every Q_NETWORK_ITERATION updates.
        if self.learn_counter % Train_Configs.Q_NETWORK_ITERATION == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_counter += 1

        mini_batch, idxs, is_weights = self.memory.sample(Train_Configs.BATCH_SIZE)
        (batch_state_rgb, batch_state_depth, batch_action, batch_reward,
         batch_next_state_rgb, batch_next_state_depth) = self.load_batch_data(mini_batch)

        # [BATCH_SIZE, 1, 224, 224] per the original note.
        eval_q_map = self.eval_net(batch_state_rgb, batch_state_depth)
        x_y_c_list = my_utils.translate_actionID_to_XY_and_channel_batch(batch_action)

        # Pick Q(s, a) out of each q-map while KEEPING the autograd graph.
        # The old code copied the values into a fresh tensor, so the loss was
        # disconnected from eval_net and no gradient reached its weights.
        q_eval = torch.stack([
            eval_q_map[i][0][xyc[0]][xyc[1]]
            for i, xyc in enumerate(x_y_c_list)
        ]).view(-1, 1)

        # max over each next-state q-map; the target is a constant.
        next_q_map = self.target_net(batch_next_state_rgb, batch_next_state_depth).detach()
        q_next = next_q_map.contiguous().view(next_q_map.size(0), -1).max(1)[0].view(-1, 1)
        # NOTE(review): stored transitions carry no done flag, so terminal
        # transitions still bootstrap here.
        q_target = batch_reward + Train_Configs.GAMMA * q_next

        weight_tensor = torch.cuda.FloatTensor(is_weights)
        weight_tensor = weight_tensor.reshape((Train_Configs.BATCH_SIZE, 1))

        loss = (weight_tensor * self.loss(q_eval, q_target)).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return float(loss), float(q_eval.mean())
class MAD4PG:
    """Multi-agent D4PG trainer that shares one prioritized replay memory.

    Each agent owns its own ``D4PG`` networks and its own n-step experience
    queue; all agents push transitions into, and sample from, a single
    ``Memory`` instance.
    """

    def __init__(self, state_size, action_size, seed, buffer_size=int(1e6),
                 batch_size=64, gamma=0.99, tau=1e-3, update_every=3,
                 num_mc_steps=5, num_agents=2):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            buffer_size (int): capacity of the shared replay memory
            batch_size (int): number of transitions sampled per update
            gamma (float): discount factor
            tau (float): interpolation factor for the soft target update
            update_every (int): learn once every `update_every` calls to step()
            num_mc_steps (int): length of the n-step return
            num_agents (int): number of agents (queues and D4PG learners)
        """
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = tau
        self.UPDATE_EVERY = update_every
        self.num_mc_steps = num_mc_steps
        self.experiences = [
            ExperienceQueue(num_mc_steps) for _ in range(num_agents)
        ]
        self.memory = Memory(buffer_size)
        self.t_step = 0
        self.train_start = batch_size
        # One learner per agent, so that `num_agents` is honoured (the list
        # used to be hard-coded to exactly two entries).
        self.mad4pg_agent = [
            D4PG(state_size, action_size, seed, device,
                 num_atoms=N_ATOMS, q_min=Vmin, q_max=Vmax)
            for _ in range(num_agents)
        ]

    def acts(self, states, add_noise=0.0):
        """Return the stacked actions of all agents, one row per agent.

        `states` is iterated in lockstep with the agents; each state gets a
        leading batch axis before being handed to that agent's `act`.
        """
        return np.vstack([
            agent.act(np.expand_dims(state, 0), add_noise)
            for state, agent in zip(states, self.mad4pg_agent)
        ])

    # borrow from https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter14
    def distr_projection(self, next_distr_v, rewards_v, dones_mask_t, gamma):
        """Project the next-state distribution onto the fixed atom support.

        Params
        ======
            next_distr_v (torch.Tensor): next-state probabilities, one row
                per transition and one column per atom
            rewards_v (torch.Tensor): rewards, one per transition
            dones_mask_t (torch.Tensor): terminal flags (non-zero == done)
            gamma (float): discount applied to the atom values

        Returns a float tensor of shape (batch, N_ATOMS) on `device`.
        """
        next_distr = next_distr_v.detach().cpu().numpy()
        rewards = rewards_v.detach().cpu().numpy()
        # The `np.bool` alias was removed in NumPy 1.24; the builtin is the
        # documented replacement.
        dones_mask = dones_mask_t.cpu().numpy().astype(bool)
        batch_size = len(rewards)
        proj_distr = np.zeros((batch_size, N_ATOMS), dtype=np.float32)
        dones_mask = np.squeeze(dones_mask)
        rewards = rewards.reshape(-1)

        for atom in range(N_ATOMS):
            tz_j = np.minimum(
                Vmax,
                np.maximum(Vmin, rewards + (Vmin + atom * DELTA_Z) * gamma))
            b_j = (tz_j - Vmin) / DELTA_Z
            lower = np.floor(b_j).astype(np.int64)
            upper = np.ceil(b_j).astype(np.int64)
            # The projected value lands exactly on an atom: move all mass there.
            eq_mask = upper == lower
            proj_distr[eq_mask, lower[eq_mask]] += next_distr[eq_mask, atom]
            # Otherwise split the mass between the two neighbouring atoms.
            ne_mask = upper != lower
            proj_distr[ne_mask, lower[ne_mask]] += (
                next_distr[ne_mask, atom] * (upper - b_j)[ne_mask])
            proj_distr[ne_mask, upper[ne_mask]] += (
                next_distr[ne_mask, atom] * (b_j - lower)[ne_mask])

        if dones_mask.any():
            # Terminal transitions do not bootstrap: their target is the
            # (clipped) reward alone.
            proj_distr[dones_mask] = 0.0
            tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones_mask]))
            b_j = (tz_j - Vmin) / DELTA_Z
            lower = np.floor(b_j).astype(np.int64)
            upper = np.ceil(b_j).astype(np.int64)
            eq_mask = upper == lower
            if dones_mask.shape == ():
                # Single-transition batch (the mask squeezed down to 0-d).
                # The old test here was `if dones_mask:`, which is always
                # true in this branch and left the interpolation unreachable.
                lo = int(lower.reshape(-1)[0])
                hi = int(upper.reshape(-1)[0])
                pos = float(b_j.reshape(-1)[0])
                if lo == hi:
                    proj_distr[0, lo] = 1.0
                else:
                    proj_distr[0, lo] = hi - pos
                    proj_distr[0, hi] = pos - lo
            else:
                eq_dones = dones_mask.copy()
                eq_dones[dones_mask] = eq_mask
                if eq_dones.any():
                    proj_distr[eq_dones, lower[eq_mask]] = 1.0
                ne_mask = upper != lower
                ne_dones = dones_mask.copy()
                ne_dones[dones_mask] = ne_mask
                if ne_dones.any():
                    proj_distr[ne_dones, lower[ne_mask]] = (upper - b_j)[ne_mask]
                    proj_distr[ne_dones, upper[ne_mask]] = (b_j - lower)[ne_mask]
        return torch.FloatTensor(proj_distr).to(device)

    def step(self, states, actions, rewards, next_states, dones):
        """Queue one environment step per agent, store n-step transitions
        with their initial priority, and learn every UPDATE_EVERY calls."""
        for agent_index, agent in enumerate(self.mad4pg_agent):
            queue = self.experiences[agent_index]
            queue.states.appendleft(states[agent_index])
            queue.rewards.appendleft(
                rewards[agent_index] * self.GAMMA**self.num_mc_steps)
            queue.actions.appendleft(actions[agent_index])

            if len(queue.rewards) == self.num_mc_steps or dones[agent_index]:
                # N-steps return: r= r1+gamma*r2+..+gamma^(t-1)*rt
                done_tensor = torch.tensor(
                    dones[agent_index]).float().to(device)
                for i in range(len(queue.rewards)):
                    queue.rewards[i] /= self.GAMMA

                # The n-step return starts at the OLDEST queued state/action.
                oldest_state = queue.states[-1]
                oldest_action = queue.actions[-1]
                state = torch.tensor(
                    oldest_state).float().unsqueeze(0).to(device)
                next_state = torch.tensor(
                    next_states[agent_index]).float().unsqueeze(0).to(device)
                action = torch.tensor(
                    oldest_action).float().unsqueeze(0).to(device)
                sum_reward = torch.tensor(
                    sum(queue.rewards)).float().unsqueeze(0).to(device)

                with evaluating(agent) as cur_agent:
                    q_logits_expected = cur_agent.critic_local(state, action)
                    action_next = cur_agent.actor_target(next_state)
                    q_target_logits_next = cur_agent.critic_target(
                        next_state, action_next)
                    q_target_distr_next = F.softmax(q_target_logits_next,
                                                    dim=1)
                    q_target_distr_next_projected = self.distr_projection(
                        q_target_distr_next, sum_reward, done_tensor,
                        self.GAMMA**self.num_mc_steps)
                    cross_entropy = -F.log_softmax(
                        q_logits_expected,
                        dim=1) * q_target_distr_next_projected
                    error = cross_entropy.sum(dim=1).mean().item()

                # Store the transition the priority was computed for. It
                # used to store the newest state/action, which does not match
                # the n-step return. The reward is kept as a host array so
                # sample() can stack it even when `device` is a GPU.
                self.memory.add(
                    error,
                    (oldest_state, oldest_action, sum_reward.cpu().numpy(),
                     next_states[agent_index], dones[agent_index]))
                queue.states.pop()
                queue.rewards.pop()
                queue.actions.pop()
                # NOTE: the former `while condition:` wrapper ran this body
                # exactly once (`False and ...`), so it has been removed.

            if dones[agent_index]:
                queue.states.clear()
                queue.rewards.clear()
                queue.actions.clear()

        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.tree.n_entries > self.train_start:
                for agent in self.mad4pg_agent:
                    sampled_experiences, idxs = self.sample()
                    self.learn(agent, sampled_experiences, idxs)

    def sample(self):
        """Draw a prioritized mini-batch.

        Returns ``((states, actions, rewards, next_states, dones), idxs)``
        with every field as a float tensor on `device`. The importance
        weights returned by the memory are not used.
        """
        # prioritized experience replay
        mini_batch, idxs, _is_weights = self.memory.sample(self.BATCH_SIZE)
        # Transpose with zip: np.array() on tuples of differently shaped
        # fields raises ValueError on NumPy >= 1.24.
        states, actions, rewards, next_states, dones = (
            np.vstack([m for m in column if m is not None])
            for column in zip(*mini_batch)
        )
        # bool to binary
        dones = dones.astype(int)
        fields = (states, actions, rewards, next_states, dones)
        batch = tuple(
            torch.from_numpy(field).float().to(device) for field in fields)
        return batch, idxs

    def learn(self, agent, experiences, idxs):
        """Update policy and value parameters using given batch of experience tuples.

        The critic minimizes the cross-entropy between its predicted return
        distribution and the projected target distribution; the actor
        maximizes the critic's expected Q-value.

        Params
        ======
            agent: the D4PG learner to update
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            idxs: replay-memory indices of the sampled transitions
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        q_logits_expected = agent.critic_local(states, actions)
        # distr_projection() converts to NumPy anyway, so the target branch
        # never needs an autograd graph.
        with torch.no_grad():
            actions_next = agent.actor_target(next_states)
            q_targets_logits_next = agent.critic_target(next_states,
                                                        actions_next)
            q_targets_distr_next = F.softmax(q_targets_logits_next, dim=1)
        q_targets_distr_projected_next = self.distr_projection(
            q_targets_distr_next, rewards, dones,
            self.GAMMA**self.num_mc_steps)
        cross_entropy = -F.log_softmax(
            q_logits_expected, dim=1) * q_targets_distr_projected_next
        critic_loss = cross_entropy.sum(dim=1).mean()

        # update priority
        errors = cross_entropy.sum(dim=1).detach().cpu().numpy()
        for idx, err in zip(idxs, errors):
            self.memory.update(idx, err)

        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        actions_pred = agent.actor_local(states)
        crt_distr_v = agent.critic_local(states, actions_pred)
        actor_loss = -agent.critic_local.distr_to_q(crt_distr_v)
        actor_loss = actor_loss.mean()
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, self.TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, self.TAU)
# Module-level training configuration. `memory_capacity`, `Memory`, `np` and
# `os` are defined outside this excerpt.
discount_factor = 0.5
mini_batch_size = 10
copy_target_net = 6  # presumably the target-network sync interval; confirm at the use site
train_iter = 30
save_freq = 5
# Output directory name encodes the mini-batch size.
save_root = "models_with_novel_422_batch_{}".format(mini_batch_size)
primitive_lr = 2.5e-4
dexnet_lr = 5e-5
# One zero-initialised array per tool, sized like that tool's replay memory.
suction_1_sampled = np.zeros(memory_capacity[0])
suction_2_sampled = np.zeros(memory_capacity[1])
gripper_sampled = np.zeros(memory_capacity[2])
#model_str = "../training/logger_010/models/10_121.pth"
model_str = "evaluate_model/600.pth"
# One replay memory per tool, each filled from a pickle file at import time.
suction_1_memory = Memory(memory_capacity[0])
suction_2_memory = Memory(memory_capacity[1])
gripper_memory = Memory(memory_capacity[2])
suction_1_memory.load_memory("suction_1_mixed_400.pkl")
suction_2_memory.load_memory("suction_2_mixed_200.pkl")
gripper_memory.load_memory("gripper_mixed_200.pkl")
#suction_1_memory.tree.reset_priority()
#suction_2_memory.tree.reset_priority()
#gripper_memory.tree.reset_priority()
compare_color = "../training/logger_009/images/color_000005.jpg"
compare_depth = "../training/logger_009/depth_data/depth_data_000005.npy"


def create_path():
    # NOTE(review): only the first statement of this function is visible in
    # this excerpt; it builds the save directory path under the current
    # working directory.
    cwd = os.getcwd() + "/" + save_root
class Agent():
    """Interacts with and learns from the environment.

    DDPG actor/critic pair (each with a target copy) trained from a
    prioritized replay memory.
    """

    def __init__(self, state_size, action_size, random_seed,
                 buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so keep the seed value itself.
        random.seed(random_seed)
        self.seed = random_seed
        self.buffer_size = buffer_size
        self.memory = Memory(
            capacity=self.buffer_size)  # internal memory using SumTree
        self.batch_size = batch_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done,
             batch_size=BATCH_SIZE, update_every=UPDATE_EVERY):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % update_every
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if self.memory.tree.n_entries >= batch_size:
                experiences, idxs, is_weights = self.sample()
                self.learn(experiences, idxs, is_weights)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        The action is clipped to [-1, 1]; exploration noise is added first
        when `add_noise` is true.
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def learn(self, experiences, idxs, is_weights, batch_size=BATCH_SIZE,
              gamma=GAMMA):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            idxs: replay-memory indices of the sampled transitions
            is_weights (np.ndarray): importance-sampling weight per transition
            batch_size (int): kept for backward compatibility; priorities
                are now updated for every sampled index
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Targets are constants for the critic update, so no graph is built
        # through the target networks.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        # Weight each squared error by its own importance weight. With the
        # default reduction, mse_loss is already a scalar, so the weights
        # only rescaled the whole loss by their mean.
        weights = torch.from_numpy(is_weights).float().to(device).view(-1, 1)
        per_sample_loss = F.mse_loss(Q_expected, Q_targets, reduction='none')
        critic_loss = (weights * per_sample_loss).mean()
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Introducing gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ------------- update priorities in prioritized replay buffer ---------- #
        # reshape(-1) keeps a 1-D array even for a single transition, where
        # squeeze() would return a 0-d array that cannot be indexed.
        errors = (Q_expected - Q_targets).detach().reshape(-1).cpu().numpy()
        for idx, err in zip(idxs, errors):
            self.memory.update(idx, err)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add(self, state, action, reward, next_state, done, gamma=GAMMA):
        """Add a new experience to memory.

        Every argument is indexed along its first axis (one entry per
        transition). The initial priority of each transition is its
        current TD error.
        """
        next_state_torch = torch.from_numpy(next_state).float().to(device)
        reward_torch = torch.unsqueeze(
            torch.from_numpy(np.array(reward)).float().to(device), 1)
        done_torch = torch.unsqueeze(
            torch.from_numpy(np.array(done).astype(
                np.uint8)).float().to(device), 1)
        state_torch = torch.from_numpy(state).float().to(device)
        action_torch = torch.from_numpy(action).float().to(device)

        # Score in eval mode, then restore the SAME three networks. The old
        # code put actor_target in eval mode but restored actor_local, so
        # actor_target stayed in eval mode.
        scoring_nets = (self.actor_target, self.critic_target,
                        self.critic_local)
        for net in scoring_nets:
            net.eval()
        with torch.no_grad():
            action_next = self.actor_target(next_state_torch)
            Q_target_next = self.critic_target(next_state_torch, action_next)
            Q_target = reward_torch + (gamma * Q_target_next *
                                       (1 - done_torch))
            Q_expected = self.critic_local(state_torch, action_torch)
        for net in scoring_nets:
            net.train()

        # Error used in prioritized replay buffer; reshape(-1) stays 1-D
        # when only one transition is added.
        error = (Q_expected - Q_target).reshape(-1).cpu().numpy()

        # Adding experiences to prioritized replay buffer
        for i in range(len(reward)):
            self.memory.add(
                error[i],
                (state[i], action[i], reward[i], next_state[i], done[i]))

    def sample(self):
        """Sample a prioritized batch of experiences from memory.

        Returns ``((states, actions, rewards, next_states, dones), idxs,
        is_weights)`` with the five fields as float tensors on `device`.
        """
        experiences, idxs, is_weights = self.memory.sample(self.batch_size)

        def column(position):
            # Stack field `position` of every sampled transition.
            return np.vstack([e[position] for e in experiences])

        states = torch.from_numpy(column(0)).float().to(device)
        actions = torch.from_numpy(column(1)).float().to(device)
        rewards = torch.from_numpy(column(2)).float().to(device)
        next_states = torch.from_numpy(column(3)).float().to(device)
        dones = torch.from_numpy(
            column(4).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states,
                dones), idxs, is_weights
# Hyper-parameters and run switches. `update_frequency`,
# `exploration_annealing_frames` and `gamma`, used below, are defined
# outside this excerpt.
lr = 0.00001
C = 10000  # rescaled by update_frequency further down
clip_norm = 1
replay_buffer_size = 100000
DOUBLE_DQN = True
load_weights = False
average_proportion = 0.01
OPT = Adam
Hindsight_experience_replay = True

# Variables
env = GoalEnvironment(random_GOAL = True)
#env = gym.make('Acrobot-v1')
#env = GridGoalEnvironment(n = 10,shuffle_goal = True, discrete_goal = True, stochastic = True, holonomic = False)
replay_buffer = Memory(replay_buffer_size)
#replay_buffer = list()
timestep = 0
episode_timestep_history = list()
reward_history = list()
C = C *update_frequency
# Per-frame decrement; presumably anneals exploration linearly — confirm at the use site.
exploration_linear_decay = 1/exploration_annealing_frames
running_reward = 0
scaler = StandardScaler()
# Equals the discounted sum of a constant reward of -1 over an infinite horizon.
min_reward = - 1/(1-gamma)
temporal_diff_mean = 0
update_count = 0

# Model definition and cloning
Q_net = Sequential()
Q_net.add(Dense(64, input_shape = env.observation_space.shape, activation = 'relu'))
class DQNAgent():
    """DQN agent with a target network and prioritized experience replay."""

    def __init__(self, state_size, action_size):
        """Build the networks, optimizer, scheduler and replay memory.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): number of discrete actions
        """
        self.render = False
        self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.lr_step_size = 10
        self.lr_gamma = 0.9
        self.memory_size = 2**15
        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.explore_step = 1000
        self.epsilon_decay = 0.99995
        self.batch_size = 64
        self.train_start = 10000

        # create prioritized replay memory using SumTree
        self.memory = Memory(self.memory_size)

        # create main model and target model
        self.model = DQN(state_size, action_size)
        self.model.apply(self.weights_init)
        if self.load_model:
            # Load BEFORE creating the optimizer and syncing the target, so
            # both refer to the loaded parameters rather than the discarded
            # freshly initialised model.
            # NOTE(review): torch.load unpickles the file; only load
            # checkpoints from a trusted source.
            self.model = torch.load('save_model/per_dqn')
        self.model.train()
        self.target_model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)
        self.scheduler = StepLR(self.optimizer,
                                step_size=self.lr_step_size,
                                gamma=self.lr_gamma)

        # initialize target model
        self.update_target_model()

    # weight xavier initialize
    def weights_init(self, m):
        """Apply Xavier-uniform initialisation to every Linear layer."""
        classname = m.__class__.__name__
        if classname.find('Linear') != -1:
            torch.nn.init.xavier_uniform_(m.weight)

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.from_numpy(state).float()
        with torch.no_grad():
            q_value = self.model(state)
        _, action = torch.max(q_value, 1)
        return int(action)

    # save sample (error,<s,a,r,s'>) to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition with its absolute TD error as priority."""
        with torch.no_grad():
            # .item() copies the value out. The old code kept a 0-d *view*
            # of the Q-tensor and then overwrote that entry in place, so
            # the stored error was always 0.
            old_val = self.model(
                torch.as_tensor(state).float())[0][action].item()
            if done:
                new_val = reward
            else:
                next_q = self.target_model(
                    torch.as_tensor(next_state).float())
                new_val = reward + \
                    self.discount_factor * torch.max(next_q).item()

        error = abs(old_val - new_val)
        self.memory.add(error, (state, action, reward, next_state, done))

    # pick samples from prioritized replay memory (with batch_size)
    def train_model(self):
        """Run one optimisation step on a prioritized mini-batch.

        Decays epsilon, refreshes the priorities of the sampled
        transitions and returns the importance-weighted MSE loss (float).
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.epsilon, self.epsilon_min)

        mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)
        # Transpose with zip: np.array() on tuples of differently shaped
        # fields raises ValueError on NumPy >= 1.24.
        states, actions, rewards, next_states, dones = zip(*mini_batch)

        states = torch.tensor(np.vstack(states)).float()
        next_states = torch.tensor(np.vstack(next_states)).float()
        actions = torch.tensor(np.asarray(actions, dtype=np.int64)).view(-1, 1)
        rewards = torch.tensor(np.asarray(rewards, dtype=np.float32))
        # bool to binary
        dones = torch.tensor(np.asarray(dones, dtype=np.float32))

        # Q function of current state, restricted to the action taken
        pred = self.model(states).gather(1, actions).squeeze(1)

        # Q Learning: get maximum Q value at s' from target model
        next_pred = self.target_model(next_states).detach()
        target = rewards + (1 - dones) * \
            self.discount_factor * next_pred.max(1)[0]

        errors = torch.abs(pred - target).detach().numpy()

        # update priority
        for idx, err in zip(idxs, errors):
            self.memory.update(idx, err)

        self.optimizer.zero_grad()

        # Importance-weighted MSE: weight each squared error separately.
        # With the default reduction, mse_loss is already a scalar, so the
        # weights only rescaled the whole loss by their mean.
        weights = torch.as_tensor(is_weights).float()
        loss = (weights * F.mse_loss(pred, target, reduction='none')).mean()
        loss.backward()

        # and train
        self.optimizer.step()
        return loss.item()
def __init__(self, state_size, action_size, seed, BUFFER_SIZE=int(1e5),
             BATCH_SIZE=64, GAMMA=0.99, TAU=1e-3, LR_ACTOR=1e-4,
             LR_CRITIC=3e-4, WEIGHT_DECAY=0.0001, UPDATE_EVERY=4, IsPR=False,
             N_step=1, IsD4PG_Cat=False):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        BUFFER_SIZE (int): replay memory capacity
        BATCH_SIZE (int): size of each training batch
        GAMMA (float): discount factor
        TAU (float): soft-update interpolation parameter
        LR_ACTOR (float): actor learning rate
        LR_CRITIC (float): critic learning rate
        WEIGHT_DECAY (float): weight decay of the critic optimizer
        UPDATE_EVERY (int): stored on the instance; its use is outside this method
        IsPR (bool): use the prioritized `Memory` instead of `ReplayBuffer`
        N_step (int): maximum length of the reward/state queues
        IsD4PG_Cat (bool): use the `CriticD4PG` critic instead of `Critic`
    """
    self.BUFFER_SIZE = BUFFER_SIZE
    self.BATCH_SIZE = BATCH_SIZE
    self.GAMMA = GAMMA
    self.TAU = TAU
    self.state_size = state_size
    self.action_size = action_size
    # random.seed() returns None. Storing its result meant ReplayBuffer and
    # OUNoise below received None instead of the requested seed.
    random.seed(seed)
    self.seed = seed
    self.UPDATE_EVERY = UPDATE_EVERY
    self.N_step = N_step
    self.IsD4PG_Cat = IsD4PG_Cat
    self.rewards_queue = deque(maxlen=N_step)
    self.states_queue = deque(maxlen=N_step)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    if IsD4PG_Cat:
        self.critic_local = CriticD4PG(state_size, action_size, seed,
                                       n_atoms=N_ATOMS, v_min=Vmin,
                                       v_max=Vmax).to(device)
        self.critic_target = CriticD4PG(state_size, action_size, seed,
                                        n_atoms=N_ATOMS, v_min=Vmin,
                                        v_max=Vmax).to(device)
    else:
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Replay memory
    self.IsPR = IsPR
    if IsPR:
        self.memory = Memory(BUFFER_SIZE)  # prioritized experience replay
    else:
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   self.seed)

    # Noise process
    self.noise = OUNoise(action_size, self.seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.train_start = 2000
from prioritized_memory import Memory
import numpy as np
import cv2
from utils import Transition

R = 5.0  # reward unit; the check below compares rewards against -3 * R

# Source memories (capacity 1000), filled from the pickles loaded below.
m1 = Memory(1000)
m2 = Memory(1000)
m3 = Memory(1000)
# Destination memories with a larger capacity (1500).
M1 = Memory(1500)
M2 = Memory(1500)
M3 = Memory(1500)
m1.load_memory("../training/logger_013/suction_1_memory.pkl")
m2.load_memory("../training/logger_013/suction_2_memory.pkl")
m3.load_memory("../training/logger_013/gripper_memory.pkl")
empty_color = []
empty_depth = []
# Copy entries into the larger memories.
# NOTE(review): all three copies are bounded by m1.length — confirm that m2
# and m3 hold at least as many entries.
for i in range(m1.length):
    M1.add(m1.tree.data[i])
    M2.add(m2.tree.data[i])
    M3.add(m3.tree.data[i])
# NOTE(review): this loop is cut off at the end of the visible excerpt.
for i in range(m1.length):
    # Invalid point is common
    if m1.tree.data[i].reward == -3 * R:
        transition = m1.tree.data[i]
        pixel_index = transition.pixel_idx