class AgentSep1D(Agent):
    """Two independent DQN agents (separate networks, optimizers and replay
    memories) that exchange a learned communication vector."""

    def __init__(self, name, pars, nrenvs=1, job=None, experiment=None):
        Agent.__init__(self, name, pars, nrenvs, job, experiment)

    def build(self):
        # Agent 1: online and target network.
        self.policy_net1 = DQN(71, self.pars).to(self.device)
        self.target_net1 = DQN(71, self.pars).to(self.device)
        self.target_net1.load_state_dict(self.policy_net1.state_dict())
        self.target_net1.eval()
        # Agent 2: online and target network.
        self.policy_net2 = DQN(71, self.pars).to(self.device)
        self.target_net2 = DQN(71, self.pars).to(self.device)
        self.target_net2.load_state_dict(self.policy_net2.state_dict())
        self.target_net2.eval()

        # As written, agent 1 trains with SGD and agent 2 with Adam; the
        # alternative choices are kept commented out.
        self.optimizer1 = optim.SGD(self.policy_net1.parameters(),
                                    lr=self.pars['lr'],
                                    momentum=self.pars['momentum'])
        # self.optimizer2 = optim.SGD(self.policy_net2.parameters(),
        #                             lr=self.pars['lr'], momentum=self.pars['momentum'])
        # self.optimizer1 = optim.Adam(self.policy_net1.parameters())
        self.optimizer2 = optim.Adam(self.policy_net2.parameters())

        self.memory2 = ReplayMemory(10000)
        self.memory1 = ReplayMemory(10000)

    def getaction(self, state1, state2, test=False):
        # Zero message used when no communication is sampled.
        mes = torch.tensor([[0, 0, 0, 0]], device=self.device)
        # With probability self.prob a communication vector is computed from
        # the partner's state; otherwise the zero message is used.
        comm2 = self.policy_net1(state2, 0, mes)[self.idC].detach() \
            if np.random.rand() < self.prob else mes
        comm1 = self.policy_net2(state1, 0, mes)[self.idC].detach() \
            if np.random.rand() < self.prob else mes
        if test:
            # Greedy actions at test time.
            action1 = self.policy_net1(state1, 1, comm2)[0].max(1)[1].view(1, 1)
            action2 = self.policy_net2(state2, 1, comm1)[0].max(1)[1].view(1, 1)
        else:
            # Epsilon-greedy actions during training.
            action1 = self.select_action(state1, comm2, self.policy_net1)
            action2 = self.select_action(state2, comm1, self.policy_net2)
        return action1, action2, [comm1, comm2]

    def getStates(self, env):
        # Both agents observe the same 1D rendering of the environment.
        screen1 = env.render_env_1d()  # .transpose((2, 0, 1))
        return (torch.from_numpy(screen1).unsqueeze(0).to(self.device),
                torch.from_numpy(screen1).unsqueeze(0).to(self.device))

    def saveStates(self, state1, state2, action1, action2,
                   next_state1, next_state2, reward1, reward2, env_id):
        self.capmem += 2
        if self.pars['ppe'] != '1':
            # Plain replay memory; each transition also stores the partner's state.
            self.memory2.push(state2, action2, next_state2, reward2, state1)
            self.memory1.push(state1, action1, next_state1, reward1, state2)
        else:
            # Prioritized replay variant.
            self.memory1.store([state1, action1, next_state1, reward1, state2])
            self.memory2.store([state2, action2, next_state2, reward2, state1])
        # self.memory2.push(state2, action2, next_state2, reward2, state1)
        # self.memory1.push(state1, action1, next_state1, reward1, state2)

    def optimize(self):
        self.optimize_model(self.policy_net1, self.target_net1, self.memory1, self.optimizer1)
        self.optimize_model(self.policy_net2, self.target_net2, self.memory2, self.optimizer2)

    def updateTarget(self, i_episode, step=False):
        # soft_update(self.target_net, self.policy_net, tau=0.01)
        if step:
            return
        if i_episode % self.TARGET_UPDATE == 0:
            self.target_net1.load_state_dict(self.policy_net1.state_dict())
            self.target_net2.load_state_dict(self.policy_net2.state_dict())

    def save(self):
        torch.save(self.policy_net1.state_dict(),
                   self.pars['results_path'] + self.name + '/model1')
        torch.save(self.policy_net2.state_dict(),
                   self.pars['results_path'] + self.name + '/model2')

    def perturb_learning_rate(self, i_episode, nolast=True):
        # Population-based-training style exploration of the hyper-parameters.
        if nolast:
            new_lr_factor = 10 ** np.random.normal(scale=1.0)
            new_momentum_delta = np.random.normal(scale=0.1)
            self.EPS_DECAY += np.random.normal(scale=50.0)
            if self.EPS_DECAY < 50:
                self.EPS_DECAY = 50
            if self.prob >= 0:
                self.prob += np.random.normal(scale=0.05) - 0.025
                self.prob = min(max(0, self.prob), 1)
        for param_group in self.optimizer1.param_groups:
            if nolast:
                param_group['lr'] *= new_lr_factor
                param_group['momentum'] += new_momentum_delta
            self.momentum1 = param_group['momentum']
            self.lr1 = param_group['lr']
        if nolast:
            new_lr_factor = 10 ** np.random.normal(scale=1.0)
            new_momentum_delta = np.random.normal(scale=0.1)
        # Note: this loop assumes an SGD-style optimizer; optimizer2 is Adam in
        # build(), and Adam parameter groups have no 'momentum' key.
        for param_group in self.optimizer2.param_groups:
            if nolast:
                param_group['lr'] *= new_lr_factor
                param_group['momentum'] += new_momentum_delta
            self.momentum2 = param_group['momentum']
            self.lr2 = param_group['lr']
        # Log the current hyper-parameters for this episode.
        with open(os.path.join(self.pars['results_path'] + self.name,
                               'hyper-{}.json').format(i_episode), 'w') as outfile:
            json.dump({'lr1': self.lr1, 'momentum1': self.momentum1,
                       'lr2': self.lr2, 'momentum2': self.momentum2,
                       'eps_decay': self.EPS_DECAY, 'prob': self.prob,
                       'i_episode': i_episode}, outfile)

    def clone(self, agent):
        # Copy weights, optimizer state and schedule parameters from another agent.
        self.policy_net1.load_state_dict(agent.policy_net1.state_dict())
        self.optimizer1.load_state_dict(agent.optimizer1.state_dict())
        self.policy_net2.load_state_dict(agent.policy_net2.state_dict())
        self.optimizer2.load_state_dict(agent.optimizer2.state_dict())
        self.target_net1.load_state_dict(self.policy_net1.state_dict())
        self.target_net2.load_state_dict(self.policy_net2.state_dict())
        self.EPS_DECAY = agent.EPS_DECAY
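# --- Usage sketch (illustrative, not part of the original training code) ---
# A minimal loop showing how the AgentSep1D API above could be driven. The
# environment's step() signature, the per-agent reward/done return values, the
# episode and step counts, and the function name are all assumptions made only
# for this sketch.
def _example_sep1d_training_loop(agent, env, num_episodes=10, max_steps=200):
    for i_episode in range(num_episodes):
        state1, state2 = agent.getStates(env)
        for _ in range(max_steps):
            action1, action2, _comm = agent.getaction(state1, state2)
            reward1, reward2, done = env.step(action1.item(), action2.item())  # assumed API
            next_state1, next_state2 = agent.getStates(env)
            agent.saveStates(state1, state2, action1, action2,
                             next_state1, next_state2, reward1, reward2, env_id=0)
            agent.optimize()
            state1, state2 = next_state1, next_state2
            if done:
                break
        agent.updateTarget(i_episode)
    agent.save()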
class AgentACShare1D(Agent):
    """Actor-critic agent in which both players share a single policy network,
    with a Q-network/target pair as critic and learned communication."""

    def __init__(self, name, pars, nrenvs=1, job=None, experiment=None):
        Agent.__init__(self, name, pars, nrenvs, job, experiment)

    def build(self):
        self.policy_net = DQN(71, self.pars).to(self.device)
        self.q_net = DQN(71, self.pars).to(self.device)
        self.target_net = DQN(71, self.pars).to(self.device)
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.target_net.eval()
        if self.pars['momentum'] > 0:
            self.optimizer = optim.SGD(self.q_net.parameters(),
                                       lr=self.pars['lr'],
                                       momentum=self.pars['momentum'])
            self.policy_optimizer = optim.SGD(self.policy_net.parameters(),
                                              lr=self.pars['lr'],
                                              momentum=self.pars['momentum'])
        else:
            self.optimizer = optim.Adam(self.q_net.parameters())
            self.policy_optimizer = optim.Adam(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.eps_threshold = 0.01  # entropy bonus weight
        # One on-policy buffer per (environment, player) pair.
        self.bufs = [[] for _ in range(len(self.envs) * 2)]

    def updateTarget(self, i_episode, step=False):
        # soft_update(self.target_net, self.policy_net, tau=0.01)
        if step:
            return
        self.optimize_policy(self.policy_net, self.bufs, self.policy_optimizer)
        if i_episode % self.TARGET_UPDATE == 0:
            self.target_net.load_state_dict(self.q_net.state_dict())
            self.eps_threshold -= 0.001

    def saveStates(self, state1, state2, action1, action2,
                   next_state1, next_state2, reward1, reward2, env_id):
        logp1, ent1, logp2, ent2 = self.rem
        if self.pars['ppe'] != '1':
            self.memory.push(state2, action2, next_state2, reward2, state1)
            self.memory.push(state1, action1, next_state1, reward1, state2)
        else:
            self.memory.store([state1, action1, next_state1, reward1, state2])
            self.memory.store([state2, action2, next_state2, reward2, state1])
        # self.buf2.append([state2, action2, 1, reward2, logp2, ent2])
        # self.buf1.append([state1, action1, 1, reward1, logp1, ent1])
        self.bufs[2 * env_id].append([state2, action2, 1, reward2, logp2, ent2])
        self.bufs[2 * env_id + 1].append([state1, action1, 1, reward1, logp1, ent1])

    def select_action(self, state, comm, policy_net):
        # Sample from the categorical policy; keep log-prob and entropy for the
        # policy-gradient update.
        probs1, _ = policy_net(state, 1, comm)  # .cpu().data.numpy()
        m = Categorical(logits=probs1)
        action = m.sample()
        return action.view(1, 1), m.log_prob(action), m.entropy()

    def getComm(self, mes, policy_net, state1_batch):
        return self.policy_net(state1_batch, 1, mes)[self.idC].detach() \
            if np.random.rand() < self.prob else mes

    def getaction(self, state1, state2, test=False):
        mes = torch.tensor([[0, 0, 0, 0]], device=self.device)  # maybe error
        comm2 = self.policy_net(state2, 0, mes)[self.idC] \
            if (test and 0 < self.prob) or np.random.rand() < self.prob else mes
        comm1 = self.policy_net(state1, 0, mes)[self.idC] \
            if (test and 0 < self.prob) or np.random.rand() < self.prob else mes
        action1, logp1, ent1 = self.select_action(state1, comm2, self.policy_net)
        action2, logp2, ent2 = self.select_action(state2, comm1, self.policy_net)
        self.rem = [logp1, ent1, logp2, ent2]
        return action1, action2, [comm1, comm2]

    def optimize_policy(self, policy_net, memories, optimizer):
        policy_loss = 0
        value_loss = 0
        ent = 0
        for memory in memories:  # [memory1, memory2]
            R = torch.zeros(1, 1, device=self.device)
            # GAE = torch.zeros(1, 1, device=self.device)
            saved_r = torch.cat([c[3].float() for c in memory])
            states = torch.cat([c[0].float() for c in memory])
            action_batch = torch.cat([c[1].float() for c in memory]).view(-1, 1)
            mes = torch.tensor([[0, 0, 0, 0] for _ in memory], device=self.device)
            # Q-values of the taken actions serve as the baseline.
            actionV = self.q_net(states, 0, mes)[0].gather(1, action_batch.long())
            mu = saved_r.mean()
            std = saved_r.std()
            eps = 0.000001
            # print(memory)
            for i in reversed(range(len(memory) - 1)):
                _, _, _, r, log_prob, entr = memory[i]
                ac = (actionV[i] - mu) / (std + eps)  # actionV[i]; also use mu and std
                # Discounted sum of future rewards plus the reward for this state.
                R = self.GAMMA * R + (r.float() - mu) / (std + eps)
                advantage = R - ac
                policy_loss += -log_prob * advantage.detach()
                # ent += entr  # *0
        optimizer.zero_grad()
        (policy_loss.mean() + self.eps_threshold * ent).backward()
        for param in policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        optimizer.step()

    def save(self):
        torch.save(self.policy_net.state_dict(),
                   self.pars['results_path'] + self.name + '/model')
        torch.save(self.q_net.state_dict(),
                   self.pars['results_path'] + self.name + '/modelQ')

    def load(self, PATH):
        # torch.cuda.is_available()
        map_location = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.policy_net.load_state_dict(torch.load(PATH, map_location=map_location))
        self.q_net.load_state_dict(torch.load(PATH + 'Q', map_location=map_location))
        self.target_net.load_state_dict(self.q_net.state_dict())

    def optimize(self):
        self.optimize_model(self.q_net, self.target_net, self.memory, self.optimizer)

    def perturb_learning_rate(self, i_episode, nolast=True):
        # Population-based-training style exploration of the hyper-parameters.
        if nolast:
            new_lr_factor = 10 ** np.random.normal(scale=1.0)
            new_momentum_delta = np.random.normal(scale=0.1)
            self.eps_threshold += np.random.normal(scale=0.1)
            self.alpha += np.random.normal(scale=0.1)
            if self.alpha > 1:
                self.alpha = 1
            if self.alpha < 0.5:
                self.alpha = 0.5
            if self.eps_threshold < 0:
                self.eps_threshold = 0.00001
            self.EPS_DECAY += np.random.normal(scale=50.0)
            if self.EPS_DECAY < 50:
                self.EPS_DECAY = 50
            if self.prob >= 0:
                self.prob += np.random.normal(scale=0.05) - 0.025
                self.prob = min(max(0, self.prob), 1)
        # Note: updating 'momentum' assumes the SGD optimizers from build();
        # Adam parameter groups have no 'momentum' key.
        for param_group in self.optimizer.param_groups:
            if nolast:
                param_group['lr'] *= new_lr_factor
                param_group['momentum'] += new_momentum_delta
            self.momentum = param_group['momentum']
            self.lr = param_group['lr']
        if nolast:
            new_lr_factor = 10 ** np.random.normal(scale=1.0)
            new_momentum_delta = np.random.normal(scale=0.1)
        for param_group in self.policy_optimizer.param_groups:
            if nolast:
                param_group['lr'] *= new_lr_factor
                param_group['momentum'] += new_momentum_delta
            self.momentum1 = param_group['momentum']
            self.lr1 = param_group['lr']
        # Log the current hyper-parameters for this episode.
        with open(os.path.join(self.pars['results_path'] + self.name,
                               'hyper-{}.json').format(i_episode), 'w') as outfile:
            json.dump({'lr': self.lr, 'momentum': self.momentum, 'alpha': self.alpha,
                       'lr1': self.lr1, 'momentum1': self.momentum1,
                       'eps_decay': self.EPS_DECAY, 'eps_entropy': self.eps_threshold,
                       'prob': self.prob, 'i_episode': i_episode}, outfile)

    def clone(self, agent):
        # Copy weights, optimizer state and schedule parameters from another agent.
        self.policy_net.load_state_dict(agent.policy_net.state_dict())
        self.policy_optimizer.load_state_dict(agent.policy_optimizer.state_dict())
        self.alpha = agent.alpha
        self.q_net.load_state_dict(agent.q_net.state_dict())
        self.optimizer.load_state_dict(agent.optimizer.state_dict())
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.EPS_DECAY = agent.EPS_DECAY
        self.eps_threshold = agent.eps_threshold
        self.prob = agent.prob
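# --- Usage sketch (illustrative, not part of the original training code) ---
# Both agent classes expose clone() and perturb_learning_rate(), which is the
# shape of a population-based-training step: the weakest member copies the
# strongest and then jitters its hyper-parameters (logged to hyper-<episode>.json).
# The population list, the externally computed scores and the function name are
# assumptions made only for this sketch.
def _example_pbt_step(population, scores, i_episode):
    # Rank agents by an externally computed score (higher is better).
    ranked = sorted(range(len(population)), key=lambda i: scores[i])
    worst, best = ranked[0], ranked[-1]
    # Exploit: copy networks, optimizer state and schedule parameters.
    population[worst].clone(population[best])
    # Explore: perturb lr, momentum, epsilon decay and communication probability.
    population[worst].perturb_learning_rate(i_episode)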