Code example #1
    def train(self, total_step_counter, iter_fit=32):
        losses = []

        for i in range(iter_fit):
            data = self.buffer.sample(batch_size=self._config['batch_size'])
            s = torch.FloatTensor(np.stack(data[:, 0])).to(self.device)

            s_next = torch.FloatTensor(np.stack(data[:, 3])).to(self.device)
            a = torch.FloatTensor(np.stack(data[:, 1])).to(self.device)

            rew = torch.FloatTensor(np.stack(data[:, 2])).to(self.device)

            done = torch.FloatTensor(np.stack(data[:, 4])).to(self.device)  # done flag

            # TD3 target policy smoothing: clipped Gaussian noise on the target action
            noise = (torch.randn_like(a) * self._config['noise']).clamp(
                -self._config['noise_clip'], self._config['noise_clip'])
            a_next = (self.actor_target(s_next).to(self.device) + noise).clamp(
                -1, 1)

            Q1_target, Q2_target = self.critics_target(s_next, a_next)
            # clipped double-Q: use the smaller of the two target estimates
            target_Q = torch.min(Q1_target, Q2_target).squeeze(dim=1).detach()

            # Bellman target
            targets = rew + self._config['discount'] * target_Q * (1.0 - done)

            # optimize critic
            Q1_current, Q2_current = self.critics(s, a)
            Q1_current = Q1_current.squeeze(dim=1)
            Q2_current = Q2_current.squeeze(dim=1)
            critic_loss = F.mse_loss(Q1_current, targets) + F.mse_loss(
                Q2_current, targets)

            losses.append(critic_loss.item())
            self.critics.optimizer.zero_grad()
            critic_loss.backward()
            self.critics.optimizer.step()

            if ((total_step_counter - 1) * iter_fit + i +
                    1) % self._config['update_target_every'] == 0:
                # optimize actor
                actions = self.actor.forward(s)
                actor_loss = -self.critics.Q1(s, actions).mean()
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                self.actor.optimizer.step()
                # soft-update the target networks

                soft_update(self.critics_target, self.critics, self._tau)
                soft_update(self.actor_target, self.actor, self._tau)

        return losses
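Note: every example in this listing calls a soft_update helper whose definition is not shown. A minimal sketch of such a helper, assuming the soft_update(target, source, tau) Polyak-averaging signature implied by the call sites (an assumption, not the projects' actual code):

import torch


def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float) -> None:
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    # Sketch only; the argument order (target first, source second) matches the calls above.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.mul_(1.0 - tau).add_(tau * s_param.data)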
Code example #2
    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target)

        qf1, qf2 = self.critic(state_batch, action_batch)  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(qf1, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(qf2, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]

        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean() # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        # update both Q-networks with a single backward/step
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone() # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha) # For TensorboardX logs


        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()
Code example #3
    def train(self, total_step_counter, iter_fit=32):
        losses = []

        for i in range(iter_fit):
            data = self.buffer.sample(batch_size=self._config['batch_size'])
            s = torch.FloatTensor(np.stack(data[:, 0])).to(self.device)

            s_next = torch.FloatTensor(np.stack(data[:, 3])).to(self.device)
            a = torch.FloatTensor(np.stack(data[:, 1])).to(self.device)

            rew = torch.FloatTensor(np.stack(data[:, 2])).to(self.device)

            done = torch.FloatTensor(np.stack(data[:, 4])).to(self.device)  # done flag

            Q_current = self.critic(s, a).squeeze(dim=1)
            a_next = self.actor_target.forward(s_next)
            Q_next = self.critic_target.forward(
                s_next, a_next).squeeze(dim=1).detach()
            # Bellman target
            targets = rew + self._config['discount'] * Q_next * (1.0 - done)

            # optimize critic
            critic_loss = self.critic.loss(Q_current.float(), targets.float())
            losses.append(critic_loss.item())
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()
            actions = self.actor.forward(s)
            actor_loss = -self.critic.forward(s, actions).mean()
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            # periodically soft-update the target networks
            if total_step_counter % self._config['update_target_every'] == 0:
                soft_update(self.critic_target, self.critic, self._tau)
                soft_update(self.actor_target, self.actor, self._tau)

        return losses
Code example #4
File: agent.py, Project: bwosh/DRL_ContinuousControl
    def step(self, state, action, reward, next_state, done):
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        self.step_idx += 1
        is_learn_iteration = (self.step_idx % self.opts.learn_every) == 0
        is_update_iteration = (self.step_idx % self.opts.update_every) == 0

        if len(self.memory) > self.opts.batch_size:
            if is_learn_iteration:
                experiences = self.memory.sample()
                self.learn(experiences, self.opts.gamma)

            if is_update_iteration:
                soft_update(self.critic_local, self.critic_target,
                            self.opts.tau)
                soft_update(self.actor_local, self.actor_target, self.opts.tau)
Code example #5
    def train(self,
              model,
              env,
              memory,
              optimizer,
              logger=None,
              preprocess=T.ToTensor(),
              path=None,
              frompath=None,
              num_episodes=1000,
              epsend=0.05,
              epsstart=0.9,
              epsdecay=200,
              k=4,
              strategy='future',
              singlegoal=False):
        try:
            episode_durations = []
            episode_reward = []
            episode_loss = []
            global rendering
            global use_cuda
            global MAX_STEPS
            # exploration step counter
            steps_done = [0]

            #Double Network initialization :
            savemodel(model, path + '.save')
            model_ = copy.deepcopy(model)
            hard_update(model_, model)
            model_.eval()

            if use_cuda:
                model_ = model_.cuda()

            self.accumulateMemory(memory,
                                  env,
                                  model,
                                  preprocess,
                                  epsstart=0.5,
                                  epsend=0.3,
                                  epsdecay=200,
                                  k=k,
                                  strategy=strategy)

            for i in range(num_episodes):
                bashlogger.info('Episode : {} : memory : {}/{}'.format(
                    i, len(memory), memory.capacity))
                cumul_reward = 0.0
                last_screen = get_screen_reset(env, preprocess=preprocess)
                current_screen, reward, done, info = get_screen(
                    env, env.action_space.sample(), preprocess=preprocess)
                state = current_screen - last_screen

                episode_buffer = []
                meanfreq = 0
                episode_loss_buffer = []

                #HER : sample initial goal :
                if not singlegoal:
                    init_goal = sample_init_goal(memory)
                else:
                    init_goal = torch.zeros(current_screen.size())

                showcount = 0

                for t in count():
                    #model.eval()

                    #HER :
                    stategoal = torch.cat([state, init_goal], dim=1)

                    #action = select_action(model,state,steps_done=steps_done,epsend=epsend,epsstart=epsstart,epsdecay=epsdecay)
                    action = select_action(model,
                                           stategoal,
                                           steps_done=steps_done,
                                           epsend=epsend,
                                           epsstart=epsstart,
                                           epsdecay=epsdecay)

                    last_screen = current_screen
                    current_screen, reward, done, info = get_screen(
                        env, action[0, 0], preprocess=preprocess)
                    cumul_reward += reward

                    if rendering:
                        if showcount >= 10:
                            showcount = 0
                            render(current_screen)  #env.render()
                        else:
                            showcount += 1

                    if not done:
                        next_state = current_screen - last_screen
                    else:
                        next_state = torch.zeros(current_screen.size())

                    episode_buffer.append(
                        EXP(state, action, next_state, reward, done))

                    state = next_state

                    # OPTIMIZE MODEL :
                    since = time.time()
                    lossnp = self.optimize_model(model, model_, memory,
                                                 optimizer)
                    if lossnp is not None:
                        episode_loss_buffer.append(np.mean(lossnp))
                    else:
                        episode_loss_buffer.append(0)

                    # SOFT UPDATE :
                    soft_update(model_, model, self.TAU)

                    elt = time.time() - since
                    f = 1.0 / elt
                    meanfreq = (meanfreq * (t + 1) + f) / (t + 2)

                    if done or t > MAX_STEPS:
                        self.from_worker2model()
                        '''
						nbrTrain = 200
						for it in range(nbrTrain) :
							since = time.time()
							lossnp = optimize_model(model,model_,memory,optimizer)
							if lossnp is not None :
								episode_loss_buffer.append(  np.mean(lossnp) )
							else :
								episode_loss_buffer.append(0)
							
							elt = time.time() - since
							f = 1.0/elt
							meanfreq = (meanfreq*(it+1) + f)/(it+2)
							#print('{} Hz ; {} seconds.'.format(f,elt) )
						'''
                        episode_durations.append(t + 1)
                        episode_reward.append(cumul_reward)
                        meanloss = np.mean(episode_loss_buffer)
                        episode_loss.append(meanloss)

                        log = 'Episode duration : {}'.format(
                            t + 1
                        ) + '---' + ' Reward : {} // Mean Loss : {}'.format(
                            cumul_reward,
                            meanloss) + '---' + ' {}Hz'.format(meanfreq)
                        bashlogger.info(log)
                        if logger is not None:
                            new = {
                                'episodes': [i],
                                'duration': [t + 1],
                                'reward': [cumul_reward],
                                'mean frequency': [meanfreq],
                                'loss': [meanloss]
                            }
                            logger.append(new)

                        if path is not None:
                            # SAVE THE MAIN MODEL :
                            self.model.lock()
                            savemodel(self.model, path + '.save')
                            self.model.unlock()
                            bashlogger.info('Model saved : {}'.format(path))
                        #plot_durations()
                        break

                #Let us add this episode_buffer to the replayBuffer :
                for itexp in range(len(episode_buffer)):
                    el = episode_buffer[itexp]
                    #HER : reward with init_goal
                    HERreward = reward_function(el.state, init_goal)
                    reward = HERreward + el.reward

                    #store this transition with init_goal :
                    init_el = EXP(
                        state=torch.cat([el.state, init_goal], dim=1),
                        action=el.action,
                        next_state=torch.cat([el.next_state, init_goal],
                                             dim=1),
                        reward=reward,
                        done=el.done)

                    init_priority = memory.priority(
                        torch.abs(init_el.reward).numpy())

                    memory.add(init_el, init_priority)

                    #store for multiple goals :
                    #1: sample new goal :
                    goals = []
                    for j in range(k):
                        goal = None
                        if strategy == 'final':
                            goal = sample_goal(episode_buffer,
                                               strategy=strategy)
                        elif strategy == 'future':
                            # watch out for the empty set...
                            index = min(len(episode_buffer) - 3, itexp)
                            goal = sample_goal(episode_buffer, strategy=index)
                        goals.append(goal)

                    #For each goal ...
                    for goal in goals:
                        #2: .. compute reward :
                        goalreward = reward_function(el.state,
                                                     goal) + el.reward
                        #3: ... store this transition with goal :
                        goalel = EXP(state=torch.cat([el.state, goal], dim=1),
                                     action=el.action,
                                     next_state=torch.cat(
                                         [el.next_state, goal], dim=1),
                                     reward=goalreward,
                                     done=el.done)

                        init_priority = memory.priority(
                            torch.abs(goalel.reward).numpy())
                        memory.add(goalel, init_priority)

                    del el
                    del goals

                del episode_buffer

            bashlogger.info('Complete')
            if path is not None:
                savemodel(model, path + '.save')
                bashlogger.info('Model saved : {}'.format(path))

            env.close()
        except Exception as e:
            bashlogger.exception(e)
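Note: examples #5, #6, and #8 also call a hard_update helper that is not defined in the snippets. A minimal sketch, assuming it simply copies the source parameters into the target network (the tau = 1 special case of soft_update):

import torch


def hard_update(target: torch.nn.Module, source: torch.nn.Module) -> None:
    # Copy every source parameter into the target network.
    # Sketch only; the argument order (target first) matches the calls above.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(s_param.data)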
Code example #6
	def train(self,model,env,memory,optimizer,logger=None,preprocess=T.ToTensor(),path=None,frompath=None,num_episodes=1000,epsend=0.05,epsstart=0.9,epsdecay=10): 
		try :
			episode_durations = []
			episode_reward = []
			episode_loss = []
			global rendering
			global use_cuda
			global MAX_STEPS
			#exploration counter ;
			steps_done = [0]
			
			#Double Network initialization :
			savemodel(model,path+'.save')
			#model_ = DuelingDQN(model.nbr_actions)
			model_ = copy.deepcopy(model)
			hard_update(model_,model)
			model_.eval()
			
			if use_cuda :
				model_ = model_.cuda()
				
			for i in range(num_episodes) :
				bashlogger.info('Episode : {} : memory : {}/{}'.format(i,len(memory),memory.capacity) )
				cumul_reward = 0.0
				last_screen = get_screen_reset(env,preprocess=preprocess)
				current_screen, reward, done, info = get_screen(env,env.action_space.sample(),preprocess=preprocess )
				state = current_screen - last_screen
				
				episode_buffer = []
				meanfreq = 0
				episode_loss_buffer = []
				episode_qsa_buffer = []

				
				showcount = 0
				for t in count() :
					
					action,qsa = select_action(model,state,steps_done=steps_done,epsend=epsend,epsstart=epsstart,epsdecay=epsdecay)
					episode_qsa_buffer.append(qsa)
					last_screen = current_screen
					current_screen, reward, done, info = get_screen(env,action[0,0],preprocess=preprocess)
					cumul_reward += reward

					if rendering :
						if showcount >= 10 :
							showcount = 0
							render(current_screen)#env.render()
						else :
							showcount +=1
					
					reward = FloatTensor([reward])

					if not done :
						next_state = current_screen -last_screen
					else :
						next_state = torch.zeros(current_screen.size())

					episode_buffer.append( EXP(state,action,next_state,reward,done) )

					state = next_state

					# OPTIMIZE MODEL :
					since = time.time()		
					lossnp = self.optimize_model(model,model_,memory,optimizer)
					if lossnp is not None :
						episode_loss_buffer.append(  np.mean(lossnp) )
					else :
						episode_loss_buffer.append(0)
						
					# SOFT UPDATE :
					soft_update(model_,model,self.TAU)
				
					elt = time.time() - since
					f = 1.0/elt
					meanfreq = (meanfreq*(t+1) + f)/(t+2)
					
					if done or t > MAX_STEPS:
						self.from_worker2model()

						'''
						nbrTrain = 2
						for tr in range(nbrTrain) :
							since = time.time()		
							lossnp = optimize_model(model,model_,memory,optimizer)
							if lossnp is not None :
								episode_loss_buffer.append(  np.mean(lossnp) )
							else :
								episode_loss_buffer.append(0)
								
							elt = time.time() - since
							f = 1.0/elt
							meanfreq = (meanfreq*(tr+1) + f)/(tr+2)
							#print('{} Hz ; {} seconds.'.format(f,elt) )
						'''	
						episode_durations.append(t+1)
						episode_reward.append(cumul_reward)
						meanloss = np.mean(episode_loss_buffer)
						episode_loss.append(meanloss)
						meanqsa = np.mean(episode_qsa_buffer)


						log = 'Episode duration : {}'.format(t+1) +'---' +' Reward : {} // Mean Loss : {} // QSA : {}'.format(cumul_reward,meanloss,meanqsa) +'---'+' {}Hz'.format(meanfreq)
						bashlogger.info(log)
						if logger is not None :
							new = {'episodes':[i],'duration':[t+1],'reward':[cumul_reward],'mean frequency':[meanfreq],'loss':[meanloss]}
							logger.append(new)

						if path is not None :
							# SAVE THE MAIN MODEL :
							self.model.lock()
							savemodel(self.model,path+'.save')
							self.model.unlock()
							bashlogger.info('Model saved : {}'.format(path) )
						#plot_durations()
						break


				#Let us add this episode_buffer to the replayBuffer :
				for el in episode_buffer :
					init_priority = memory.priority( torch.abs(el.reward).cpu().numpy() )
					memory.add(el,init_priority)
				del episode_buffer

			bashlogger.info('Complete')
			if path is not None :
				savemodel(model,path+'.save')
				bashlogger.info('Model saved : {}'.format(path) )
			
			env.close()
		except Exception as e :
			bashlogger.exception(e)
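Note: the two DQN/HER training loops above build transitions with an EXP container that is not defined in the snippets. A plausible reconstruction, inferred only from the call sites (EXP(state, action, next_state, reward, done) and the el.state, el.reward, ... accesses) and therefore an assumption rather than the project's actual code:

from collections import namedtuple

# Hypothetical transition record matching how EXP is used above.
EXP = namedtuple('EXP', ['state', 'action', 'next_state', 'reward', 'done'])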
Code example #7
    def update_parameters(self, total_step):
        data = self.buffer.sample(self._config['batch_size'])

        state = torch.FloatTensor(np.stack(data[:, 0])).to(self.device)

        next_state = torch.FloatTensor(np.stack(data[:, 3])).to(self.device)

        action = torch.FloatTensor(np.stack(data[:, 1])).to(self.device)

        reward = torch.FloatTensor(np.stack(data[:, 2])).to(self.device)

        # 1.0 where the episode continues, 0.0 where it terminated
        not_done = torch.FloatTensor(
            (~np.stack(data[:, 4])).astype(np.float32)).to(self.device)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _, _ = self.actor.sample(
                next_state)
            q1_next_targ, q2_next_targ = self.critic_target(
                next_state, next_state_action)

            min_qf_next_target = torch.min(
                q1_next_targ, q2_next_targ) - self.alpha * next_state_log_pi
            next_q_value = reward + not_done * self._config['gamma'] * (
                min_qf_next_target).squeeze()

        qf1, qf2 = self.critic(state, action)

        qf1_loss = self.critic.loss(qf1.squeeze(), next_q_value)
        qf2_loss = self.critic.loss(qf2.squeeze(), next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic.optimizer.zero_grad()
        qf_loss.backward()
        self.critic.optimizer.step()

        pi, log_pi, _, _ = self.actor.sample(state)

        qf1_pi, qf2_pi = self.critic(state, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(axis=0)

        self.actor.optimizer.zero_grad()
        policy_loss.backward()
        self.actor.optimizer.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()
            self.alpha_scheduler.step()

            self.alpha = self.log_alpha.exp()
        else:
            alpha_loss = torch.tensor(0.).to(self.device)

        if total_step % self._config['update_target_every'] == 0:
            soft_update(self.critic_target, self.critic,
                        self._config['soft_tau'])

        return (qf1_loss.item(), qf2_loss.item(), policy_loss.item(),
                alpha_loss.item())
Code example #8
File: model.py, Project: xuhuazhe/PYTORCH_RL
	def optimize(self,optimizer_critic,optimizer_actor) :
		'''
		self.target_critic.eval()
		self.target_actor.eval()
		self.critic.train()
		self.actor.train()
		'''

		if self.algo == 'ddpg' :
			try :
				if len(self.memory) < self.MIN_MEMORY :
					return
				
				#Create Batch 
				self.mutex.acquire()
				
				if isinstance(self.memory, PrioritizedReplayBuffer) :
					#with PR :
					prioritysum = self.memory.total()
					randexp = np.random.random(size=self.batch_size)*prioritysum
					batch = list()
					for i in range(self.batch_size):
						try :
							el = self.memory.get(randexp[i])
							batch.append(el)
						except TypeError as e :
							continue
							#print('REPLAY BUFFER EXCEPTION...')
				else :
					#with random RB :
					batch = self.memory.sample(self.batch_size)

				self.mutex.release()

				if len(batch) == 0 :
					return

				# Create Batch with replayMemory :
				batch = TransitionPR( *zip(*batch) )
				next_state_batch = Variable(torch.cat( batch.next_state))#, requires_grad=False)
				state_batch = Variable( torch.cat( batch.state) )#, requires_grad=False)
				action_batch = Variable( torch.cat( batch.action) )#, requires_grad=False)
				reward_batch = Variable( torch.cat( batch.reward ) )#, requires_grad=False ).view((-1))
				terminal_batch = Variable( torch.cat( batch.done ) )
				
				'''
				next_state_batch = Variable(torch.cat( batch.next_state) )
				state_batch = Variable( torch.cat( batch.state) )
				action_batch = Variable( torch.cat( batch.action) )
				reward_batch = Variable( torch.cat( batch.reward ) ).view((-1,1))
				'''
				
				if self.use_cuda :
					next_state_batch = next_state_batch.cuda()
					state_batch = state_batch.cuda()
					action_batch = action_batch.cuda()
					reward_batch = reward_batch.cuda()
					terminal_batch = terminal_batch.cuda()

				
				# Critic :
				#before optimization :
				self.critic.zero_grad()
				self.actor.zero_grad()
				optimizer_critic.zero_grad()
				optimizer_actor.zero_grad()
				# sample action from next_state, without gradient repercussion :
				next_taction = self.target_actor(next_state_batch).detach()
				# evaluate the next state action over the target, without repercussion (faster...) :
				next_tqsa = torch.squeeze( self.target_critic( next_state_batch, next_taction).detach() ).view((-1))
				# Supervised loss :
				## y_true :
				y_true = reward_batch + (1.0-terminal_batch)*self.gamma*next_tqsa
				#print(torch.cat([y_true.view((-1,1)),terminal_batch.view((-1,1))],dim=1) )

				## y_pred :
				y_pred = torch.squeeze( self.critic(state_batch,action_batch) )
				## loss :
				critic_loss = F.smooth_l1_loss(y_pred,y_true)
				#criterion = nn.MSELoss()
				#critic_loss = criterion(y_pred,y_true)
				critic_loss.backward()
				
				#weight decay :
				decay_loss = 0.5*sum( [torch.mean(param*param) for param in self.critic.parameters()])
				decay_loss.backward()

				#clamping :
				#torch.nn.utils.clip_grad_norm(self.critic.parameters(),50)				
				optimizer_critic.step()
				
				###################################
				'''
				
				# Actor :
				#predict action :
				pred_action = self.actor(state_batch)
				#predict associated qvalues :
				pred_qsa = self.critic(state_batch, pred_action)
				#pred_qsa = self.target_critic(state_batch, pred_action)
				
				# loss :
				actor_loss = -1.0*torch.mean(torch.sum( pred_qsa) )
				
				#actor_loss = F.smooth_l1_loss( pred_qsa, Variable(torch.zeros(pred_qsa.size() )).cuda() )
				
				#criterion = nn.MSELoss()
				#actor_loss = criterion( pred_qsa, Variable(torch.zeros(pred_qsa.size() )).cuda() )
				
				#before optimization :
				optimizer_actor.zero_grad()
				self.actor.zero_grad()
				actor_loss.backward()
				#clamping :
				#clampactor = 1e2#np.max( [ 0.25, 1.0/np.max( [ 5e-1, np.abs( np.mean(critic_loss.cpu().data.numpy() ) ) ] ) ] )
				#torch.nn.utils.clip_grad_norm(self.actor.parameters(),clampactor)				
				optimizer_actor.step()


				###################################
				'''
				###################################
				
				# Actor :
				#before optimization :
				self.critic.zero_grad()
				self.actor.zero_grad()
				optimizer_critic.zero_grad()
				optimizer_actor.zero_grad()
				
				#predict action :
				pred_action = self.actor(state_batch) 
				var_action = Variable( pred_action.cpu().data, requires_grad=True)
				if self.use_cuda :
					var_action_c = var_action.cuda()
					pred_qsa = self.critic(state_batch, var_action_c)
				else :
					pred_qsa = self.critic(state_batch, var_action)
				#predict associated qvalues :
				gradout = torch.ones(pred_qsa.size())
				if self.use_cuda:
					gradout = gradout.cuda()
				pred_qsa.backward( gradout )

				if self.use_cuda :
					gradcritic = var_action.grad.data.cuda()
				else :
					gradcritic = var_action.grad.data

				pred_action.backward( -gradcritic)
				
				#weight decay :
				decay_loss = 0.5*sum( [torch.mean(param*param) for param in self.actor.parameters()])
				decay_loss.backward()

				#clamping :
				clampactor = 5e0#np.max( [ 0.25, 1.0/np.max( [ 5e-1, np.abs( np.mean(critic_loss.cpu().data.numpy() ) ) ] ) ] )
				torch.nn.utils.clip_grad_norm(self.actor.parameters(),clampactor)				
				optimizer_actor.step()
				# loss :
				actor_loss = -1.0*torch.mean(torch.sum( pred_qsa) )
				
				###################################

				
				'''
				critic_grad = 0.0
				for p in self.critic.parameters() :
					critic_grad += np.mean(p.grad.cpu().data.numpy())
				print( 'Mean Critic Grad : {}'.format(critic_grad) )
				'''
				
				actor_grad = 0.0
				for p in self.actor.parameters() :
					actor_grad += np.max( np.abs(p.grad.cpu().data.numpy() ) )
				#print( 'Mean Actor Grad : {}'.format(actor_grad) )
				

				#UPDATE THE PR :
				if isinstance(self.memory, PrioritizedReplayBuffer) :
					self.mutex.acquire()
					loss = torch.abs(actor_loss) + torch.abs(critic_loss)
					#loss = torch.abs(actor_loss) #+ torch.abs(critic_loss)
					loss_np = loss.cpu().data.numpy()
					for (idx, new_error) in zip(batch.idx,loss_np) :
						new_priority = self.memory.priority(new_error)
						#print( 'prior = {} / {}'.format(new_priority,self.rBuffer.total()) )
						self.memory.update(idx,new_priority)
					self.mutex.release()

			except Exception as e :
				bashlogger.debug('error : {}'.format(e))
				raise e

			# soft update :
			soft_update(self.target_critic, self.critic, self.tau)
			soft_update(self.target_actor, self.actor, self.tau)

			
			closs = critic_loss.cpu()
			aloss = actor_loss.cpu()
			
			return closs.data.numpy(), aloss.data.numpy(), actor_grad

		elif self.algo == 'pddpg' :
			try :
				if len(self.memory) < self.MIN_MEMORY :
					return
				
				#Create Batch 
				self.mutex.acquire()
				
				if isinstance(self.memory, PrioritizedReplayBuffer) :
					#with PR :
					prioritysum = self.memory.total()
					randexp = np.random.random(size=self.batch_size)*prioritysum
					batch = list()
					for i in range(self.batch_size):
						try :
							el = self.memory.get(randexp[i])
							batch.append(el)
						except TypeError as e :
							continue
							#print('REPLAY BUFFER EXCEPTION...')
				else :
					#with random RB :
					batch = self.memory.sample(self.batch_size)

				self.mutex.release()

				if len(batch) == 0 :
					return

				# Create Batch with replayMemory :
				batch = TransitionPR( *zip(*batch) )
				next_state_batch = Variable(torch.cat( batch.next_state))#, requires_grad=False)
				state_batch = Variable( torch.cat( batch.state) )#, requires_grad=False)
				action_batch = Variable( torch.cat( batch.action) )#, requires_grad=False)
				reward_batch = Variable( torch.cat( batch.reward ) )#, requires_grad=False ).view((-1))
				'''
				next_state_batch = Variable(torch.cat( batch.next_state) )
				state_batch = Variable( torch.cat( batch.state) )
				action_batch = Variable( torch.cat( batch.action) )
				reward_batch = Variable( torch.cat( batch.reward ) ).view((-1,1))
				'''
				
				if self.use_cuda :
					next_state_batch = next_state_batch.cuda()
					state_batch = state_batch.cuda()
					action_batch = action_batch.cuda()
					reward_batch = reward_batch.cuda()

				
				# Critic :
				# sample action from next_state, without gradient repercussion :
				next_taction = self.target_actor(next_state_batch).detach()
				# evaluate the next state action over the target, without repercussion (faster...) :
				next_tqsa = torch.squeeze( self.target_critic( next_state_batch, next_taction).detach() ).view((-1))
				# Supervised loss :
				## y_true :
				y_true = reward_batch + self.gamma*next_tqsa 
				## y_pred :
				y_pred = torch.squeeze( self.critic(state_batch,action_batch) )
				## loss :
				critic_loss = F.smooth_l1_loss(y_pred,y_true)
				#criterion = nn.MSELoss()
				#critic_loss = criterion(y_pred,y_true)
				#before optimization :
				self.critic.zero_grad()
				self.actor.zero_grad()
				optimizer_critic.zero_grad()
				optimizer_actor.zero_grad()
				critic_loss.backward()
				#clamping :
				#torch.nn.utils.clip_grad_norm(self.critic.parameters(),50)				
				optimizer_critic.step()
				
				
				###################################
				
				# Actor :
				#predict action with old weights :
				pred_old_action = self.previous_actor(state_batch)
				#predict action with current weights :
				pred_action = self.actor(state_batch) 
				
				var_action = Variable( pred_action.cpu().data, requires_grad=True)
				var_old_action = Variable( pred_old_action.cpu().data, requires_grad=True)
				
				if self.use_cuda :
					var_action_c = var_action.cuda()
					var_old_action_c = var_old_action.cuda()
					#predict associated qvalues :
					pred_qsa = self.critic(state_batch, var_action_c)
					pred_old_qsa = self.critic(state_batch, var_old_action_c)
				else :
					#predict associated qvalues :
					pred_qsa = self.critic(state_batch, var_action)
					pred_old_qsa = self.critic(state_batch, var_old_action)
				
				#helper vars :
				clipped_m = (1.0-self.epsilon)#*torch.ones(ratio.size())
				clipped_p = (1.0+self.epsilon)#*torch.ones(ratio.size())
				gradout = torch.ones(pred_qsa.size())
				if self.use_cuda:
					gradout = gradout.cuda()
					
				#compute ratios :
				ratio = pred_qsa/pred_old_qsa
				clipped_ratio = ratio.clamp(clipped_m,clipped_p)
				p_actor_loss = torch.min(ratio,clipped_ratio)
				p_actor_loss.backward( gradout )

				#before optimization :
				self.critic.zero_grad()
				self.actor.zero_grad()
				optimizer_critic.zero_grad()
				optimizer_actor.zero_grad()
				if self.use_cuda :
					gradcritic = var_action.grad.data.cuda()
					pred_action.backward( -gradcritic)
				else :
					pred_action.backward( -var_action.grad.data)
				#clamping :
				#clampactor = 5e1#np.max( [ 0.25, 1.0/np.max( [ 5e-1, np.abs( np.mean(critic_loss.cpu().data.numpy() ) ) ] ) ] )
				#torch.nn.utils.clip_grad_norm(self.actor.parameters(),clampactor)				
				
				#proximal update, before the update of the weights :
				hard_update(self.previous_actor, self.actor)
				#update of the weights :
				optimizer_actor.step()
				# loss :
				actor_loss = -1.0*torch.mean( pred_qsa )
				
				###################################

				
				'''
				critic_grad = 0.0
				for p in self.critic.parameters() :
					critic_grad += np.mean(p.grad.cpu().data.numpy())
				print( 'Mean Critic Grad : {}'.format(critic_grad) )
				'''
				
				actor_grad = 0.0
				for p in self.actor.parameters() :
					actor_grad += np.max( np.abs(p.grad.cpu().data.numpy() ) )
				#print( 'Mean Actor Grad : {}'.format(actor_grad) )
				

				#UPDATE THE PR :
				if isinstance(self.memory, PrioritizedReplayBuffer) :
					self.mutex.acquire()
					loss = torch.abs(actor_loss) + torch.abs(critic_loss)
					#loss = torch.abs(actor_loss) #+ torch.abs(critic_loss)
					loss_np = loss.cpu().data.numpy()
					for (idx, new_error) in zip(batch.idx,loss_np) :
						new_priority = self.memory.priority(new_error)
						#print( 'prior = {} / {}'.format(new_priority,self.rBuffer.total()) )
						self.memory.update(idx,new_priority)
					self.mutex.release()

			except Exception as e :
				bashlogger.debug('error : {}'.format(e))
				raise e

			# soft update :
			soft_update(self.target_critic, self.critic, self.tau)
			soft_update(self.target_actor, self.actor, self.tau)

			
			closs = critic_loss.cpu()
			aloss = actor_loss.cpu()
			
			return closs.data.numpy(), aloss.data.numpy(), actor_grad

		else :
			raise NotImplementedError
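Note: examples #8 through #10 unpack minibatches with TransitionPR(*zip(*batch)), whose definition is not shown. A plausible reconstruction, inferred only from the fields accessed on batch (state, action, next_state, reward, done, idx) and therefore an assumption rather than the project's actual code:

from collections import namedtuple

# Hypothetical prioritized-replay transition record matching the field
# accesses in examples #8-#10.
TransitionPR = namedtuple(
    'TransitionPR', ['state', 'action', 'next_state', 'reward', 'done', 'idx'])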
Code example #9
File: model.py, Project: xuhuazhe/PYTORCH_RL
	def optimize(self,MIN_MEMORY=1e3) :

		if self.algo == 'ddpg' :
			try :
				if len(self.memory) < MIN_MEMORY :
					return
				
				#Create Batch with PR :
				prioritysum = self.memory.total()
				randexp = np.random.random(size=self.batch_size)*prioritysum
				batch = list()
				for i in range(self.batch_size):
					try :
						el = self.memory.get(randexp[i])
						batch.append(el)
					except TypeError as e :
						continue
						#print('REPLAY BUFFER EXCEPTION...')
				
				# Create Batch with replayMemory :
				batch = TransitionPR( *zip(*batch) )
				next_state_batch = Variable(torch.cat( batch.next_state), requires_grad=False)
				state_batch = Variable( torch.cat( batch.state) , requires_grad=False)
				action_batch = Variable( torch.cat( batch.action) , requires_grad=False)
				reward_batch = Variable( torch.cat( batch.reward ), requires_grad=False ).view((-1))
				
				if self.use_cuda :
					next_state_batch = next_state_batch.cuda()
					state_batch = state_batch.cuda()
					action_batch = action_batch.cuda()
					reward_batch = reward_batch.cuda()

				#before optimization :
				self.optimizer.zero_grad()
			
				# Critic :
				# sample action from next_state, without gradient repercussion :
				next_taction = self.target_NN.actor(next_state_batch).detach()
				# evaluate the next state action over the target, without repercussion (faster...) :
				next_tqsa = torch.squeeze( self.target_NN.critic( next_state_batch, next_taction).detach() )
				# Supervised loss :
				## y_true :
				y_true = reward_batch + self.gamma*next_tqsa 
				## y_pred :
				y_pred = torch.squeeze( self.NN.critic(state_batch,action_batch) )
				## loss :
				critic_loss = F.smooth_l1_loss(y_pred,y_true)
				#critic_loss.backward()
				#self.optimizer.step()

				# Actor :
				pred_action = self.NN.actor(state_batch)
				pred_qsa = torch.squeeze( self.target_NN.critic(state_batch, pred_action) )
				# loss :
				actor_loss = -1.0*torch.sum( pred_qsa)
				#actor_loss.backward()
				#self.optimizer.step()

				# optimize both pathway :
				scalerA = 0.1
				scalerV = 10.0
				total_loss = scalerA*actor_loss + scalerV*critic_loss
				total_loss.backward()
				self.optimizer.step()

			except Exception as e :
				bashlogger.debug('error : {}'.format(e))
				

			# soft update :
			soft_update(self.target_NN, self.NN, self.tau)

			return critic_loss.cpu().data.numpy(), actor_loss.cpu().data.numpy()

		else :
			raise NotImplementedError
Code example #10
File: model.py, Project: xuhuazhe/PYTORCH_RL
	def optimize(self) :
		'''
		self.target_critic.eval()
		self.target_actor.eval()
		self.critic.train()
		self.actor.train()
		'''

		if self.algo == 'ddpg' :
			try :
				if len(self.memory) < self.MIN_MEMORY :
					return
				
				#Create Batch with PR :
				prioritysum = self.memory.total()
				randexp = np.random.random(size=self.batch_size)*prioritysum
				batch = list()
				for i in range(self.batch_size):
					try :
						el = self.memory.get(randexp[i])
						batch.append(el)
					except TypeError as e :
						continue
						#print('REPLAY BUFFER EXCEPTION...')
				
				# Create Batch with replayMemory :
				batch = TransitionPR( *zip(*batch) )
				next_state_batch = Variable(torch.cat( batch.next_state), requires_grad=False)
				state_batch = Variable( torch.cat( batch.state) , requires_grad=False)
				action_batch = Variable( torch.cat( batch.action) , requires_grad=False)
				reward_batch = Variable( torch.cat( batch.reward ), requires_grad=False ).view((-1))
				'''
				next_state_batch = Variable(torch.cat( batch.next_state) )
				state_batch = Variable( torch.cat( batch.state) )
				action_batch = Variable( torch.cat( batch.action) )
				reward_batch = Variable( torch.cat( batch.reward ) ).view((-1,1))
				'''
				
				if self.use_cuda :
					next_state_batch = next_state_batch.cuda()
					state_batch = state_batch.cuda()
					action_batch = action_batch.cuda()
					reward_batch = reward_batch.cuda()

				
				# Actor :
				#before optimization :
				self.optimizer_actor.zero_grad()
				#predict action :
				pred_action = self.actor(state_batch)
				#predict associated qvalues :
				pred_qsa = self.critic(state_batch, pred_action)
				# loss :
				actor_loss = -1.0*torch.mean(torch.sum( pred_qsa) )
				actor_loss.backward()
				#clamping :
				torch.nn.utils.clip_grad_norm(self.actor.parameters(),0.05)				
				self.optimizer_actor.step()

				# Critic :
				#before optimization :
				self.optimizer_critic.zero_grad()
				# sample action from next_state, without gradient repercussion :
				next_taction = self.target_actor(next_state_batch).detach()
				# evaluate the next state action over the target, without repercussion (faster...) :
				next_tqsa = torch.squeeze( self.target_critic( next_state_batch, next_taction).detach() ).view((-1))
				# Supervised loss :
				## y_true :
				y_true = reward_batch + self.gamma*next_tqsa 
				## y_pred :
				y_pred = torch.squeeze( self.critic(state_batch,action_batch) )
				## loss :
				#critic_loss = F.smooth_l1_loss(y_pred,y_true)
				criterion = nn.MSELoss()
				critic_loss = criterion(y_pred,y_true)
				critic_loss.backward()
				#clamping :
				torch.nn.utils.clip_grad_norm(self.critic.parameters(),0.5)				
				self.optimizer_critic.step()
				
				'''
				critic_grad = 0.0
				for p in self.critic.parameters() :
					critic_grad += np.mean(p.grad.cpu().data.numpy())
				print( 'Mean Critic Grad : {}'.format(critic_grad) )
				'''
				
				actor_grad = 0.0
				for p in self.actor.parameters() :
					actor_grad += np.max( np.abs(p.grad.cpu().data.numpy() ) )
				#print( 'Mean Actor Grad : {}'.format(actor_grad) )
				

				#UPDATE THE PR :
				loss = torch.abs(actor_loss) + torch.abs(critic_loss)
				#loss = torch.abs(actor_loss) #+ torch.abs(critic_loss)
				loss_np = loss.cpu().data.numpy()
				for (idx, new_error) in zip(batch.idx,loss_np) :
					new_priority = self.memory.priority(new_error)
					#print( 'prior = {} / {}'.format(new_priority,self.rBuffer.total()) )
					self.memory.update(idx,new_priority)
			
			except Exception as e :
				bashlogger.debug('error : {}'.format(e))
				

			# soft update :
			soft_update(self.target_critic, self.critic, self.tau)
			soft_update(self.target_actor, self.actor, self.tau)

			del batch
			del next_state_batch 
			del state_batch 
			del action_batch 
			del reward_batch 

			closs = critic_loss.cpu()
			aloss = actor_loss.cpu()
			del actor_loss
			del critic_loss

			return closs.data.numpy(), aloss.data.numpy(), actor_grad

		else :
			raise NotImplementedError