Example #1
def accumulateMemory(memory, env, model, preprocess, epsstart, epsend,
                     epsdecay):
    try:
        #exploration counter ;
        steps_done = [0]
        global MIN_MEMORY
        i = 0
        while len(memory) < MIN_MEMORY:
            bashlogger.info('Episode : {} : memory : {}/{}'.format(
                i, len(memory), memory.capacity))
            i += 1
            cumul_reward = 0.0
            last_screen = get_screen_reset(env, preprocess=preprocess)
            current_screen, reward, done, info = get_screen(
                env, env.action_space.sample(), preprocess=preprocess)
            state = current_screen - last_screen
            episode_buffer = []

            for t in count():
                model.eval()
                action = select_action(model,
                                       state,
                                       steps_done=steps_done,
                                       epsend=epsend,
                                       epsstart=epsstart,
                                       epsdecay=epsdecay)
                last_screen = current_screen
                current_screen, reward, done, info = get_screen(
                    env, action[0, 0], preprocess=preprocess)
                cumul_reward += reward

                reward = Tensor([reward])

                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = torch.zeros(current_screen.size())

                episode_buffer.append(
                    EXP(state, action, next_state, reward, done))

                state = next_state

                if done:
                    log = 'Episode duration : {}'.format(
                        t + 1) + '---' + ' Reward : {}'.format(cumul_reward)
                    bashlogger.info(log)
                    break

            #Let us add this episode_buffer to the replayBuffer :
            for el in episode_buffer:
                init_priority = memory.priority(torch.abs(el.reward).numpy())
                memory.add(el, init_priority)
            del episode_buffer

        bashlogger.info('MEMORY : initialization : complete.')
    except Exception as e:
        bashlogger.exception(e)
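
The EXP transition container used throughout these examples is not defined in the snippets themselves. Below is a minimal sketch of the assumed definition, inferred from how its fields (state, action, next_state, reward, done) are accessed; the real definition lives elsewhere in the source repository.

from collections import namedtuple

# Assumed transition record; field names match the attribute accesses above.
EXP = namedtuple('EXP', ['state', 'action', 'next_state', 'reward', 'done'])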
Example #2
	def compute_returns(self, exp_buffer):
		if self.algo == 'pg':
			# Replace each stored reward with the discounted return
			# G_t = r_t + gamma * G_{t+1}, walking the episode backwards.
			cumr = 0.0
			for i in reversed(range(len(exp_buffer))):
				cumr = exp_buffer[i].reward + self.gamma * cumr
				exp_buffer[i] = EXP(exp_buffer[i].state, exp_buffer[i].action,
					exp_buffer[i].next_state, cumr, exp_buffer[i].done)
			return exp_buffer
		# For 'ddpg' (and any other algorithm) the rewards are left untouched.
		return exp_buffer
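
For the 'pg' branch above, compute_returns walks the episode backwards and replaces each stored reward with the discounted return G_t = r_t + gamma * G_{t+1}. A standalone sketch of the same recursion on a hypothetical reward sequence:

# Hypothetical rewards and discount factor, for illustration only.
rewards = [0.0, 0.0, 1.0]
gamma = 0.99

returns, cumr = [], 0.0
for r in reversed(rewards):
    cumr = r + gamma * cumr      # G_t = r_t + gamma * G_{t+1}
    returns.insert(0, cumr)

print(returns)  # approximately [0.9801, 0.99, 1.0]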
Example #3
	def train(self,model,env,memory,optimizer,logger=None,preprocess=T.ToTensor(),path=None,frompath=None,num_episodes=1000,epsend=0.05,epsstart=0.9,epsdecay=200,k=4,strategy='future',singlegoal=False): 
		try :
			episode_durations = []
			episode_reward = []
			episode_loss = []
			global rendering
			global use_cuda
			global MAX_STEPS
			#exploration counter ;
			steps_done = [0]
			
			#Double Network initialization :
			savemodel(model,path+'.save')
			model_ = copy.deepcopy(model)
			hard_update(model_,model)
			model_.eval()
			
			if use_cuda :
				model_ = model_.cuda()
			

			accumulateMemory(memory,env,model,preprocess,epsstart=0.5,epsend=0.3,epsdecay=200,k=k,strategy=strategy)

			# Counter for the periodic refresh of the target network below;
			# _updateT is assumed to be a module-level constant.
			_counterT = 0

			for i in range(num_episodes) :
				bashlogger.info('Episode : {} : memory : {}/{}'.format(i,len(memory),memory.capacity) )
				cumul_reward = 0.0
				last_screen = get_screen_reset(env,preprocess=preprocess)
				current_screen, reward, done, info = get_screen(env,env.action_space.sample(),preprocess=preprocess )
				state = current_screen - last_screen
				
				episode_buffer = []
				meanfreq = 0
				episode_loss_buffer = []

				#periodically save model to model_:
				_counterT = (_counterT+1)%_updateT
				if _counterT == 0 :
					savemodel(model,path+'.save')
					loadmodel(model_,path+'.save')

				#HER : sample initial goal :
				if not singlegoal :
					init_goal = sample_init_goal(memory)
				else :
					init_goal = torch.zeros(current_screen.size())

				showcount = 0

				for t in count() :
					model.eval()
					
					#HER :
					stategoal = torch.cat( [state,init_goal], dim=1)

					#action = select_action(model,state,steps_done=steps_done,epsend=epsend,epsstart=epsstart,epsdecay=epsdecay)
					action = select_action(model,stategoal,steps_done=steps_done,epsend=epsend,epsstart=epsstart,epsdecay=epsdecay)
					
					last_screen = current_screen
					current_screen, reward, done, info = get_screen(env,action[0,0],preprocess=preprocess)
					cumul_reward += reward
					
					if rendering :
						if showcount >= 10 :
							showcount = 0
							render(current_screen)#env.render()
						else :
							showcount +=1

					if not done :
						next_state = current_screen -last_screen
					else :
						next_state = torch.zeros(current_screen.size())

					episode_buffer.append( EXP(state,action,next_state,reward,done) )

					state = next_state

					# OPTIMIZE MODEL :
					since = time.time()		
					lossnp = self.optimize_model(model,model_,memory,optimizer)
					if lossnp is not None :
						episode_loss_buffer.append(  np.mean(lossnp) )
					else :
						episode_loss_buffer.append(0)
						
					# SOFT UPDATE :
					soft_update(model_,model,self.TAU)
				
					elt = time.time() - since
					f = 1.0/elt
					meanfreq = (meanfreq*(t+1) + f)/(t+2)
					
					if done or t > MAX_STEPS:
						self.from_worker2model()
						'''
						nbrTrain = 200
						for it in range(nbrTrain) :
							since = time.time()
							lossnp = optimize_model(model,model_,memory,optimizer)
							if lossnp is not None :
								episode_loss_buffer.append(  np.mean(lossnp) )
							else :
								episode_loss_buffer.append(0)
							
							elt = time.time() - since
							f = 1.0/elt
							meanfreq = (meanfreq*(it+1) + f)/(it+2)
							#print('{} Hz ; {} seconds.'.format(f,elt) )
						'''	
						episode_durations.append(t+1)
						episode_reward.append(cumul_reward)
						meanloss = np.mean(episode_loss_buffer)
						episode_loss.append(meanloss)

						log = 'Episode duration : {}'.format(t+1) +'---' +' Reward : {} // Mean Loss : {}'.format(cumul_reward,meanloss) +'---'+' {}Hz'.format(meanfreq)
						bashlogger.info(log)
						if logger is not None :
							new = {'episodes':[i],'duration':[t+1],'reward':[cumul_reward],'mean frequency':[meanfreq],'loss':[meanloss]}
							logger.append(new)

						if path is not None :
							# SAVE THE MAIN MODEL :
							self.model.lock()
							savemodel(self.model,path+'.save')
							self.model.unlock()
							bashlogger.info('Model saved : {}'.format(path) )
						#plot_durations()
						break

				#Let us add this episode_buffer to the replayBuffer :
				for itexp in range(len(episode_buffer)) :
					el = episode_buffer[itexp]
					#HER : reward with init_goal
					HERreward = reward_function(el.state,init_goal)
					reward = HERreward+el.reward
					
					#store this transition with init_goal :
					init_el = EXP( state=torch.cat( [el.state, init_goal], dim=1),
									action=el.action,
									next_state=torch.cat( [el.next_state, init_goal], dim=1),
									reward=reward,
									done=el.done
								)
					
					init_priority = memory.priority( torch.abs(init_el.reward).numpy() )
					
					memory.add(init_el,init_priority)

					#store for multiple goals :
					#1: sample new goal :
					goals = []
					for j in range(k) :
						goal = None
						if strategy == 'final' :
							goal = sample_goal(episode_buffer, strategy=strategy)
						elif strategy == 'future' :
							# watch out for the empty set...
							index = min(len(episode_buffer)-3,itexp)
							goal = sample_goal(episode_buffer, strategy=index)	
						goals.append(goal)
						

					#For each goal ...
					for goal in goals :
						#2: .. compute reward :
						goalreward = reward_function(el.state,goal)+el.reward
						#3: ... store this transition with goal :
						goalel = EXP( state=torch.cat( [el.state, goal], dim=1),
										action=el.action,
										next_state = torch.cat( [el.next_state, goal], dim=1),
										reward = goalreward,
										done=el.done
									)
						
						init_priority = memory.priority( torch.abs(goalel.reward).numpy() )
						memory.add(goalel,init_priority)
						
					del el
					del goals
				

				del episode_buffer

			bashlogger.info('Complete')
			if path is not None :
				savemodel(model,path+'.save')
				bashlogger.info('Model saved : {}'.format(path) )
			
			env.close()
		except Exception as e :
			bashlogger.exception(e)
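
sample_goal is not shown in these snippets. In the usual HER formulation, the 'final' strategy relabels transitions with the last achieved state of the episode, while 'future' picks the achieved state of a later transition from the same episode. Below is a hypothetical helper consistent with how sample_goal is called above (with strategy='final' or with an integer index for the 'future' case); the real implementation may differ.

import random

def sample_goal(episode_buffer, strategy='final'):
    # Hypothetical sketch, not the repository's implementation.
    if strategy == 'final':
        # Relabel with the last achieved state of the episode.
        return episode_buffer[-1].state
    # Otherwise 'strategy' is treated as an index into the episode:
    # sample an achieved state from that transition or a later one.
    index = int(strategy)
    chosen = random.randint(index, len(episode_buffer) - 1)
    return episode_buffer[chosen].state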
Example #4
	def accumulateMemory(self,memory,env,model,preprocess,epsstart,epsend,epsdecay,k=4,strategy='future') :
		try :
			#exploration counter ;
			steps_done = [0]
			global MIN_MEMORY
			i = 0
			while len(memory) < MIN_MEMORY :
				bashlogger.info('Episode : {} : memory : {}/{}'.format(i,len(memory),memory.capacity) )
				i+=1
				cumul_reward = 0.0
				last_screen = get_screen_reset(env,preprocess=preprocess)
				current_screen, reward, done, info = get_screen(env,env.action_space.sample(),preprocess=preprocess )
				state = current_screen - last_screen
				episode_buffer = []

				# Goal used for HER relabelling during this warm-up phase: a zero tensor.
				init_goal = torch.zeros(current_screen.size())
				'''
				if len(memory) :
					init_goal = sample_init_goal(memory)
				'''

				showcount = 0 

				for t in count() :
					model.eval()
					#HER :
					stategoal = torch.cat( [state,init_goal], dim=1)

					#action = select_action(model,state,steps_done=steps_done,epsend=epsend,epsstart=epsstart,epsdecay=epsdecay)
					action = select_action(model,stategoal,steps_done=steps_done,epsend=epsend,epsstart=epsstart,epsdecay=epsdecay)
					last_screen = current_screen
					current_screen, reward, done, info = get_screen(env,action[0,0],preprocess=preprocess)
					cumul_reward += reward

					if rendering :
						if showcount >= 10 :
							showcount = 0
							render(current_screen)#env.render()
						else :
							showcount +=1
					
					
					if not done :
						next_state = current_screen -last_screen
					else :
						next_state = torch.ones(current_screen.size())

					episode_buffer.append( EXP(state,action,next_state,reward,done) )

					state = next_state

					if done :
						log = 'Episode duration : {}'.format(t+1) +'---' +' Reward : {}'.format(cumul_reward)
						bashlogger.info(log)
						break


				#Let us add this episode_buffer to the replayBuffer :
				for itexp in range(len(episode_buffer)) :
					el = episode_buffer[itexp]
					#HER : reward with init_goal
					HERreward = reward_function(el.state,init_goal)
					reward = HERreward+el.reward
					
					#store this transition with init_goal :
					init_el = EXP( state=torch.cat( [el.state, init_goal], dim=1),
									action=el.action,
									next_state=torch.cat( [el.next_state, init_goal], dim=1),
									reward=reward,
									done=el.done
								)
					
					init_priority = memory.priority( torch.abs(init_el.reward).numpy() )
					
					memory.add(init_el,init_priority)

					#store for multiple goals :
					#1: sample new goal :
					goals = []
					for j in range(k) :
						goal = None
						if strategy == 'final' :
							goal = sample_goal(episode_buffer, strategy=strategy)
						elif strategy == 'future' :
							# watch out for the empty set...
							index = min(len(episode_buffer)-3,itexp)
							goal = sample_goal(episode_buffer, strategy=index)
						goals.append(goal)
						

					#For each goal ...
					for goal in goals :
						#2: .. compute reward :
						goalreward = reward_function(el.state,goal)+el.reward
						#3: ... store this transition with goal :
						goalel = EXP( state=torch.cat( [el.state, goal], dim=1),
										action=el.action,
										next_state = torch.cat( [el.next_state, goal], dim=1),
										reward = goalreward,
										done=el.done
									)
						
						init_priority = memory.priority( torch.abs(goalel.reward).numpy() )
						memory.add(goalel,init_priority)
						
					del el
					del goals
				
				del episode_buffer

			bashlogger.info('MEMORY : initialization : complete.')
		except Exception as e :
			bashlogger.exception(e)
Example #5
	def trainIN(self,index,model,env,memory,optimizers,logger=None,preprocess=T.ToTensor(),path=None,frompath=None,num_episodes=1000,HER={'use_her':True,'k':4,'strategy':'future','singlegoal':False},use_cuda=True,rendering=False): 
		try :
			episode_durations = []
			episode_reward = []
			episode_qsa = []
			episode_grad_actor = []
			episode_loss = []
			
			#accumulateMemory(memory,env,models,preprocess,epsstart=0.5,epsend=0.3,epsdecay=200,k=k,strategy=strategy)

			usehd = False
			if usehd :
				from utils.histogram import HistogramDebug
				hd = HistogramDebug()
				hd.setXlimit(-2.5,2.5)
			reward_scaler = 10.0

			for i in range(num_episodes) :
				bashlogger.info('Episode : {} : memory : {}/{}'.format(i,len(memory),memory.capacity) )
				
				cumul_reward = 0.0
				last_state = get_state_reset(env,preprocess=preprocess)
				state, reward, done, info = get_state(env,env.action_space.sample(),preprocess=preprocess )
				
				episode_buffer = []
				
				meanfreq = 0
				
				episode_qsa_buffer = []
				episode_closs_buffer = []
				episode_aloss_buffer = []
				episode_grad_actor_buffer = []
				action_buffer = []

				#HER : sample initial goal :
				if HER['use_her'] :
					#if not HER['singlegoal'] :
					#	init_goal = sample_init_goal(memory)
					#else :
					#	init_goal = torch.zeros(state.size())
					init_goal = torch.zeros(state.size())

				showcount = 0

				for t in count() :
					since = time.time()
					#HER :
					if HER['use_her'] :
						evalstate = torch.cat( [state,init_goal], dim=1)
					else :
						evalstate = state

					if i%5 == 0 :
						action = model.act(evalstate, exploitation=True)
					else :
						action = model.act(evalstate, exploitation=False)
					
					action_buffer.append(action )

					taction = torch.from_numpy(action.astype(np.float32))

					last_state = evalstate
					state, reward, done, info = get_state(env, action,preprocess=preprocess)
					
					reward /= reward_scaler

					cumul_reward += float(reward)
					treward = torch.from_numpy(reward.astype(np.float32))

					if rendering :
						if showcount >= 1 :
							showcount = 0
							#render(current_state)
							#plt.imshow(env.render(mode='rgb_array') )
							env.render()
						else :
							showcount +=1

					terminal = 0.0 
					if done :
						terminal = 1.0
					tterminal = terminal*torch.ones((1))
					episode_buffer.append( EXP(last_state,taction,state,treward,tterminal) )

					episode_qsa_buffer.append( model.evaluate(evalstate, taction) )

					#Optimize model :
					retloss = model.optimize(optimizer_critic=optimizers['critic'],optimizer_actor=optimizers['actor'])
					if retloss is not None :
						critic_loss,actor_loss, actor_grad = retloss
						episode_closs_buffer.append(  np.mean(critic_loss) )
						episode_aloss_buffer.append(  np.mean(actor_loss) )
						episode_grad_actor_buffer.append(  actor_grad )
					else :
						episode_closs_buffer.append(0)
						episode_aloss_buffer.append(0)
						episode_grad_actor_buffer.append(0)

					
					elt = time.time() - since
					f = 1.0/elt
					meanfreq = (meanfreq*(t+1) + f)/(t+2)
					#print('{} Hz ; {} seconds.'.format(f,elt) )
					
					if done or t> MAX_STEP:
						episode_durations.append(t+1)
						episode_reward.append(cumul_reward)
						meancloss = np.mean(episode_closs_buffer)
						meanaloss = np.mean(episode_aloss_buffer)
						episode_loss.append(meancloss)
						meanqsa = np.mean(episode_qsa_buffer) 
						maxqsa = np.max(episode_qsa_buffer) 
						episode_qsa.append( meanqsa)
						meanactorgrad = np.max(episode_grad_actor_buffer) 
						episode_grad_actor.append( meanactorgrad)
						meanaction = np.mean(action_buffer)
						sigmaaction = np.std(action_buffer)
						action_buffer = np.array(action_buffer).reshape((-1))
						
						if usehd : hd.append(np.array(action_buffer) )

						log = 'Episode duration : {}'.format(t+1) +'---' +' Action : mu:{:.4f} sig:{:.4f} // Reward : {} // Mean C/A Losses : {:.4f}/{:.4f} // Mean/MaxQsa : {:.4f}/{:.4f} // Mean Actor Grad : {:.8f}'.format(meanaction,sigmaaction,cumul_reward,meancloss,meanaloss,meanqsa,maxqsa,meanactorgrad) +'---'+' {}Hz'.format(meanfreq)
						if i%5 == 0:
							log = 'EVAL :: ' + log
						bashlogger.info(log)
						if logger is not None :
							new = {'episodes':[i],'duration':[t+1],'reward':[cumul_reward],'mean frequency':[meanfreq],'critic loss':[meancloss],'actor loss':[meanaloss],'max qsa':[maxqsa],'mean qsa':[meanqsa],'mean action':[meanaction]}
							logger.append(new)

						if path is not None :
							model.save(path+'.save')
							bashlogger.info('Model saved : {}'.format(path) )
						break

				#compute returns :
				episode_buffer = model.compute_returns(episode_buffer)

				#Let us add this episode_buffer to the replayBuffer :
				if HER['use_her'] :
					for itexp in range(len(episode_buffer)) :
						el = episode_buffer[itexp]
						#HER : reward with init_goal
						HERreward = reward_function(el.state,init_goal)
						reward = HERreward+el.reward
						
						#store this transition with init_goal :
						init_el = EXP( state=torch.cat( [el.state, init_goal], dim=1),
										action=el.action,
										next_state=torch.cat( [el.next_state, init_goal], dim=1),
										reward=reward,
										done=el.done
									)
						
						init_priority = memory.priority( torch.abs(init_el.reward).numpy() )
						
						memory.add(init_el,init_priority)

						#store for multiple goals :
						#1: sample new goal :
						goals = []
						for j in range(HER['k']) :
							goal = None
							if HER['strategy'] == 'final' :
								goal = sample_goal(episode_buffer, strategy=HER['strategy'])
							elif HER['strategy'] == 'future' :
								# watch out for the empty set...
								index = min(len(episode_buffer)-3,itexp)
								goal = sample_goal(episode_buffer, strategy=index)	
							goals.append(goal)
							

						#For each goal ...
						for goal in goals :
							#2: .. compute reward :
							goalreward = reward_function(el.state,goal)+el.reward
							#3: ... store this transition with goal :
							goalel = EXP( state=torch.cat( [el.state, goal], dim=1),
											action=el.action,
											next_state = torch.cat( [el.next_state, goal], dim=1),
											reward = goalreward,
											done=el.done
										)
							
							init_priority = memory.priority( torch.abs(goalel.reward).numpy() )
							memory.add(goalel,init_priority)
							
						del el
						del goals
				else :
					if isinstance( memory, PrioritizedReplayBuffer) :
						for el in episode_buffer :
							#store this transition 
							init_priority = memory.priority( torch.abs(el.reward).numpy() )
							memory.add(el,init_priority)
					else :
						for el in episode_buffer :
							#store this transition 
							memory.add(el)

				del episode_buffer
				# check memory consumption and clear memory
				gc.collect()

			bashlogger.info('Learning complete.')
			if path is not None :
				savemodel(model,path+'.save')
				bashlogger.info('Model saved : {}'.format(path) )
			
			env.close()
		
		except Exception as e :
			bashlogger.exception(e)
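
In this example, optimizers is expected to be a dictionary with 'critic' and 'actor' entries (see the model.optimize call above). A minimal sketch of how such a dictionary could be built; the attribute names model.actor / model.critic and the learning rates are assumptions, not taken from the source.

import torch.optim as optim

def build_optimizers(model, actor_lr=1e-4, critic_lr=1e-3):
    # Hypothetical helper: attribute names and hyperparameters depend on the
    # actual model class used in the source repository.
    return {
        'actor': optim.Adam(model.actor.parameters(), lr=actor_lr),
        'critic': optim.Adam(model.critic.parameters(), lr=critic_lr),
    }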
Example #6
def train(model,
          env,
          memory,
          optimizer,
          preprocess=T.ToTensor(),
          path=None,
          frompath=None,
          num_episodes=1000,
          epsend=0.05,
          epsstart=0.9,
          epsdecay=200):
    episode_durations = []
    episode_reward = []
    global rendering
    #exploration counter ;
    global steps_done
    steps_done = 0

    if frompath is not None:
        loadmodel(model, frompath)
        print('Model loaded: {}'.format(frompath))

    for i in range(num_episodes):
        print('Episode : {} : memory : {}/{}'.format(i, len(memory),
                                                     memory.capacity))
        cumul_reward = 0.0
        last_screen = get_screen_reset(env, preprocess=preprocess)
        current_screen, reward, done, info = get_screen(
            env, env.action_space.sample(), preprocess=preprocess)
        state = current_screen - last_screen

        episode_buffer = []

        for t in count():
            action = select_action(model,
                                   state,
                                   epsend=epsend,
                                   epsstart=epsstart,
                                   epsdecay=epsdecay)
            last_screen = current_screen
            current_screen, reward, done, info = get_screen(
                env, action[0, 0], preprocess=preprocess)
            cumul_reward += reward

            if rendering:
                env.render()
            reward = Tensor([reward])

            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = torch.zeros(current_screen.size())

            episode_buffer.append(EXP(state, action, next_state, reward, done))

            state = next_state

            since = time.time()
            optimize_model(model, memory, optimizer)
            elt = time.time() - since
            f = 1.0 / elt
            #print('{} Hz ; {} seconds.'.format(f,elt) )

            if done:
                episode_durations.append(t + 1)
                episode_reward.append(cumul_reward)
                print('Epoch duration : {}'.format(t + 1))
                print('Cumulative Reward : {}'.format(cumul_reward))
                if path is not None:
                    savemodel(model, path)
                    print('Model saved : {}'.format(path))
                #plot_durations()
                break

        #Let us add this episode_buffer to the replayBuffer :
        for el in episode_buffer:
            init_priority = memory.priority(torch.abs(el.reward).numpy())
            memory.add(el, init_priority)
        del episode_buffer

    print('Complete')
    if path is not None:
        savemodel(model, path)
        print('Model saved : {}'.format(path))

    env.close()
    plt.ioff()
    plt.show()
Example #7

def train(model,
          env,
          memory,
          optimizer,
          logger=None,
          preprocess=T.ToTensor(),
          path=None,
          frompath=None,
          num_episodes=1000,
          epsend=0.05,
          epsstart=0.9,
          epsdecay=200):
    try:
        episode_durations = []
        episode_reward = []
        episode_loss = []
        global rendering
        #exploration counter ;
        steps_done = [0]

        accumulateMemory(memory,
                         env,
                         model,
                         preprocess,
                         epsstart=0.99,
                         epsend=0.9,
                         epsdecay=200)

        for i in range(num_episodes):
            bashlogger.info('Episode : {} : memory : {}/{}'.format(
                i, len(memory), memory.capacity))
            cumul_reward = 0.0
            last_screen = get_screen_reset(env, preprocess=preprocess)
            current_screen, reward, done, info = get_screen(
                env, env.action_space.sample(), preprocess=preprocess)
            state = current_screen - last_screen

            episode_buffer = []
            meanfreq = 0
            episode_loss_buffer = []

            for t in count():
                model.eval()
                action = select_action(model,
                                       state,
                                       steps_done=steps_done,
                                       epsend=epsend,
                                       epsstart=epsstart,
                                       epsdecay=epsdecay)
                last_screen = current_screen
                current_screen, reward, done, info = get_screen(
                    env, action[0, 0], preprocess=preprocess)
                cumul_reward += reward

                if rendering:
                    env.render()
                reward = Tensor([reward])

                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = torch.zeros(current_screen.size())

                episode_buffer.append(
                    EXP(state, action, next_state, reward, done))

                state = next_state

                since = time.time()
                lossnp = optimize_model(model, memory, optimizer)
                if lossnp is not None:
                    episode_loss_buffer.append(np.mean(lossnp))
                else:
                    episode_loss_buffer.append(0)

                elt = time.time() - since
                f = 1.0 / elt
                meanfreq = (meanfreq * (t + 1) + f) / (t + 2)
                #print('{} Hz ; {} seconds.'.format(f,elt) )

                if done:
                    episode_durations.append(t + 1)
                    episode_reward.append(cumul_reward)
                    meanloss = np.mean(episode_loss_buffer)
                    episode_loss.append(meanloss)

                    log = 'Episode duration : {}'.format(
                        t +
                        1) + '---' + ' Reward : {} // Mean Loss : {}'.format(
                            cumul_reward,
                            meanloss) + '---' + ' {}Hz'.format(meanfreq)
                    bashlogger.info(log)
                    if logger is not None:
                        new = {
                            'episodes': [i],
                            'duration': [t + 1],
                            'reward': [cumul_reward],
                            'mean frequency': [meanfreq],
                            'loss': [meanloss]
                        }
                        logger.append(new)

                    if path is not None:
                        savemodel(model, path + '.save')
                        bashlogger.info('Model saved : {}'.format(path))
                    #plot_durations()
                    break

            #Let us add this episode_buffer to the replayBuffer :
            for el in episode_buffer:
                init_priority = memory.priority(torch.abs(el.reward).numpy())
                memory.add(el, init_priority)
            del episode_buffer

        bashlogger.info('Complete')
        if path is not None:
            savemodel(model, path)
            bashlogger.info('Model saved : {}'.format(path))

        env.close()
    except Exception as e:
        bashlogger.exception(e)
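
select_action is not defined in these snippets. Below is a hypothetical epsilon-greedy helper consistent with how it is called in the examples that pass a steps_done list; the annealing schedule is the common exponential decay from epsstart to epsend, and nbr_actions is assumed to be exposed by the model (later examples use variants that also return the Q-value).

import math
import random
import torch

def select_action(model, state, steps_done, epsend=0.05, epsstart=0.9, epsdecay=200):
    # Hypothetical sketch; the mutable steps_done list acts as a shared
    # exploration counter across calls.
    eps = epsend + (epsstart - epsend) * math.exp(-steps_done[0] / epsdecay)
    steps_done[0] += 1
    if random.random() > eps:
        with torch.no_grad():
            # Greedy action: index of the largest Q-value, shaped (1, 1)
            # so that action[0, 0] can be passed to the environment.
            return model(state).max(1)[1].view(1, 1)
    # Random action (exploration).
    return torch.tensor([[random.randrange(model.nbr_actions)]], dtype=torch.long)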
Example #8
	def train(self,model,env,memory,optimizer,logger=None,preprocess=T.ToTensor(),path=None,frompath=None,num_episodes=1000,epsend=0.05,epsstart=0.9,epsdecay=10): 
		try :
			episode_durations = []
			episode_reward = []
			episode_loss = []
			global rendering
			global use_cuda
			global MAX_STEPS
			#exploration counter ;
			steps_done = [0]
			
			#Double Network initialization :
			savemodel(model,path+'.save')
			#model_ = DuelingDQN(model.nbr_actions)
			model_ = copy.deepcopy(model)
			hard_update(model_,model)
			model_.eval()
			
			if use_cuda :
				model_ = model_.cuda()
				
			for i in range(num_episodes) :
				bashlogger.info('Episode : {} : memory : {}/{}'.format(i,len(memory),memory.capacity) )
				cumul_reward = 0.0
				last_screen = get_screen_reset(env,preprocess=preprocess)
				current_screen, reward, done, info = get_screen(env,env.action_space.sample(),preprocess=preprocess )
				state = current_screen - last_screen
				
				episode_buffer = []
				meanfreq = 0
				episode_loss_buffer = []
				episode_qsa_buffer = []

				
				showcount = 0
				for t in count() :
					
					action,qsa = select_action(model,state,steps_done=steps_done,epsend=epsend,epsstart=epsstart,epsdecay=epsdecay)
					episode_qsa_buffer.append(qsa)
					last_screen = current_screen
					current_screen, reward, done, info = get_screen(env,action[0,0],preprocess=preprocess)
					cumul_reward += reward

					if rendering :
						if showcount >= 10 :
							showcount = 0
							render(current_screen)#env.render()
						else :
							showcount +=1
					
					reward = FloatTensor([reward])

					if not done :
						next_state = current_screen -last_screen
					else :
						next_state = torch.zeros(current_screen.size())

					episode_buffer.append( EXP(state,action,next_state,reward,done) )

					state = next_state

					# OPTIMIZE MODEL :
					since = time.time()		
					lossnp = self.optimize_model(model,model_,memory,optimizer)
					if lossnp is not None :
						episode_loss_buffer.append(  np.mean(lossnp) )
					else :
						episode_loss_buffer.append(0)
						
					# SOFT UPDATE :
					soft_update(model_,model,self.TAU)
				
					elt = time.time() - since
					f = 1.0/elt
					meanfreq = (meanfreq*(t+1) + f)/(t+2)
					
					if done or t > MAX_STEPS:
						self.from_worker2model()

						'''
						nbrTrain = 2
						for tr in range(nbrTrain) :
							since = time.time()		
							lossnp = optimize_model(model,model_,memory,optimizer)
							if lossnp is not None :
								episode_loss_buffer.append(  np.mean(lossnp) )
							else :
								episode_loss_buffer.append(0)
								
							elt = time.time() - since
							f = 1.0/elt
							meanfreq = (meanfreq*(tr+1) + f)/(tr+2)
							#print('{} Hz ; {} seconds.'.format(f,elt) )
						'''	
						episode_durations.append(t+1)
						episode_reward.append(cumul_reward)
						meanloss = np.mean(episode_loss_buffer)
						episode_loss.append(meanloss)
						meanqsa = np.mean(episode_qsa_buffer)


						log = 'Episode duration : {}'.format(t+1) +'---' +' Reward : {} // Mean Loss : {} // QSA : {}'.format(cumul_reward,meanloss,meanqsa) +'---'+' {}Hz'.format(meanfreq)
						bashlogger.info(log)
						if logger is not None :
							new = {'episodes':[i],'duration':[t+1],'reward':[cumul_reward],'mean frequency':[meanfreq],'loss':[meanloss]}
							logger.append(new)

						if path is not None :
							# SAVE THE MAIN MODEL :
							self.model.lock()
							savemodel(self.model,path+'.save')
							self.model.unlock()
							bashlogger.info('Model saved : {}'.format(path) )
						#plot_durations()
						break


				#Let us add this episode_buffer to the replayBuffer :
				for el in episode_buffer :
					init_priority = memory.priority( torch.abs(el.reward).cpu().numpy() )
					memory.add(el,init_priority)
				del episode_buffer

			bashlogger.info('Complete')
			if path is not None :
				savemodel(model,path+'.save')
				bashlogger.info('Model saved : {}'.format(path) )
			
			env.close()
		except Exception as e :
			bashlogger.exception(e)
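
hard_update and soft_update are called above as hard_update(model_, model) and soft_update(model_, model, self.TAU), i.e. target network first, source second. A sketch of the conventional target-network updates they are assumed to perform:

def hard_update(target, source):
    # Copy every parameter of source into target (target <- source).
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)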