def accumulateMemory(memory, env, model, preprocess, epsstart, epsend, epsdecay):
    try:
        #exploration counter ;
        steps_done = [0]
        global MIN_MEMORY
        i = 0
        while len(memory) < MIN_MEMORY:
            bashlogger.info('Episode : {} : memory : {}/{}'.format(i, len(memory), memory.capacity))
            i += 1
            cumul_reward = 0.0
            last_screen = get_screen_reset(env, preprocess=preprocess)
            current_screen, reward, done, info = get_screen(env, env.action_space.sample(), preprocess=preprocess)
            state = current_screen - last_screen
            episode_buffer = []
            for t in count():
                model.eval()
                action = select_action(model, state, steps_done=steps_done, epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)
                last_screen = current_screen
                current_screen, reward, done, info = get_screen(env, action[0, 0], preprocess=preprocess)
                cumul_reward += reward
                reward = Tensor([reward])
                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = torch.zeros(current_screen.size())
                episode_buffer.append(EXP(state, action, next_state, reward, done))
                state = next_state
                if done:
                    log = 'Episode duration : {}'.format(t + 1) + '---' + ' Reward : {}'.format(cumul_reward)
                    bashlogger.info(log)
                    break
            #Let us add this episode_buffer to the replayBuffer :
            for el in episode_buffer:
                init_priority = memory.priority(torch.abs(el.reward).numpy())
                memory.add(el, init_priority)
            del episode_buffer
        bashlogger.info('MEMORY : initialization : complete.')
    except Exception as e:
        bashlogger.exception(e)
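# The loops in this file rely on a select_action(model, state, steps_done=..., epsend=...,
# epsstart=..., epsdecay=...) helper defined elsewhere in the repository. The sketch below
# is only an assumption of what it likely does: epsilon-greedy action selection with an
# exponentially decaying epsilon, returning a 1x1 LongTensor so that action[0, 0] indexes
# the chosen discrete action. (One variant further down also returns the greedy Q-value.)
# The name select_action_sketch and its internals are illustrative, not the repo's code.
import math
import random

import torch

def select_action_sketch(model, state, steps_done, epsend=0.05, epsstart=0.9, epsdecay=200):
    # steps_done is a one-element list so the call counter is shared by reference.
    eps = epsend + (epsstart - epsend) * math.exp(-float(steps_done[0]) / epsdecay)
    steps_done[0] += 1
    with torch.no_grad():
        qvalues = model(state)  # assumed shape: (1, nbr_actions)
    if random.random() > eps:
        return qvalues.max(1)[1].view(1, 1)        # greedy action
    return torch.randint(qvalues.size(1), (1, 1))  # uniform random action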
def compute_returns(self, exp_buffer):
    if self.algo == 'pg':
        cumr = 0.0
        for i in reversed(range(len(exp_buffer))):
            cumr = exp_buffer[i].reward + self.gamma * cumr
            exp_buffer[i] = EXP(exp_buffer[i].state, exp_buffer[i].action, exp_buffer[i].next_state, cumr, exp_buffer[i].done)
        return exp_buffer
    elif self.algo == 'ddpg':
        return exp_buffer
    else:
        return exp_buffer
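# For the 'pg' branch above, per-step rewards are replaced by discounted returns computed
# backwards through the episode; the 'ddpg' branch leaves rewards untouched because the
# critic bootstraps its own targets. A toy example (gamma and rewards chosen here purely
# for illustration): with gamma = 0.9 and rewards [0, 0, 1], the stored returns become
# approximately [0.81, 0.9, 1.0].
gamma = 0.9
rewards = [0.0, 0.0, 1.0]
returns, cumr = [], 0.0
for r in reversed(rewards):
    cumr = r + gamma * cumr
    returns.insert(0, cumr)
print(returns)  # approximately [0.81, 0.9, 1.0]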
def train(self, model, env, memory, optimizer, logger=None, preprocess=T.ToTensor(), path=None, frompath=None,
          num_episodes=1000, epsend=0.05, epsstart=0.9, epsdecay=200, k=4, strategy='future', singlegoal=False):
    try:
        episode_durations = []
        episode_reward = []
        episode_loss = []
        global rendering
        global use_cuda
        global MAX_STEPS
        # exploration counter :
        steps_done = [0]
        # Double network initialization :
        savemodel(model, path + '.save')
        model_ = copy.deepcopy(model)
        hard_update(model_, model)
        model_.eval()
        if use_cuda:
            model_ = model_.cuda()
        # Pre-fill the replay buffer with the HER-aware accumulateMemory method defined below:
        self.accumulateMemory(memory, env, model, preprocess, epsstart=0.5, epsend=0.3, epsdecay=200, k=k, strategy=strategy)
        # Counters for the periodic hard refresh of the target network
        # (the refresh period _updateT was not specified in the original listing):
        _updateT = 10
        _counterT = 0
        for i in range(num_episodes):
            bashlogger.info('Episode : {} : memory : {}/{}'.format(i, len(memory), memory.capacity))
            cumul_reward = 0.0
            last_screen = get_screen_reset(env, preprocess=preprocess)
            current_screen, reward, done, info = get_screen(env, env.action_space.sample(), preprocess=preprocess)
            state = current_screen - last_screen
            episode_buffer = []
            meanfreq = 0
            episode_loss_buffer = []
            # Periodically save model and reload it into model_ :
            _counterT = (_counterT + 1) % _updateT
            if _counterT == 0:
                savemodel(model, path + '.save')
                loadmodel(model_, path + '.save')
            # HER : sample the initial goal :
            if not singlegoal:
                init_goal = sample_init_goal(memory)
            else:
                init_goal = torch.zeros(current_screen.size())
            showcount = 0
            for t in count():
                model.eval()
                # HER : condition the policy on the concatenation of state and goal :
                stategoal = torch.cat([state, init_goal], dim=1)
                #action = select_action(model, state, steps_done=steps_done, epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)
                action = select_action(model, stategoal, steps_done=steps_done, epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)
                last_screen = current_screen
                current_screen, reward, done, info = get_screen(env, action[0, 0], preprocess=preprocess)
                cumul_reward += reward
                if rendering:
                    if showcount >= 10:
                        showcount = 0
                        render(current_screen)  # env.render()
                    else:
                        showcount += 1
                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = torch.zeros(current_screen.size())
                episode_buffer.append(EXP(state, action, next_state, reward, done))
                state = next_state
                # OPTIMIZE MODEL :
                since = time.time()
                lossnp = self.optimize_model(model, model_, memory, optimizer)
                if lossnp is not None:
                    episode_loss_buffer.append(np.mean(lossnp))
                else:
                    episode_loss_buffer.append(0)
                # SOFT UPDATE :
                soft_update(model_, model, self.TAU)
                elt = time.time() - since
                f = 1.0 / elt
                meanfreq = (meanfreq * (t + 1) + f) / (t + 2)
                if done or t > MAX_STEPS:
                    self.from_worker2model()
                    '''
                    nbrTrain = 200
                    for it in range(nbrTrain):
                        since = time.time()
                        lossnp = optimize_model(model, model_, memory, optimizer)
                        if lossnp is not None:
                            episode_loss_buffer.append(np.mean(lossnp))
                        else:
                            episode_loss_buffer.append(0)
                        elt = time.time() - since
                        f = 1.0 / elt
                        meanfreq = (meanfreq * (it + 1) + f) / (it + 2)
                        #print('{} Hz ; {} seconds.'.format(f, elt))
                    '''
                    episode_durations.append(t + 1)
                    episode_reward.append(cumul_reward)
                    meanloss = np.mean(episode_loss_buffer)
                    episode_loss.append(meanloss)
                    log = 'Episode duration : {}'.format(t + 1) + '---' + ' Reward : {} // Mean Loss : {}'.format(cumul_reward, meanloss) + '---' + ' {}Hz'.format(meanfreq)
                    bashlogger.info(log)
                    if logger is not None:
                        new = {'episodes': [i], 'duration': [t + 1], 'reward': [cumul_reward], 'mean frequency': [meanfreq], 'loss': [meanloss]}
                        logger.append(new)
                    if path is not None:
                        # SAVE THE MAIN MODEL :
                        self.model.lock()
                        savemodel(self.model, path + '.save')
                        self.model.unlock()
                        bashlogger.info('Model saved : {}'.format(path))
                    #plot_durations()
                    break
            # Add this episode_buffer to the replayBuffer :
            for itexp in range(len(episode_buffer)):
                el = episode_buffer[itexp]
                # HER : reward with respect to the initial goal :
                HERreward = reward_function(el.state, init_goal)
                reward = HERreward + el.reward
                # Store this transition conditioned on init_goal :
                init_el = EXP(state=torch.cat([el.state, init_goal], dim=1),
                              action=el.action,
                              next_state=torch.cat([el.next_state, init_goal], dim=1),
                              reward=reward,
                              done=el.done)
                init_priority = memory.priority(torch.abs(init_el.reward).numpy())
                memory.add(init_el, init_priority)
                # Store for multiple substitute goals :
                # 1: sample new goals :
                goals = []
                for j in range(k):
                    goal = None
                    if strategy == 'final':
                        goal = sample_goal(episode_buffer, strategy=strategy)
                    elif strategy == 'future':
                        # watch out for the empty set...
                        index = min(len(episode_buffer) - 3, itexp)
                        goal = sample_goal(episode_buffer, strategy=index)
                    goals.append(goal)
                # For each goal ...
                for goal in goals:
                    # 2: ... compute the reward :
                    goalreward = reward_function(el.state, goal) + el.reward
                    # 3: ... store this transition conditioned on that goal :
                    goalel = EXP(state=torch.cat([el.state, goal], dim=1),
                                 action=el.action,
                                 next_state=torch.cat([el.next_state, goal], dim=1),
                                 reward=goalreward,
                                 done=el.done)
                    init_priority = memory.priority(torch.abs(goalel.reward).numpy())
                    memory.add(goalel, init_priority)
                del el
                del goals
            del episode_buffer
        bashlogger.info('Complete')
        if path is not None:
            savemodel(model, path + '.save')
            bashlogger.info('Model saved : {}'.format(path))
        env.close()
    except Exception as e:
        bashlogger.exception(e)
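# The HER relabelling above depends on two helpers defined elsewhere in the repository:
# reward_function(state, goal), which scores an achieved state against a goal, and
# sample_goal(episode_buffer, strategy=...), which is called either with strategy='final'
# or with an integer index for the 'future' strategy. The implementations below are hedged
# sketches of plausible versions (sparse goal reward, uniform sampling over later
# transitions); the *_sketch names are illustrative and are not the repository's helpers.
import random

import torch

def sample_goal_sketch(episode_buffer, strategy='final'):
    # 'final': use the last achieved state of the episode as the substitute goal.
    # integer index: 'future' replay -- pick a state achieved at or after that step.
    if strategy == 'final':
        return episode_buffer[-1].state
    idx = random.randint(strategy, len(episode_buffer) - 1)
    return episode_buffer[idx].state

def reward_function_sketch(state, goal, tolerance=1e-1):
    # Sparse goal-conditioned reward: 0 when the achieved state matches the goal up to
    # a tolerance, -1 otherwise, returned as a 1-element tensor so that it can be added
    # to the stored per-step reward.
    return torch.zeros(1) if torch.dist(state, goal).item() < tolerance else -torch.ones(1)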
def accumulateMemory(self, memory, env, model, preprocess, epsstart, epsend, epsdecay, k=4, strategy='future'):
    try:
        # exploration counter :
        steps_done = [0]
        global MIN_MEMORY
        i = 0
        while len(memory) < MIN_MEMORY:
            bashlogger.info('Episode : {} : memory : {}/{}'.format(i, len(memory), memory.capacity))
            i += 1
            cumul_reward = 0.0
            last_screen = get_screen_reset(env, preprocess=preprocess)
            current_screen, reward, done, info = get_screen(env, env.action_space.sample(), preprocess=preprocess)
            state = current_screen - last_screen
            episode_buffer = []
            init_goal = state
            init_goal = torch.zeros(current_screen.size())
            '''
            if len(memory):
                init_goal = sample_init_goal(memory)
            '''
            showcount = 0
            for t in count():
                model.eval()
                # HER : condition the policy on the concatenation of state and goal :
                stategoal = torch.cat([state, init_goal], dim=1)
                #action = select_action(model, state, steps_done=steps_done, epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)
                action = select_action(model, stategoal, steps_done=steps_done, epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)
                last_screen = current_screen
                current_screen, reward, done, info = get_screen(env, action[0, 0], preprocess=preprocess)
                cumul_reward += reward
                if rendering:
                    if showcount >= 10:
                        showcount = 0
                        render(current_screen)  # env.render()
                    else:
                        showcount += 1
                if not done:
                    next_state = current_screen - last_screen
                else:
                    # Note: the terminal next_state is filled with ones here, unlike the
                    # zero-filled terminal states used in the other training loops.
                    next_state = torch.ones(current_screen.size())
                episode_buffer.append(EXP(state, action, next_state, reward, done))
                state = next_state
                if done:
                    log = 'Episode duration : {}'.format(t + 1) + '---' + ' Reward : {}'.format(cumul_reward)
                    bashlogger.info(log)
                    break
            # Add this episode_buffer to the replayBuffer :
            for itexp in range(len(episode_buffer)):
                el = episode_buffer[itexp]
                # HER : reward with respect to the initial goal :
                HERreward = reward_function(el.state, init_goal)
                reward = HERreward + el.reward
                # Store this transition conditioned on init_goal :
                init_el = EXP(state=torch.cat([el.state, init_goal], dim=1),
                              action=el.action,
                              next_state=torch.cat([el.next_state, init_goal], dim=1),
                              reward=reward,
                              done=el.done)
                init_priority = memory.priority(torch.abs(init_el.reward).numpy())
                memory.add(init_el, init_priority)
                # Store for multiple substitute goals :
                # 1: sample new goals :
                goals = []
                for j in range(k):
                    goal = None
                    if strategy == 'final':
                        goal = sample_goal(episode_buffer, strategy=strategy)
                    elif strategy == 'future':
                        # watch out for the empty set...
                        index = min(len(episode_buffer) - 3, itexp)
                        goal = sample_goal(episode_buffer, strategy=index)
                    goals.append(goal)
                # For each goal ...
                for goal in goals:
                    # 2: ... compute the reward :
                    goalreward = reward_function(el.state, goal) + el.reward
                    # 3: ... store this transition conditioned on that goal :
                    goalel = EXP(state=torch.cat([el.state, goal], dim=1),
                                 action=el.action,
                                 next_state=torch.cat([el.next_state, goal], dim=1),
                                 reward=goalreward,
                                 done=el.done)
                    init_priority = memory.priority(torch.abs(goalel.reward).numpy())
                    memory.add(goalel, init_priority)
                del el
                del goals
            del episode_buffer
        bashlogger.info('MEMORY : initialization : complete.')
    except Exception as e:
        bashlogger.exception(e)
def trainIN(self, index, model, env, memory, optimizers, logger=None, preprocess=T.ToTensor(), path=None, frompath=None,
            num_episodes=1000, HER={'use_her': True, 'k': 4, 'strategy': 'future', 'singlegoal': False}, use_cuda=True, rendering=False):
    try:
        episode_durations = []
        episode_reward = []
        episode_qsa = []
        episode_grad_actor = []
        episode_loss = []
        #accumulateMemory(memory, env, models, preprocess, epsstart=0.5, epsend=0.3, epsdecay=200, k=k, strategy=strategy)
        usehd = False
        if usehd:
            from utils.histogram import HistogramDebug
            hd = HistogramDebug()
            hd.setXlimit(-2.5, 2.5)
        reward_scaler = 10.0
        for i in range(num_episodes):
            bashlogger.info('Episode : {} : memory : {}/{}'.format(i, len(memory), memory.capacity))
            cumul_reward = 0.0
            last_state = get_state_reset(env, preprocess=preprocess)
            state, reward, done, info = get_state(env, env.action_space.sample(), preprocess=preprocess)
            episode_buffer = []
            meanfreq = 0
            episode_qsa_buffer = []
            episode_closs_buffer = []
            episode_aloss_buffer = []
            episode_grad_actor_buffer = []
            action_buffer = []
            # HER : sample the initial goal :
            if HER['use_her']:
                #if not HER['singlegoal']:
                #    init_goal = sample_init_goal(memory)
                #else:
                #    init_goal = torch.zeros(state.size())
                init_goal = torch.zeros(state.size())
            showcount = 0
            for t in count():
                since = time.time()
                # HER : condition on the concatenation of state and goal :
                if HER['use_her']:
                    evalstate = torch.cat([state, init_goal], dim=1)
                else:
                    evalstate = state
                # Every fifth episode is run greedily, for evaluation :
                if i % 5 == 0:
                    action = model.act(evalstate, exploitation=True)
                else:
                    action = model.act(evalstate, exploitation=False)
                action_buffer.append(action)
                taction = torch.from_numpy(action.astype(np.float32))
                last_state = evalstate
                state, reward, done, info = get_state(env, action, preprocess=preprocess)
                reward /= reward_scaler
                cumul_reward += float(reward)
                treward = torch.from_numpy(reward.astype(np.float32))
                if rendering:
                    if showcount >= 1:
                        showcount = 0
                        #render(current_state)
                        #plt.imshow(env.render(mode='rgb_array'))
                        env.render()
                    else:
                        showcount += 1
                terminal = 0.0
                if done:
                    terminal = 1.0
                tterminal = terminal * torch.ones((1))
                episode_buffer.append(EXP(last_state, taction, state, treward, tterminal))
                episode_qsa_buffer.append(model.evaluate(evalstate, taction))
                # Optimize the model :
                retloss = model.optimize(optimizer_critic=optimizers['critic'], optimizer_actor=optimizers['actor'])
                if retloss is not None:
                    critic_loss, actor_loss, actor_grad = retloss
                    episode_closs_buffer.append(np.mean(critic_loss))
                    episode_aloss_buffer.append(np.mean(actor_loss))
                    episode_grad_actor_buffer.append(actor_grad)
                else:
                    episode_closs_buffer.append(0)
                    episode_aloss_buffer.append(0)
                    episode_grad_actor_buffer.append(0)
                elt = time.time() - since
                f = 1.0 / elt
                meanfreq = (meanfreq * (t + 1) + f) / (t + 2)
                #print('{} Hz ; {} seconds.'.format(f, elt))
                if done or t > MAX_STEP:
                    episode_durations.append(t + 1)
                    episode_reward.append(cumul_reward)
                    meancloss = np.mean(episode_closs_buffer)
                    meanaloss = np.mean(episode_aloss_buffer)
                    episode_loss.append(meancloss)
                    meanqsa = np.mean(episode_qsa_buffer)
                    maxqsa = np.max(episode_qsa_buffer)
                    episode_qsa.append(meanqsa)
                    meanactorgrad = np.max(episode_grad_actor_buffer)
                    episode_grad_actor.append(meanactorgrad)
                    meanaction = np.mean(action_buffer)
                    sigmaaction = np.std(action_buffer)
                    action_buffer = np.array(action_buffer).reshape((-1))
                    if usehd:
                        hd.append(np.array(action_buffer))
                    log = 'Episode duration : {}'.format(t + 1) + '---' + ' Action : mu:{:.4f} sig:{:.4f} // Reward : {} // Mean C/A Losses : {:.4f}/{:.4f} // Mean/MaxQsa : {:.4f}/{:.4f} // Mean Actor Grad : {:.8f}'.format(meanaction, sigmaaction, cumul_reward, meancloss, meanaloss, meanqsa, maxqsa, meanactorgrad) + '---' + ' {}Hz'.format(meanfreq)
                    if i % 5 == 0:
                        log = 'EVAL :: ' + log
                    bashlogger.info(log)
                    if logger is not None:
                        new = {'episodes': [i], 'duration': [t + 1], 'reward': [cumul_reward], 'mean frequency': [meanfreq], 'critic loss': [meancloss], 'actor loss': [meanaloss], 'max qsa': [maxqsa], 'mean qsa': [meanqsa], 'mean action': [meanaction]}
                        logger.append(new)
                    if path is not None:
                        model.save(path + '.save')
                        bashlogger.info('Model saved : {}'.format(path))
                    break
            # Compute the returns :
            episode_buffer = model.compute_returns(episode_buffer)
            # Add this episode_buffer to the replayBuffer :
            if HER['use_her']:
                for itexp in range(len(episode_buffer)):
                    el = episode_buffer[itexp]
                    # HER : reward with respect to the initial goal :
                    HERreward = reward_function(el.state, init_goal)
                    reward = HERreward + el.reward
                    # Store this transition conditioned on init_goal :
                    init_el = EXP(state=torch.cat([el.state, init_goal], dim=1),
                                  action=el.action,
                                  next_state=torch.cat([el.next_state, init_goal], dim=1),
                                  reward=reward,
                                  done=el.done)
                    init_priority = memory.priority(torch.abs(init_el.reward).numpy())
                    memory.add(init_el, init_priority)
                    # Store for multiple substitute goals :
                    # 1: sample new goals :
                    goals = []
                    for j in range(HER['k']):
                        goal = None
                        if HER['strategy'] == 'final':
                            goal = sample_goal(episode_buffer, strategy=HER['strategy'])
                        elif HER['strategy'] == 'future':
                            # watch out for the empty set...
                            index = min(len(episode_buffer) - 3, itexp)
                            goal = sample_goal(episode_buffer, strategy=index)
                        goals.append(goal)
                    # For each goal ...
                    for goal in goals:
                        # 2: ... compute the reward :
                        goalreward = reward_function(el.state, goal) + el.reward
                        # 3: ... store this transition conditioned on that goal :
                        goalel = EXP(state=torch.cat([el.state, goal], dim=1),
                                     action=el.action,
                                     next_state=torch.cat([el.next_state, goal], dim=1),
                                     reward=goalreward,
                                     done=el.done)
                        init_priority = memory.priority(torch.abs(goalel.reward).numpy())
                        memory.add(goalel, init_priority)
                    del el
                    del goals
            else:
                if isinstance(memory, PrioritizedReplayBuffer):
                    for el in episode_buffer:
                        # Store this transition with a reward-based initial priority :
                        init_priority = memory.priority(torch.abs(el.reward).numpy())
                        memory.add(el, init_priority)
                else:
                    for el in episode_buffer:
                        # Store this transition :
                        memory.add(el)
            del episode_buffer
            # Check memory consumption and clear memory :
            gc.collect()
        bashlogger.info('Learning complete.')
        if path is not None:
            savemodel(model, path + '.save')
            bashlogger.info('Model saved : {}'.format(path))
        env.close()
    except Exception as e:
        bashlogger.exception(e)
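# The loops above only assume a small interface from the replay memory: len(memory),
# memory.capacity, memory.priority(error) and memory.add(exp[, priority]) (plus a
# memory.sample(...) used inside the optimization routines). The class below is a hedged
# sketch of a minimal buffer exposing that interface; the class name, the alpha exponent,
# and the FIFO eviction policy are illustrative assumptions, not the repository's actual
# PrioritizedReplayBuffer.
import numpy as np

class PrioritizedReplayBufferSketch:
    def __init__(self, capacity, alpha=0.6, eps=1e-3):
        self.capacity = capacity
        self.alpha = alpha
        self.eps = eps
        self.data = []
        self.priorities = []

    def priority(self, error):
        # Map an |error| (or |reward|) magnitude to a sampling priority.
        e = float(np.abs(np.asarray(error)).max())
        return (e + self.eps) ** self.alpha

    def add(self, exp, priority=None):
        if priority is None:
            priority = max(self.priorities, default=1.0)
        if len(self.data) >= self.capacity:
            # FIFO eviction; a production buffer would typically use a sum-tree instead.
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append(exp)
        self.priorities.append(priority)

    def sample(self, batch_size):
        probs = np.array(self.priorities) / np.sum(self.priorities)
        idx = np.random.choice(len(self.data), size=batch_size, p=probs)
        return [self.data[j] for j in idx]

    def __len__(self):
        return len(self.data)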
def train(model, env, memory, optimizer, preprocess=T.ToTensor(), path=None, frompath=None,
          num_episodes=1000, epsend=0.05, epsstart=0.9, epsdecay=200):
    episode_durations = []
    episode_reward = []
    global rendering
    #exploration counter ;
    global steps_done
    steps_done = 0
    if frompath is not None:
        loadmodel(model, frompath)
        print('Model loaded: {}'.format(frompath))
    for i in range(num_episodes):
        print('Episode : {} : memory : {}/{}'.format(i, len(memory), memory.capacity))
        cumul_reward = 0.0
        last_screen = get_screen_reset(env, preprocess=preprocess)
        current_screen, reward, done, info = get_screen(env, env.action_space.sample(), preprocess=preprocess)
        state = current_screen - last_screen
        episode_buffer = []
        for t in count():
            action = select_action(model, state, epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)
            last_screen = current_screen
            current_screen, reward, done, info = get_screen(env, action[0, 0], preprocess=preprocess)
            cumul_reward += reward
            if rendering:
                env.render()
            reward = Tensor([reward])
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = torch.zeros(current_screen.size())
            episode_buffer.append(EXP(state, action, next_state, reward, done))
            state = next_state
            since = time.time()
            optimize_model(model, memory, optimizer)
            elt = time.time() - since
            f = 1.0 / elt
            #print('{} Hz ; {} seconds.'.format(f,elt) )
            if done:
                episode_durations.append(t + 1)
                episode_reward.append(cumul_reward)
                print('Epoch duration : {}'.format(t + 1))
                print('Cumulative Reward : {}'.format(cumul_reward))
                if path is not None:
                    savemodel(model, path)
                    print('Model saved : {}'.format(path))
                #plot_durations()
                break
        #Let us add this episode_buffer to the replayBuffer :
        for el in episode_buffer:
            init_priority = memory.priority(torch.abs(el.reward).numpy())
            memory.add(el, init_priority)
        del episode_buffer
    print('Complete')
    if path is not None:
        savemodel(model, path)
        print('Model saved : {}'.format(path))
    env.close()
    plt.ioff()
    plt.show()
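# Hypothetical wiring of the train() function above. The environment id, the network and
# buffer constructors, the capacity, and the learning rate are placeholders chosen for
# illustration only; DuelingDQN's constructor is assumed from the commented-out line in
# the last listing of this file, and PrioritizedReplayBuffer's from its isinstance check.
import gym
import torch.optim as optim

if __name__ == '__main__':
    env = gym.make('CartPole-v0')                      # any discrete-action, renderable env
    model = DuelingDQN(env.action_space.n)             # assumed network class from the repo
    memory = PrioritizedReplayBuffer(capacity=10000)   # assumed buffer class from the repo
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    train(model, env, memory, optimizer,
          path='./dqn_model', num_episodes=500,
          epsend=0.05, epsstart=0.9, epsdecay=200)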
def train(model, env, memory, optimizer, logger=None, preprocess=T.ToTensor(), path=None, frompath=None,
          num_episodes=1000, epsend=0.05, epsstart=0.9, epsdecay=200):
    try:
        episode_durations = []
        episode_reward = []
        episode_loss = []
        global rendering
        #exploration counter ;
        steps_done = [0]
        accumulateMemory(memory, env, model, preprocess, epsstart=0.99, epsend=0.9, epsdecay=200)
        for i in range(num_episodes):
            bashlogger.info('Episode : {} : memory : {}/{}'.format(i, len(memory), memory.capacity))
            cumul_reward = 0.0
            last_screen = get_screen_reset(env, preprocess=preprocess)
            current_screen, reward, done, info = get_screen(env, env.action_space.sample(), preprocess=preprocess)
            state = current_screen - last_screen
            episode_buffer = []
            meanfreq = 0
            episode_loss_buffer = []
            for t in count():
                model.eval()
                action = select_action(model, state, steps_done=steps_done, epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)
                last_screen = current_screen
                current_screen, reward, done, info = get_screen(env, action[0, 0], preprocess=preprocess)
                cumul_reward += reward
                if rendering:
                    env.render()
                reward = Tensor([reward])
                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = torch.zeros(current_screen.size())
                episode_buffer.append(EXP(state, action, next_state, reward, done))
                state = next_state
                since = time.time()
                lossnp = optimize_model(model, memory, optimizer)
                if lossnp is not None:
                    episode_loss_buffer.append(np.mean(lossnp))
                else:
                    episode_loss_buffer.append(0)
                elt = time.time() - since
                f = 1.0 / elt
                meanfreq = (meanfreq * (t + 1) + f) / (t + 2)
                #print('{} Hz ; {} seconds.'.format(f,elt) )
                if done:
                    episode_durations.append(t + 1)
                    episode_reward.append(cumul_reward)
                    meanloss = np.mean(episode_loss_buffer)
                    episode_loss.append(meanloss)
                    log = 'Episode duration : {}'.format(t + 1) + '---' + ' Reward : {} // Mean Loss : {}'.format(cumul_reward, meanloss) + '---' + ' {}Hz'.format(meanfreq)
                    bashlogger.info(log)
                    if logger is not None:
                        new = {'episodes': [i], 'duration': [t + 1], 'reward': [cumul_reward], 'mean frequency': [meanfreq], 'loss': [meanloss]}
                        logger.append(new)
                    if path is not None:
                        savemodel(model, path + '.save')
                        bashlogger.info('Model saved : {}'.format(path))
                    #plot_durations()
                    break
            #Let us add this episode_buffer to the replayBuffer :
            for el in episode_buffer:
                init_priority = memory.priority(torch.abs(el.reward).numpy())
                memory.add(el, init_priority)
            del episode_buffer
        bashlogger.info('Complete')
        if path is not None:
            savemodel(model, path)
            bashlogger.info('Model saved : {}'.format(path))
        env.close()
    except Exception as e:
        bashlogger.exception(e)
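# The loops above call an optimize_model(model, memory, optimizer) routine that is defined
# elsewhere in the repository and returns the batch losses (or None while the buffer is
# still filling). The function below is a hedged sketch of a standard one-step TD update
# under assumed names: the batch size, the discount factor, and the assumption that
# memory.sample() returns a plain list of EXP transitions (a prioritized buffer would also
# return indices and importance weights) are all illustrative, not the repository's values.
import numpy as np
import torch
import torch.nn.functional as F

BATCH_SIZE_SKETCH = 32
GAMMA_SKETCH = 0.99

def optimize_model_sketch(model, memory, optimizer):
    if len(memory) < BATCH_SIZE_SKETCH:
        return None
    batch = memory.sample(BATCH_SIZE_SKETCH)
    states = torch.cat([e.state for e in batch])
    actions = torch.cat([e.action for e in batch])            # (B, 1) LongTensor
    rewards = torch.cat([e.reward for e in batch])            # (B,)
    next_states = torch.cat([e.next_state for e in batch])
    not_done = torch.tensor([0.0 if e.done else 1.0 for e in batch])

    # Q(s, a) for the taken actions, and the bootstrapped one-step TD target.
    qsa = model(states).gather(1, actions).squeeze(1)
    with torch.no_grad():
        next_q = model(next_states).max(1)[0]
    target = rewards + GAMMA_SKETCH * not_done * next_q

    losses = F.smooth_l1_loss(qsa, target, reduction='none')
    optimizer.zero_grad()
    losses.mean().backward()
    optimizer.step()
    return losses.detach().cpu().numpy()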
def train(self, model, env, memory, optimizer, logger=None, preprocess=T.ToTensor(), path=None, frompath=None,
          num_episodes=1000, epsend=0.05, epsstart=0.9, epsdecay=10):
    try:
        episode_durations = []
        episode_reward = []
        episode_loss = []
        global rendering
        global use_cuda
        global MAX_STEPS
        #exploration counter ;
        steps_done = [0]
        #Double Network initialization :
        savemodel(model, path + '.save')
        #model_ = DuelingDQN(model.nbr_actions)
        model_ = copy.deepcopy(model)
        hard_update(model_, model)
        model_.eval()
        if use_cuda:
            model_ = model_.cuda()
        for i in range(num_episodes):
            bashlogger.info('Episode : {} : memory : {}/{}'.format(i, len(memory), memory.capacity))
            cumul_reward = 0.0
            last_screen = get_screen_reset(env, preprocess=preprocess)
            current_screen, reward, done, info = get_screen(env, env.action_space.sample(), preprocess=preprocess)
            state = current_screen - last_screen
            episode_buffer = []
            meanfreq = 0
            episode_loss_buffer = []
            episode_qsa_buffer = []
            showcount = 0
            for t in count():
                action, qsa = select_action(model, state, steps_done=steps_done, epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)
                episode_qsa_buffer.append(qsa)
                last_screen = current_screen
                current_screen, reward, done, info = get_screen(env, action[0, 0], preprocess=preprocess)
                cumul_reward += reward
                if rendering:
                    if showcount >= 10:
                        showcount = 0
                        render(current_screen)  # env.render()
                    else:
                        showcount += 1
                reward = FloatTensor([reward])
                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = torch.zeros(current_screen.size())
                episode_buffer.append(EXP(state, action, next_state, reward, done))
                state = next_state
                # OPTIMIZE MODEL :
                since = time.time()
                lossnp = self.optimize_model(model, model_, memory, optimizer)
                if lossnp is not None:
                    episode_loss_buffer.append(np.mean(lossnp))
                else:
                    episode_loss_buffer.append(0)
                # SOFT UPDATE :
                soft_update(model_, model, self.TAU)
                elt = time.time() - since
                f = 1.0 / elt
                meanfreq = (meanfreq * (t + 1) + f) / (t + 2)
                if done or t > MAX_STEPS:
                    self.from_worker2model()
                    '''
                    nbrTrain = 2
                    for tr in range(nbrTrain):
                        since = time.time()
                        lossnp = optimize_model(model, model_, memory, optimizer)
                        if lossnp is not None:
                            episode_loss_buffer.append(np.mean(lossnp))
                        else:
                            episode_loss_buffer.append(0)
                        elt = time.time() - since
                        f = 1.0 / elt
                        meanfreq = (meanfreq * (tr + 1) + f) / (tr + 2)
                        #print('{} Hz ; {} seconds.'.format(f,elt) )
                    '''
                    episode_durations.append(t + 1)
                    episode_reward.append(cumul_reward)
                    meanloss = np.mean(episode_loss_buffer)
                    episode_loss.append(meanloss)
                    meanqsa = np.mean(episode_qsa_buffer)
                    log = 'Episode duration : {}'.format(t + 1) + '---' + ' Reward : {} // Mean Loss : {} // QSA : {}'.format(cumul_reward, meanloss, meanqsa) + '---' + ' {}Hz'.format(meanfreq)
                    bashlogger.info(log)
                    if logger is not None:
                        new = {'episodes': [i], 'duration': [t + 1], 'reward': [cumul_reward], 'mean frequency': [meanfreq], 'loss': [meanloss]}
                        logger.append(new)
                    if path is not None:
                        # SAVE THE MAIN MODEL :
                        self.model.lock()
                        savemodel(self.model, path + '.save')
                        self.model.unlock()
                        bashlogger.info('Model saved : {}'.format(path))
                    #plot_durations()
                    break
            #Let us add this episode_buffer to the replayBuffer :
            for el in episode_buffer:
                init_priority = memory.priority(torch.abs(el.reward).cpu().numpy())
                memory.add(el, init_priority)
            del episode_buffer
        bashlogger.info('Complete')
        if path is not None:
            savemodel(model, path + '.save')
            bashlogger.info('Model saved : {}'.format(path))
        env.close()
    except Exception as e:
        bashlogger.exception(e)
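# The double-network variants above call hard_update(target, source) once at initialization
# and soft_update(target, source, tau) after every optimization step. Those helpers are
# defined elsewhere in the repository; the versions below are a sketch assuming the usual
# convention (full parameter copy, then Polyak averaging with rate tau = self.TAU).
def hard_update_sketch(target, source):
    # Copy every parameter of the online network into the target network.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update_sketch(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)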