Example #1
            print('loss Player2 %s' % all_losses2[-1])
            print("---------------------------")
            
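            # update the timer and print elapsed time plus an estimate of the remaining runtime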
            timer.update(time.time())            
            timediff = timer.getTimeDiff()
            total_time = timer.getTotalTime()
            loopstogo = (num_frames - i_update) / 100
            estimatedtimetogo = timer.getTimeToGo(loopstogo)
            logger.printDayFormat("runntime last epochs: ", timediff)
            logger.printDayFormat("total runtime: ", total_time)
            logger.printDayFormat("estimated time to run: ", estimatedtimetogo)           
            print("######## {0} ########".format(sys.argv[1]))
        rollout1.after_update() # player1
        rollout2.after_update() # player2

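        # every 1000 updates: snapshot rewards, losses and both agents' weights, alternating between two file slots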
        if i_update % 1000 == 0 and i_update > 0:
            logger.log(all_rewards1, "Data/", "all_rewards_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))  
            logger.log(all_losses1, "Data/", "all_losses_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))      
            logger.log_state_dict(agent1.state_dict(), "Data/agents/agent1_{0}_{1}".format(sys.argv[1], swich_variable))    
            logger.log(all_rewards2, "Data/", "all_rewards_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))  
            logger.log(all_losses2, "Data/", "all_losses_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))      
            logger.log_state_dict(agent2.state_dict(), "Data/agents/agent2_{0}_{1}".format(sys.argv[1], swich_variable))
            swich_variable += 1
            swich_variable %= 2

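    # final save of rewards, losses and both agents' weights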
    logger.log(all_rewards1, "Data/", "all_rewards_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))  
    logger.log(all_losses1, "Data/", "all_losses_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))      
    logger.log_state_dict(agent1.state_dict(), "Data/agents/agent1_{0}_{1}".format(sys.argv[1], swich_variable))    
    logger.log(all_rewards2, "Data/", "all_rewards_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))  
    logger.log(all_losses2, "Data/", "all_losses_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))      
    logger.log_state_dict(agent2.state_dict(), "Data/agents/agent2_{0}_{1}".format(sys.argv[1], swich_variable))

Example #2
        # A2C loss terms: squared advantage for the critic, advantage-weighted log-probs for the actor
        value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.data * action_log_probs).mean()

        optimizer.zero_grad()        
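        # combined objective: weighted value loss plus policy loss, minus an entropy bonus to encourage exploration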
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        
        loss.backward()
        nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
        optimizer.step()
    
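        # every 100 updates: record mean reward and loss and print a progress report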
        if i_update % 100 == 0:            
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())
            timer.update(time.time())
            loopstogo = (num_frames - i_update) / 100
            estimatedtimetogo = timer.getTimeToGo(loopstogo)
            
            print('epoch %s. reward: %s' % (i_update, np.mean(all_rewards[-10:])))            
            print('loss %s' % all_losses[-1])
            logger.printDayFormat("estimated time to run: ", estimatedtimetogo)
            print("######## AC_Pacman_{0} ########".format(mode))                        
        rollout.after_update()
        
    logger.log(all_rewards, "Data/", "all_rewards_{0}.txt".format(mode))  
    logger.log(all_losses, "Data/", "all_losses_{0}.txt".format(mode))      
    logger.log_state_dict(actor_critic.state_dict(), "Data/actor_critic_{0}".format(mode))    

    

Example #3
            print("---------------------------")
            
            timer.update(time.time())            
            timediff = timer.getTimeDiff()
            total_time = timer.getTotalTime()
            loopstogo = (num_frames - i_update) / 100
            estimatedtimetogo = timer.getTimeToGo(loopstogo)
            logger.printDayFormat("runntime last epochs: ", timediff)
            logger.printDayFormat("total runtime: ", total_time)
            logger.printDayFormat("estimated time to run: ", estimatedtimetogo)                       
            print("######## AC_KeyCollect ########")
        
        rollout.after_update()
        
        # snapshot of weights, data, and optimizer every 1000 epochs
        if i_update % 1000 == 0 and i_update > 0:
            logger.log(all_rewards, "Data/", "all_rewards_KeyCollect.txt")            
            logger.log(all_losses, "Data/", "all_losses_KeyCollect.txt")                        
            logger.log_state_dict(actor_critic.state_dict(), "Data/actor_critic_KeyCollect")
            logger.log_state_dict(optimizer.state_dict(), "Data/actor_critic_optimizer_KeyCollect")            

    # final save        
    logger.log(all_rewards, "Data/", "all_rewards_KeyCollect.txt")    
    logger.log(all_losses, "Data/", "all_losses_KeyCollect.txt")        
    logger.log_state_dict(actor_critic.state_dict(), "Data/actor_critic_KeyCollect")
    logger.log_state_dict(optimizer.state_dict(), "Data/actor_critic_optimizer_KeyCollect")            




Example #4
                rollout.actions.view(-1, 1))

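            # reshape per-step outputs to (num_steps, num_envs, 1) and compute advantages as returns minus value estimates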
            values = values.view(num_steps, num_envs, 1)
            action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
            advantages = returns - values

            value_loss = advantages.pow(2).mean()
            action_loss = -(advantages.data * action_log_probs).mean()

            optimizer.zero_grad()
            loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
            loss.backward()
            nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
            optimizer.step()
            rollout.after_update()
        torch.save(actor_critic.state_dict(), a2c_model_path)
    print('Finished training A2C')

    def get_action(state):
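        # batch a single observation if needed, then query the policy without tracking gradients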
        if state.ndim == 4:
            state = FloatTensor(np.float32(state))
        else:
            state = FloatTensor(np.float32(state)).unsqueeze(0)
        with torch.no_grad():
            action = actor_critic.act(state)
        action = action.data.cpu().squeeze(1).numpy()
        return action

    def play_games(envs, frames):
        states = envs.reset()