Example #1
            # record per-player rewards and losses at each logging step
            all_rewards1.append(final_rewards1.mean().item())  # assumed: mirrors the player 2 bookkeeping below
            all_losses1.append(loss_p1.item())
            all_rewards2.append(final_rewards2.mean().item())
            all_losses2.append(loss_p2.item())
            print('step %s' % (i_update))
            print('reward Player1: %s' % np.mean(all_rewards1[-10:]))                        
            print('reward Player2: %s' % np.mean(all_rewards2[-10:]))            
            print('loss Player1 %s' % all_losses1[-1])
            print('loss Player2 %s' % all_losses2[-1])
            print("---------------------------")
            
            # update wall-clock statistics and estimate the remaining runtime
            timer.update(time.time())
            timediff = timer.getTimeDiff()
            total_time = timer.getTotalTime()
            loopstogo = (num_frames - i_update) / 100  # remaining 100-update logging intervals
            estimatedtimetogo = timer.getTimeToGo(loopstogo)
            logger.printDayFormat("runtime last epochs: ", timediff)
            logger.printDayFormat("total runtime: ", total_time)
            logger.printDayFormat("estimated time to run: ", estimatedtimetogo)
            print("######## {0} ########".format(sys.argv[1]))
        rollout1.after_update() # player1
        rollout2.after_update() # player2

        # every 1000 updates, write the reward/loss histories to disk and checkpoint both
        # agents, alternating between two save slots via swich_variable (0/1)
        if i_update % 1000 == 0 and i_update > 0:
            logger.log(all_rewards1, "Data/", "all_rewards_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))  
            logger.log(all_losses1, "Data/", "all_losses_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))      
            logger.log_state_dict(agent1.state_dict(), "Data/agents/agent1_{0}_{1}".format(sys.argv[1], swich_variable))    
            logger.log(all_rewards2, "Data/", "all_rewards_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))  
            logger.log(all_losses2, "Data/", "all_losses_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))      
            logger.log_state_dict(agent2.state_dict(), "Data/agents/agent2_{0}_{1}".format(sys.argv[1], swich_variable))
            swich_variable += 1
            swich_variable %= 2
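
Both examples lean on `timer` and `logger` helpers (`update`, `getTimeDiff`, `getTotalTime`, `getTimeToGo`, `printDayFormat`, `log`, `log_state_dict`) whose definitions are not shown. The sketch below exists only to make those calls self-explanatory; the class names `Timer`/`Logger` and every implementation detail are assumptions, not the original code.

import os
import time
import torch

class Timer:
    """Minimal stand-in for the timer object used above (interface assumed from the calls)."""
    def __init__(self):
        self.start = time.time()
        self.last = self.start
        self.diff = 0.0

    def update(self, now):
        # remember how long the last logging interval took
        self.diff = now - self.last
        self.last = now

    def getTimeDiff(self):
        return self.diff                  # seconds spent in the last interval

    def getTotalTime(self):
        return self.last - self.start     # seconds since the timer was created

    def getTimeToGo(self, loopstogo):
        return self.diff * loopstogo      # naive ETA: last interval length * remaining intervals


class Logger:
    """Minimal stand-in for the logger object used above (interface assumed from the calls)."""
    def printDayFormat(self, label, seconds):
        # print a duration as "<label> Nd HH:MM:SS"
        days, rem = divmod(int(seconds), 86400)
        hours, rem = divmod(rem, 3600)
        minutes, secs = divmod(rem, 60)
        print("{0}{1}d {2:02d}:{3:02d}:{4:02d}".format(label, days, hours, minutes, secs))

    def log(self, values, folder, filename):
        # overwrite the file with one scalar per line
        os.makedirs(folder, exist_ok=True)
        with open(os.path.join(folder, filename), "w") as f:
            f.writelines("{0}\n".format(v) for v in values)

    def log_state_dict(self, state_dict, path):
        # checkpoint the network weights with torch.save
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        torch.save(state_dict, path)
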
Example #2
        # standard actor-critic loss: squared advantages for the critic, the
        # policy-gradient term for the actor, and an entropy bonus for exploration
        value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.detach() * action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef

        loss.backward()
        nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
        optimizer.step()
    
        if i_update % 100 == 0:
            all_rewards.append(final_rewards.mean().item())  # .item() keeps plain floats in the history
            all_losses.append(loss.item())
            timer.update(time.time())
            loopstogo = (num_frames - i_update) / 100
            estimatedtimetogo = timer.getTimeToGo(loopstogo)
            
            print('epoch %s. reward: %s' % (i_update, np.mean(all_rewards[-10:])))            
            print('loss %s' % all_losses[-1])
            logger.printDayFormat("estimated time to run: ", estimatedtimetogo)
            print("######## AC_Pacman_{0} ########".format(mode))                        
        rollout.after_update()
        
    # final dump of the training curves and the trained network weights
    logger.log(all_rewards, "Data/", "all_rewards_{0}.txt".format(mode))
    logger.log(all_losses, "Data/", "all_losses_{0}.txt".format(mode))
    logger.log_state_dict(actor_critic.state_dict(), "Data/actor_critic_{0}".format(mode))
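
Example #2 assumes that `advantages`, `action_log_probs` and `entropy` were already computed from the current rollout. As a rough sketch of how those tensors are typically obtained in an actor-critic setup (the function name `a2c_loss_terms`, the `(values, action_logits)` return signature and the discrete `Categorical` policy are assumptions, not part of the original code):

import torch

def a2c_loss_terms(actor_critic, obs, actions, returns):
    # Forward pass: the model is assumed to return a state-value estimate and
    # the logits of a discrete policy for each observation in the batch.
    values, action_logits = actor_critic(obs)
    dist = torch.distributions.Categorical(logits=action_logits)

    action_log_probs = dist.log_prob(actions)   # log pi(a_t | s_t)
    entropy = dist.entropy().mean()             # entropy bonus term
    advantages = returns - values.squeeze(-1)   # A_t = R_t - V(s_t)
    return advantages, action_log_probs, entropy

Detaching the advantages in the policy term (the snippet's `advantages.detach()`, originally written as `advantages.data`) keeps the policy-gradient term from backpropagating into the value estimate, so the critic is trained only through value_loss.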