Example no. 1


# init game
done = False
state = env.reset()
total_reward = 0
step = 1

while True:
    current_state = torch.FloatTensor(state).unsqueeze(0)
    if USE_CUDA:
        current_state = current_state.cuda()
   
    action = actor_critic.act(current_state)
    next_state, reward, done, _ = env.step(action.data[0,0])
    total_reward += reward
    state = next_state

    _, value = actor_critic(current_state)
    value = value.data.cpu().numpy()    
    
    image = torch.FloatTensor(state).permute(1,2,0).cpu().numpy()
    displayImage(image, step, total_reward, value)
    step += 1
    if done:
        state = env.reset()
        step = 1
        total_reward = 0
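
A sketch of the displayImage helper that Examples 1 and 6 rely on but that never appears in the listing. It assumes a notebook setting and an image already in H x W x C order with values imshow can draw; every name and detail below is an assumption, not the original helper.

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output

def displayImage(image, step, total_reward, value):
    # Redraw the current frame inline and annotate it with progress info.
    clear_output(wait=True)
    plt.figure(figsize=(6, 6))
    plt.imshow(image)
    plt.title("step %d | reward %.1f | value %.2f"
              % (step, total_reward, float(np.squeeze(value))))
    plt.axis("off")
    plt.show()
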
Example no. 2
    episode_rewards2 = torch.zeros(num_envs, 1)
    final_rewards2   = torch.zeros(num_envs, 1)    

    timer.update(time.time())
    switch_variable = 0

    ##### training observation ######
    traiobsenv = makeTrainingObservation()
    trainobs = traiobsenv.reset()
    #################################

    for i_update in range(num_frames):
        for step in range(num_steps):                             
            
            # actor1 acts in all parallel envs
            action_p1 = agent1.act(make_cuda(state)).squeeze(1).cpu().numpy() 
            
            # actor2 acts in all parallel envs
            action_p2 = agent2.act(make_cuda(state)).squeeze(1).cpu().numpy() 

            # separate actions
            action_tuples = []              
            for i in range(num_envs):
                actions = []
                actions.append(action_p1[i])    # player1
                actions.append(action_p2[i])    # player2
                action_tuples.append(actions)
            
            next_observation, reward, finished, _ = envs.step(action_tuples)    # pass actions to environments

            # separate rewards
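            # (sketch, not part of the original listing) one possible continuation,
            # assuming envs.step returns one (player1, player2) reward pair per
            # parallel env; the names below are assumptions
            reward = np.array(reward)
            reward_p1 = torch.FloatTensor(reward[:, 0]).unsqueeze(1)
            reward_p2 = torch.FloatTensor(reward[:, 1]).unsqueeze(1)
            episode_rewards2 += reward_p2

            finished_masks = torch.FloatTensor(1 - np.array(finished)).unsqueeze(1)
            final_rewards2 *= finished_masks
            final_rewards2 += (1 - finished_masks) * episode_rewards2
            episode_rewards2 *= finished_masks

            state = torch.FloatTensor(np.float32(next_observation))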
Example no. 3
    state = envs.reset() 
    
    state = torch.FloatTensor(np.float32(state))

    rollout.states[0].copy_(state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards   = torch.zeros(num_envs, 1)    

    timer.update(time.time())

    for i_update in range(num_frames):

        for step in range(num_steps):                 
                       
            action = actor_critic.act(make_cuda(state))
                               
            next_state, reward, finished, _ = envs.step(action.squeeze(1).cpu().data.numpy())                        

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            # mask is 0 for envs whose episode just finished, 1 otherwise
            finished_masks = torch.FloatTensor(1 - np.array(finished)).unsqueeze(1)

            # freeze the score of finished episodes, reset their running totals
            final_rewards *= finished_masks
            final_rewards += (1 - finished_masks) * episode_rewards
            episode_rewards *= finished_masks

            finished_masks = make_cuda(finished_masks)

            state = torch.FloatTensor(np.float32(next_state))
    
Example no. 4

    state = torch.FloatTensor(np.float32(state))
    if USE_CUDA:
        state = state.cuda()

    rollout.states[0].copy_(state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards   = torch.zeros(num_envs, 1)    

    timer.update(time.time())

    for i_update in range(num_frames):

        for step in range(num_steps):                             
            action = actor_critic.act(state)
                               
            next_state, reward, finished, _ = envs.step(action.squeeze(1).cpu().data.numpy())
               
            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            finished_masks = torch.FloatTensor(1-np.array(finished)).unsqueeze(1)                                                       

            final_rewards *= finished_masks
            final_rewards += (1-finished_masks) * episode_rewards                       
                                                              
            episode_rewards *= finished_masks
            state = torch.FloatTensor(np.float32(next_state))

            if USE_CUDA:
                state = state.cuda()
                finished_masks = finished_masks.cuda()
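            # (sketch, not in the original listing) the loop would typically store
            # the transition for the A2C update next; this insert signature is an
            # assumption modelled on the rollout.states[0].copy_ call above
            rollout.insert(step, state, action.data, reward, finished_masks)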
Example no. 5
    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    if USE_CUDA:
        actor_critic = actor_critic.cuda()
        rollout.cuda()

    rollout.states[0].copy_(state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    writer = new_writer(LABEL, arg)

    for i_update in tqdm(range(num_frames)):

        for step in range(num_steps):
            action = actor_critic.act(state.cuda())

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = process_reward(reward, MODE_REWARDS[mode])
            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if USE_CUDA:
                masks = masks.cuda()
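
A sketch of the process_reward helper used in Example 5; neither it nor the MODE_REWARDS table appears in the listing. The version below is purely an assumption: it treats MODE_REWARDS[mode] as a dict of weights over named reward components returned by each environment and collapses them into one scalar per env.

import numpy as np

def process_reward(raw_rewards, weights):
    # Weighted sum of each env's reward components (names and weights assumed).
    return np.array([sum(w * r.get(name, 0.0) for name, w in weights.items())
                     for r in raw_rewards], dtype=np.float32)
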
Example no. 6
    pretrained_dict = torch.load(agentPath, map_location='cpu')
    actor_critic.load_state_dict(pretrained_dict)

    actor_critic = make_cuda(actor_critic)

    # init game
    done = False
    state = env.reset()
    
    step = 1
    total_reward = 0
    
    while True:    
        current_state = torch.FloatTensor(state)
        
        action = actor_critic.act(make_cuda(current_state.unsqueeze(0)))        
        
        next_state, reward, done, _ = env.step(action.data[0][0])                
        total_reward += reward
        state = next_state

        _, value = actor_critic(make_cuda(current_state.unsqueeze(0)))        
        value = value.data.cpu().numpy()    
        
        image = torch.FloatTensor(upscale(state)).permute(1,2,0).cpu().numpy()                
                        
        displayImage(image, step, total_reward, value)
        step += 1
        if done:
            total_reward = 0
            state = env.reset()
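
The make_cuda helper (Examples 2, 3 and 6) and the upscale helper (Example 6) also never appear in the listing. A minimal sketch of both, assuming make_cuda only moves its argument to the GPU when USE_CUDA is set and upscale only enlarges a channel-first frame for display (the factor of 4 is an arbitrary assumption):

import numpy as np

def make_cuda(x):
    # Move a tensor or module to the GPU only when CUDA is enabled.
    return x.cuda() if USE_CUDA else x

def upscale(frame, factor=4):
    # Nearest-neighbour enlargement of a (C, H, W) frame before display.
    frame = np.asarray(frame)
    return frame.repeat(factor, axis=-2).repeat(factor, axis=-1)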