import copy
import pickle as pkl
import random

import gym
import torch

# QNetwork and State are project-specific classes; import them from wherever
# they are defined in this repository.


def evaluate(model_path, history_num, max_episode_steps, episode_num, result_save_path):
    # restore the trained Q-network from the checkpoint
    checkpoint = torch.load(model_path)
    qnetwork = QNetwork(*checkpoint['model_hyper'])
    qnetwork.load_state_dict(checkpoint['model'])
    qnetwork.eval()

    env = gym.make('MountainCar-v0')
    test_success_history = []
    test_reward_history = []
    for episode in range(episode_num):
        print('episode %d'%(episode))
        observation = env.reset()
        # initialize the state wrapper from the first observation
        state = State(history_num)
        state.init_state(observation)
        done = False
        reward_sum = 0
    
        for t in range(max_episode_steps):
            env.render()
            state.display()
            # select the greedy action (the action with the highest Q value)
            action = qnetwork.decide_action(state.toTensor().view(1, -1))
            # the network returns a single-element tensor; convert it to a Python int
            action = action.sum().item()
            observation, reward, done, info = env.step(action)
            reward_sum += reward

            if done:
                print('done')
                print(reward_sum)
                # the episode counts as a success if the car reached the goal position (x >= 0.5)
                success = observation[0] >= 0.5
                test_success_history.append(success)
                test_reward_history.append(reward_sum)
                break

            state.update_state_by_observation(observation, action)

    print('- ' * 100)
    print('save to %s' % result_save_path)
    # persist the per-episode success flags and reward sums for later analysis
    with open(result_save_path, 'wb') as f:
        pkl.dump((test_success_history, test_reward_history), f)
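
# --- Usage sketch (illustrative; not part of the original snippet) ---
# All argument values below are hypothetical placeholders. evaluate() only
# assumes a checkpoint saved with the 'model' and 'model_hyper' keys loaded above.
if __name__ == '__main__':
    evaluate(
        model_path='checkpoint.pt',           # hypothetical checkpoint file
        history_num=4,                        # hypothetical number of stacked observations
        max_episode_steps=200,                # MountainCar-v0's default step limit
        episode_num=10,                       # hypothetical number of evaluation episodes
        result_save_path='eval_results.pkl',  # hypothetical output file
    )
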
Example #2
    done = False
    final_transition = None
    loss_sum = 0
    reward_sum = 0

    for t in range(max_episode_steps):
        if done:
            break
        env.render()
        state.display()
        # epsilon-greedy action selection: explore with probability epsilon,
        # otherwise take the action with the highest Q value
        p = random.random()
        if p < epsilon:
            action = env.action_space.sample()
        else:
            action = qnetwork.decide_action(state.toTensor().view(1, -1))
            # the network returns a single-element tensor; convert it to a Python int
            assert list(action.shape) == [1]
            action = action.sum().item()

        observation, reward, done, info = env.step(action)

        if done:
            # every step in MountainCar yields a reward of -1, so reward_sum reflects episode length
            avg_loss = loss_sum / max_episode_steps
            print('done')
            print(avg_loss)
            loss_history.append(avg_loss)
            reward_sum_history.append(reward_sum)

        old_state = copy.deepcopy(state)
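
# --- Illustrative sketch (not the continuation of the loop above) ---
# A minimal example of the DQN temporal-difference update that a loop like this
# typically performs once it has a transition (old_state, action, reward,
# next_state, done). `q_net`, `optimizer`, and `gamma` are hypothetical
# stand-ins; the original snippet's training API is not shown here.
import torch
import torch.nn.functional as F

def td_update_step(q_net, optimizer, old_state, action, reward, next_state, done, gamma=0.99):
    # Q(s, a) for the action that was actually taken; inputs are batched (1, state_dim) tensors
    q_value = q_net(old_state)[0, action]
    with torch.no_grad():
        # bootstrap target: r + gamma * max_a' Q(s', a'), with no bootstrap on terminal steps
        next_q = q_net(next_state).max(dim=1).values[0]
        target = reward + gamma * next_q * (0.0 if done else 1.0)
    loss = F.mse_loss(q_value, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()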