import numpy as np
import torch

# Policy, Network and LunarLander are defined elsewhere in the project.


def eval(model_type=model_type, model_path=model_path):
    """Evaluate a trained model over 50 episodes and report win statistics.

    The defaults refer to module-level `model_type` and `model_path` settings.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    env = LunarLander()
    if model_type == 'policy':
        model = Policy(env.observation_dim, env.action_dim)
    elif model_type == 'dqn':
        model = Network(env.observation_dim, env.action_dim)
    model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    episodes = 50
    wins = 0
    frames = []
    fuel_left = []
    for i in range(episodes):
        if i % 10 == 0:
            print(f"On episode {i}")
        frame_count = 0
        env.reset()
        state = env.get_state()
        while True:
            frame_count += 1
            # Greedy action: argmax over the network's outputs, converted
            # to a plain int for the environment.
            with torch.no_grad():
                action = model(
                    torch.tensor(state, dtype=torch.float32,
                                 device=device).unsqueeze(0)).argmax().item()
            state, reward, done = env.step(action)
            if done:
                if env.won:
                    wins += 1
                    frames.append(frame_count)
                    fuel_left.append(env.rocket.fuel)
                break
    env.close()

    if wins > 0:
        print(f"wins: {wins}")
        print(f"mean frames on wins {np.mean(frames)}")
        print(f"std frames on wins {np.std(frames, ddof=1)}")
        print(f"min frames on wins {np.min(frames)}")
        print(f"max frames on wins {np.max(frames)}")
        print(f"mean fuel on wins {np.mean(fuel_left)}")
        print(f"std fuel on wins {np.std(fuel_left, ddof=1)}")
        print(f"min fuel on wins {np.min(fuel_left)}")
        print(f"max fuel on wins {np.max(fuel_left)}")
    else:
        print("The model had 0 wins. Statistics can't be calculated")
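# A minimal entry point for running the evaluation above; 'dqn' and the
# checkpoint path are hypothetical placeholders, not values confirmed by
# the original project.
if __name__ == '__main__':
    eval(model_type='dqn', model_path='Model/dqn.pth')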
import pygame
import torch

# Demo loop: render the environment while the trained model flies the lander.
env = LunarLander()
env.reset()
exit_program = False
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if model_type == 'policy':
    model = Policy(env.observation_dim, env.action_dim)
elif model_type == 'dqn':
    model = Network(env.observation_dim, env.action_dim)
model.to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

state = env.get_state()
while not exit_program:
    env.render()
    # Greedy action from the trained model.
    with torch.no_grad():
        action = model(
            torch.tensor(state, dtype=torch.float32,
                         device=device).unsqueeze(0)).argmax().item()
    state, reward, done = env.step(action)
    if done:
        # Restart so the demo keeps running after each landing attempt.
        env.reset()
        state = env.get_state()

    # Process game events
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            exit_program = True
        if event.type == pygame.KEYDOWN:
            pass  # key handling truncated in the original listing

env.close()
import random

import gym
import torch
import torch.nn as nn

# Policy is defined elsewhere in the project.


class Learner:
    def __init__(self, learning_rate=0.01, FILE="Model/goodPolicy.pth"):
        self.FILE = FILE
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.policy = Policy().to(self.device)
        self.policy.load_state_dict(torch.load(self.FILE))
        self.policy.eval()
        self.criterion = nn.CrossEntropyLoss()
        self.learning_rate = learning_rate
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=self.learning_rate)

    def simulate(self, episode: int, policyPercent: float, show=False):
        """
        Simulate the CartPole process.

        :param episode: number of episodes to simulate
        :param policyPercent: fraction of steps on which the policy, rather
            than a random action, chooses the move
        :return: list of ([trajectory of actions], [trajectory of
            observations], totalReward) tuples
        """
        env = gym.make('CartPole-v0')
        result = []
        for i_episode in range(episode):
            actions = []
            observations = []
            totalReward = 500  # maximum; overwritten if the pole falls early
            observation = env.reset()
            for t in range(500):
                if show:
                    env.render()
                observationTensor = torch.tensor(
                    observation, dtype=torch.float32).to(self.device)
                observations.append(observation.tolist())
                if random.random() <= policyPercent:
                    # Mix the policy's greedy action with random exploration.
                    with torch.no_grad():
                        action = torch.max(self.policy(observationTensor),
                                           0)[1].item()  # 0 or 1
                else:
                    action = random.randint(0, 1)
                actions.append(action)
                observation, reward, done, info = env.step(action)
                if done:
                    totalReward = t + 1
                    break
            result.append((actions, observations, totalReward))
        env.close()
        return result

    def trainPolicy(self, episodes, policyPercent=0.8):
        """Train the policy on trajectories that beat the average reward."""
        # First play several times to estimate the average reward.
        trajectoriesForAvgRwd = self.simulate(20, 1)
        averageReward = sum(i[2] for i in trajectoriesForAvgRwd) / len(
            trajectoriesForAvgRwd)
        print(averageReward)
        trajectoriesForTrain = self.simulate(episodes, policyPercent)
        for trainTrajectory in trajectoriesForTrain:
            if trainTrajectory[2] > averageReward:
                # Forward pass: predict actions for the whole trajectory.
                predictAction = self.policy(
                    torch.tensor(trainTrajectory[1]).to(self.device))
                loss = self.criterion(
                    predictAction,
                    torch.tensor(trainTrajectory[0]).to(self.device))
                # Backward pass and parameter update.
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        torch.save(self.policy.state_dict(), self.FILE)
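# A short usage sketch for the Learner class, assuming a pretrained
# checkpoint already exists at the default path; the episode counts below
# are illustrative, not taken from the original project.
if __name__ == '__main__':
    learner = Learner(learning_rate=0.01)
    learner.trainPolicy(episodes=100, policyPercent=0.8)
    # Watch a few episodes with the updated policy.
    learner.simulate(3, 1, show=True)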