def evaluate(agent: DQNAgent, n_epoch=10, render=False):
    """
    evaluate the agent
    :param agent: agent to be evaluated
    :param n_epoch: number of episodes to evaluate; more episodes give a more accurate estimate
    :param render: whether to render the environment during evaluation
    :return: mean reward over the evaluated episodes
    """
    env = gym.make('LunarLander-v2')
    score = []
    for e in range(n_epoch):
        done = False
        state = env.reset()
        epoch_reward = 0
        step = 1
        while not done and step < 1000:  # cap each episode at 1000 steps
            step += 1
            if render:
                env.render()
            action_dist = agent.get_q(preprocess_state(state))
            action = agent.select_action(action_dist)
            next_state, reward, done, info = env.step(action)
            epoch_reward += reward
            state = next_state
        print("episode {}/{} , reward: {}".format(e, n_epoch, epoch_reward))
        score.append(epoch_reward)
    score = np.mean(score)
    return score
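
# Minimal usage sketch (hypothetical: assumes `agent` is an already-trained DQNAgent
# and that preprocess_state is defined elsewhere in this module):
#
#     mean_score = evaluate(agent, n_epoch=20, render=False)
#     print("mean reward over 20 episodes:", mean_score)
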
def learn_on_mini_batch(e,
                        actor: Actor,
                        critic: DQNAgent,
                        critic_target: DQNAgent,
                        exp_replay: ExperienceReplay,
                        config=dense_config):
    batch_size = FLAGS.batch_size
    mini_batch = exp_replay.getMiniBatch(batch_size)
    state_batch, action_batch, reward_batch, dones_batch, next_state_batch = [], [], [], [], []
    for exp in mini_batch:
        state_batch.append(exp.state)
        action_batch.append(exp.action)
        reward_batch.append(exp.reward)
        dones_batch.append(exp.done)
        if dones_batch[-1]:
            # placeholder to keep the batch shape consistent; terminal next-states
            # are masked out by the (1 - done) factor below anyway
            next_state_batch.append(exp.state)
        else:
            next_state_batch.append(exp.next_state)
    Actor_Y_Batch = np.zeros((len(mini_batch), actor.output_size[-1]))
    Critic_Y_Batch = np.zeros((len(mini_batch), 1))
    critic_batch_output_for_state = critic_target.get_q(state_batch)
    critic_batch_output_for_next_state = critic_target.get_q(next_state_batch)
    for i, reward in enumerate(reward_batch):  # iterate over the batch
        # TD target: r + gamma * max_a' Q_target(s', a'), with the bootstrap term
        # masked out for terminal states
        target = reward + critic.gamma * np.max(
            critic_batch_output_for_next_state[i]) * (1 - dones_batch[i])
        Critic_Y_Batch[i] = target
        # Q(s,a) - V(s) = advantage, used as the actor's target for stability
        Actor_Y_Batch[i][action_batch[i]] = target - critic_batch_output_for_state[i]
    critic.learn(target_batch=Critic_Y_Batch,
                 learning_rate=config.learning_rate_schedule_critic(e),
                 input=state_batch)
    actor.learn(target_batch=Actor_Y_Batch,
                learning_rate=config.learning_rate_schedule_actor(e),
                input=state_batch)
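
# Quick numeric illustration of the targets formed above (made-up values, not from
# the codebase): the critic regresses toward y = r + gamma * max_a' Q_target(s', a') * (1 - done),
# and the actor's target for the chosen action is the advantage y - V(s).
demo_reward, demo_gamma, demo_done = 1.0, 0.99, 0
demo_q_next = np.array([0.5, 2.0, -1.0])   # stand-in for critic_target's output on the next state
demo_v_state = 1.2                         # stand-in for critic_target's output on the current state
demo_target = demo_reward + demo_gamma * np.max(demo_q_next) * (1 - demo_done)  # 2.98
demo_advantage = demo_target - demo_v_state                                     # 1.78
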
Example #3
def evaluate(agent: DQNAgent, n_epoch=10, render=False, verbose=False, record=False, video_path=None):
    """
    evaluate the agent
    :param agent: agent to be evaluated
    :param n_epoch: number of episodes to evaluate; more episodes give a more accurate estimate
    :param render: whether to render the environment during evaluation
    :param verbose: whether to print the reward of each episode
    :param record: whether to record a video of every evaluation episode
    :param video_path: directory where recorded videos are saved (defaults to "./vid")
    :return: mean reward over the evaluated episodes
    """
    env = gym.make('MsPacmanDeterministic-v4')
    if record:
        video_save_location = "./vid" if not video_path else video_path
        env = gym.wrappers.Monitor(env, video_save_location, video_callable=lambda episode_id: True, force=True)
    final_score = []
    for e in range(n_epoch):
        state = init_state()
        observation = env.reset()
        observation = process_observation(observation)
        done = False
        epoch_reward = 0.0
        while not done:
            state = append_frame(state, observation)
            if render:
                env.render()
            q_values = agent.get_q(state=np.expand_dims(state, axis=0))
            action = agent.select_action(qValues=q_values, explore=False)
            next_observation, reward, done, _ = env.step(action)
            next_observation = process_observation(next_observation)
            observation = next_observation
            epoch_reward += reward
        if verbose:
            print("Episode ", e, " / {} finished with reward {}".format(n_epoch, epoch_reward))
        final_score.append(epoch_reward)
    final_score = np.mean(final_score)
    env.close()
    return final_score
Example #4
def evaluate(agent: DQNAgent, n_epoch=10, render=False):
    """
    evaluate the agent
    :param agent: agent to be evaluated
    :param n_epoch: number of episodes to evaluate; more episodes give a more accurate estimate
    :param render: whether to render the environment during evaluation
    :return: mean reward over the evaluated episodes
    """
    env = gym.make("PongNoFrameskip-v4")
    env = wrap_deepmind(env, frame_stack=True)
    final_score = []
    for e in range(n_epoch):
        state = env.reset()
        state = np.asarray(state)
        done = False
        epoch_reward = 0.0
        while not done:
            if render:
                env.render()
            q_values = agent.get_q(state=np.expand_dims(state, axis=0))
            action = agent.select_action(qValues=q_values, explore=False)
            next_state, reward, done, _ = env.step(action + 1)
            # the agent's actions are in [0, 2]; the +1 offset maps them onto the
            # environment's paddle actions (indices 1 to 3)
            next_state = np.asarray(next_state)
            state = next_state
            epoch_reward += reward
        print("Episode ", e,
              " / {} finished with reward {}".format(n_epoch, epoch_reward))
        final_score.append(epoch_reward)
    final_score = np.mean(final_score)
    env.close()
    return final_score
Example #5
def policy_distilliation_batch_train(exp_replay,
                                     student: DQNAgent,
                                     learning_rate=1.0e-4,
                                     config=student_config,
                                     use_per=False,
                                     e=None):
    """
    train the student on a batch of experiences
    :param student: student to be trained on a batch of experiences
    :param exp_replay: the experience replay buffer to sample from
    :param learning_rate: learning rate for SGD
    :param config: config describing the batch size and the output size
    :param use_per: if True, use prioritized (supervised) experience replay
    :param e: current epoch, used by the PER beta schedule
    :return: value of the loss function
    """
    if not use_per:
        mini_batch = exp_replay.getMiniBatch(batch_size=config.batch_size)
        weights = np.ones((len(mini_batch), config.output_size))
        indexes = None
    else:
        mini_batch, weights, indexes = exp_replay.getMiniBatch(
            batch_size=config.batch_size,
            beta=config.beta_schedule(beta0=config.BETA0_PER,
                                      e=e,
                                      n_epoch=config.n_epochs))
    state = [exp.state for exp in mini_batch]
    target = [exp.label for exp in mini_batch]
    target = np.squeeze(target)
    loss, td_errors = student.learn(target_batch=target,
                                    input=state,
                                    learning_rate=learning_rate,
                                    weights=weights)

    if use_per:
        # normalise TD errors by the batch maximum
        td_errors = td_errors / np.max(td_errors)
        action_batch = [exp.action for exp in mini_batch]
        # epsilon keeps every transition with a non-zero sampling probability
        new_priority = np.abs(td_errors) + config.EPS_PER
        # keep only the priority of the action that was actually taken
        new_priority = [
            priority[action_batch[i]]
            for i, priority in enumerate(new_priority)
        ]
        exp_replay.update_priorities(indexes=indexes, priorities=new_priority)

    return loss
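
# Small illustration of the priority update above (made-up values; EPS_PER stands in
# as 1e-6, none of these numbers come from the codebase):
demo_td_errors = np.array([[0.5, 0.1],
                           [2.0, 0.3]])                    # per-action TD errors for two transitions
demo_td_errors = demo_td_errors / np.max(demo_td_errors)  # normalise by the batch maximum
demo_priority = np.abs(demo_td_errors) + 1e-6             # epsilon keeps zero-error transitions sampleable
demo_actions = [1, 0]                                      # actions actually taken
demo_priority = [p[a] for p, a in zip(demo_priority, demo_actions)]  # ~[0.05, 1.0]
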
Example #6
plt.imshow(get_screen().cpu().squeeze(0).permute(1, 2, 0).numpy(),
           interpolation='none')

BATCH_SIZE = 32
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape

n_actions = env.action_space.n

policyNet = DQNAgent(screen_height, screen_width, n_actions).to(device)
targetNet = DQNAgent(screen_height, screen_width, n_actions).to(device)

targetNet.load_state_dict(policyNet.state_dict())  # initialise targetNet with policyNet's weights
targetNet.eval()
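# targetNet is only used for inference; presumably its weights are refreshed from
# policyNet every TARGET_UPDATE episodes (the rest of the training loop is not shown here).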

optimizer = optim.RMSprop(policyNet.parameters())
memory = ReplayMemory(10000)

steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
Example #7
from model import DQNAgent
import gym
import numpy as np

# Number of games for the agent to train on
episodes = 1000

# initialize gym environment and the agent
env = gym.make('CartPole-v0')

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size)
agent.build_model()

# Iterate the game
for e in range(episodes):
    # reset state in the beginning of each game
    state = env.reset()
    state = np.reshape(state, [1, 4])
    # time_t represents each frame of the game
    # the goal is to keep the pole upright as long as possible, up to a score of 500;
    # the more frames survived, the higher the score
    for time_t in range(500):
        # comment out the next line to disable rendering
        env.render()
        # Decide action
        action = agent.act(state)
        # Advance the game to the next frame based on the action.
        # Reward is 1 for every frame the pole survived
Example #8
start = time.time()
writer = SummaryWriter()

# Hyper-parameters
BATCH_SIZE = 512
MEMORY_SIZE = 5000
LR = 0.001
test_interval = 1000
test_episodes = 100
TIMESTEPS = 10000
EPSILON_ENDT = 3000

env = gym.make('CartPole-v0')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = DQNAgent(d_actions=env.action_space.n, device=device, batch_size=BATCH_SIZE, memory_size=MEMORY_SIZE, lr=LR,
                 epsilon_endt=EPSILON_ENDT)
agent.policy_net = MLPPolicy(d_state=env.observation_space.shape[0], d_hidden=20,
                             d_action=env.action_space.n).to(device)

init = time.time()
print("Init time {}".format(init-start))

num_episode = 0
episode_t = 0

state = env.reset()
state = torch.from_numpy(state).unsqueeze_(0).to(device=device, dtype=torch.float)
while agent.time_step < TIMESTEPS:
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action.item())
    episode_t += 1
Example #9
from model import DQNAgent
import time
from tensorboardX import SummaryWriter

EPSILON_START = 1.0
EPSILON_FINAL = 0.1
EPSILON_DECAY = 250000
EPISODES = 5000

epsilon_by_frame = lambda step_idx: EPSILON_FINAL + (
    EPSILON_START - EPSILON_FINAL) * math.exp(-1. * step_idx / EPSILON_DECAY)
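# The schedule above decays epsilon exponentially from EPSILON_START toward EPSILON_FINAL,
# e.g. epsilon_by_frame(0) == 1.0 and epsilon_by_frame(250000) ~= 0.43 (illustrative values).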
writer = SummaryWriter(comment='DQN')

num_frames = 0
env = gym.make('Riverraid-v0')
agent = DQNAgent(env)
is_render = False
for i_episode in range(EPISODES):
    score = 0
    observation = env.reset()
    observation = WarpFrame(observation)
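    # initialise the frame stack with the first frame repeated 4 times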
    observation = np.stack([observation] * 4, axis=0)
    done = False
    #is_render = i_episode % 10 == 0
    t = time.time()
    loss = []
    while not done:
        if is_render:
            env.render()
        #print(observation.shape)
        """
Example #10
# In[12]:

output_dir = 'model/cartpole'

# In[13]:

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# In[15]:

from model import DQNAgent

# In[16]:

agent = DQNAgent(state_size, action_size)

# In[17]:

agent.model.summary()

# In[20]:

done = False

for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])

    for time in range(5000):
        # env.render()
Example #11
    if options.nogui:
        # if True:
        sumoBinary = checkBinary('sumo')
    else:
        sumoBinary = checkBinary('sumo-gui')
    sumoInt.routeFileGenerator()

    # Main logic
    # parameters
    episodes = 100
    batch_size = 100

    green_duration = 10
    yellow_duration = 6
    agentGenerator = DQNAgent()
    try:
        agentGenerator.load('Models/reinf_traf_control.h5')
    except Exception:
        print('No models found')

    for e in range(episodes):
        # DNN Agent
        # Initialize DNN with random weights
        # Initialize target network with same weights as DNN Network
        #log = open('log.txt', 'a')
        step = 0
        haltTime = 0
        reward1 = 0
        reward2 = 0
        netReward = 0.9 * (reward1 - reward2)