def learn_model_fn(model, timesteps, save, period):
    return learn(model,
                 model_name=model_name,
                 model_path=model_path,
                 timesteps=timesteps,
                 save=save,
                 period=period)


# Second definition shadows the one above: it additionally passes a save
# threshold of 19.2 when self-play is enabled (presumably a minimum score
# before checkpoints are kept by learn()).
def learn_model_fn(model, timesteps, save, period):
    save_thresh = 19.2 if args.selfplay else None
    return learn(model,
                 model_name=model_name,
                 model_path=model_path,
                 timesteps=timesteps,
                 save=save,
                 period=period,
                 save_thresh=save_thresh)
Example #3
            optim_param=[alpha],
            loss_function=nn.MSELoss(),
            tau=1,
            device=device)

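# Assemble the DQN agent: a replay buffer, an epsilon-greedy policy whose
# epsilon decays from eps_start to eps_min during training, and a purely
# greedy policy for evaluation. learn_every/update_every=4 presumably control
# how often the network is trained and how often the target network is synced.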
buffer = QBuffer(memory_size, batch_size, device)
learning_policy = EpsDecay(eps_start, eps_min, eps_decay, env.action_space.n)
playing_policy = Greedy()
agent = Agent(model=model,
              buffer=buffer,
              learn_every=4,
              update_every=4,
              policy_learning=learning_policy,
              policy_playing=playing_policy)

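# Run the training loop; util.learn returns the score obtained in each episode.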
scores = util.learn(env, goal_size, average_goal, agent, max_step, nb_epi_max,
                    gamma, learning_policy)

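# Number of transitions collected in the replay buffer during training,
# followed by a plot of the per-episode scores.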
print(len(buffer))
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

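# Watch the trained agent for 10 rendered episodes (agent.act presumably
# follows the greedy playing policy selected above).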
for i in range(10):
    state = env.reset()
    score = 0
    env.render()
    for j in range(max_step):
        action = agent.act(state)