def test_learn_play(d=6, num_layers=2, num_units=100,
                    eps=0.5, iters=10000, draw=False,
                    tabular=True, batch=False, batch_epochs=10,
                    num_episodes=10, episode_length=100):
    """Learn a Q function on a ``No_Exit(d)`` game, report scores, then play it.

    A Q representation is built (``TabularQ`` or ``NNQ``), trained with
    ``Q_learn`` or ``Q_learn_batch``, and finally used greedily for
    ``num_episodes`` simulated episodes.

    Args:
        d: Passed to ``No_Exit``. Also selects the playback length: the full
            ``episode_length`` when ``d > 5``, otherwise half of it.
        num_layers: Forwarded to ``NNQ`` (only used when ``tabular`` is false).
        num_units: Forwarded to ``NNQ`` (only used when ``tabular`` is false).
        eps: Not referenced anywhere in this function body.
        iters: Forwarded to the learner as ``iters``. Scores are recorded
            every ``int(iters / 10.0)`` iterations (every iteration when
            ``iters <= 10``).
        draw: Forwarded to ``sim_episode`` during playback.
        tabular: Use ``TabularQ`` when true, ``NNQ`` otherwise.
        batch: Use ``Q_learn_batch`` when true, ``Q_learn`` otherwise. Also
            makes ``NNQ`` use ``batch_epochs`` epochs instead of 1.
        batch_epochs: ``epochs`` value for ``NNQ`` when ``batch`` is true.
        num_episodes: Episode count passed to ``evaluate`` and the number of
            playback episodes run at the end.
        episode_length: Episode length passed to ``evaluate`` and used to
            derive the playback length.

    Returns:
        The third value returned by the last ``sim_episode`` call, or
        ``None`` when no playback episode was run (``num_episodes <= 0``).
    """
    iters_per_value = 1 if iters <= 10 else int(iters / 10.0)
    scores = []

    def interact(q, iter=0):
        # Handed to the learner as ``interactive_fn``; presumably called with
        # the current Q function and iteration index -- confirm in the learner.
        if iter % iters_per_value == 0:
            scores.append((iter, evaluate(game, num_episodes, episode_length,
                                          lambda s: greedy(q, s))[0]))
            print('score', scores[-1], flush=True)

    game = No_Exit(d)
    if tabular:
        q = TabularQ(game.states, game.actions)
    else:
        q = NNQ(game.states, game.actions, game.state2vec, num_layers, num_units,
                epochs=batch_epochs if batch else 1)

    if batch:
        # NOTE(review): these training settings are fixed constants and do not
        # follow this function's ``episode_length`` / ``num_episodes``
        # arguments -- confirm that this is intended.
        qf = Q_learn_batch(game, q, iters=iters, episode_length=100, n_episodes=10,
                           interactive_fn=interact)
    else:
        qf = Q_learn(game, q, iters=iters, interactive_fn=interact)

    if scores:
        print('String to upload (incude quotes): "%s"'%toHex(pickle.dumps([tabular, batch, scores], 0).decode()))
        # Plot learning curve
        plot_points(np.array([s[0] for s in scores]),
                    np.array([s[1] for s in scores]))

    # Neither the playback length nor the policy changes between episodes.
    play_length = episode_length if d > 5 else episode_length/2
    policy = lambda s: greedy(qf, s)

    # Bind the result up front so that ``num_episodes <= 0`` returns None
    # instead of raising UnboundLocalError at the ``return``.
    animation = None
    for _episode in range(num_episodes):
        reward, _, animation = sim_episode(game, play_length, policy, draw=draw)
        print('Reward', reward)
    return animation
def test_solve_play(d=6, draw=False, num_episodes=10, episode_length=100):
    """Solve a ``TempSim`` game with ``value_iteration`` and play it greedily.

    NOTE(review): a second function named ``test_solve_play`` appears later in
    this source; if both live in the same module, the later definition
    replaces this one.

    Args:
        d: Only selects the playback length here: the full ``episode_length``
            when ``d > 5``, otherwise half of it. It is not passed to
            ``TempSim``.
        draw: Forwarded to ``sim_episode``.
        num_episodes: Number of episodes to simulate.
        episode_length: Basis for the per-episode length (see ``d``).

    Returns:
        None. The reward of every episode is printed.
    """
    game = TempSim()
    qf = value_iteration(game, TabularQ(game.states, game.actions))

    # Neither the playback length nor the policy changes between episodes.
    play_length = episode_length if d > 5 else episode_length / 2
    policy = lambda s: greedy(qf, s)

    for _episode in range(num_episodes):
        # Other callers in this source unpack three values from sim_episode.
        # Star-unpacking keeps only the reward, so this works whether two or
        # three values come back.
        reward, *_ = sim_episode(game, play_length, policy, draw=draw)
        print('Reward', reward)
def test_solve_play(d=5, draw=False, num_episodes=10, episode_length=100):
    """Solve a ``No_Exit(d)`` game with ``value_iteration`` and play it greedily.

    Args:
        d: Passed to ``No_Exit``. Also selects the playback length: the full
            ``episode_length`` when ``d > 3``, otherwise half of it.
        draw: Forwarded to ``sim_episode``.
        num_episodes: Number of episodes to simulate.
        episode_length: Basis for the per-episode length (see ``d``).

    Returns:
        The third value returned by the last ``sim_episode`` call, or
        ``None`` when no episode was run (``num_episodes <= 0``).
    """
    game = No_Exit(d)
    qf = value_iteration(game, TabularQ(game.states, game.actions))

    # Neither the playback length nor the policy changes between episodes.
    play_length = episode_length if d > 3 else episode_length/2
    policy = lambda s: greedy(qf, s)

    # Bind the result up front so that ``num_episodes <= 0`` returns None
    # instead of raising UnboundLocalError at the ``return``.
    animation = None
    for _episode in range(num_episodes):
        reward, _, animation = sim_episode(game, play_length, policy, draw=draw)
        print('Reward', reward)
    return animation
def emulate(game, q, episode_length, num_episodes=1):
    """Simulate episodes of ``game`` using the greedy policy for ``q``.

    Each episode is run through ``sim_episode`` with ``print`` as its
    ``interactive_fn``, and the episode reward is printed afterwards.

    Args:
        game: Game object handed to ``sim_episode``.
        q: Q function handed to ``greedy`` to choose an action for a state.
        episode_length: Episode length handed to ``sim_episode``.
        num_episodes: Number of episodes to simulate.

    Returns:
        None.
    """
    # The policy does not depend on the episode, so build it once.
    policy = lambda s: greedy(q, s)

    for _episode in range(num_episodes):
        # Other callers in this source unpack three values from sim_episode.
        # Star-unpacking keeps only the reward, so this works whether two or
        # three values come back.
        reward, *_ = sim_episode(game, episode_length, policy,
                                 interactive_fn=print)
        print('Reward', reward)
def interact(q, iter=0):
    """Record and print an evaluation score on every ``iters_per_value``-th call.

    Relies on names from the enclosing/module scope: ``iters_per_value``,
    ``scores``, ``game``, ``num_episodes``, ``episode_length``, ``evaluate``
    and ``greedy``.

    Args:
        q: Q function whose greedy policy is evaluated.
        iter: Iteration index; scoring happens only when it is a multiple of
            ``iters_per_value``.
    """
    # Skip iterations that are not on a scoring boundary.
    if iter % iters_per_value != 0:
        return

    def greedy_policy(state):
        return greedy(q, state)

    outcome = evaluate(game, num_episodes, episode_length, greedy_policy)
    scores.append((iter, outcome[0]))
    print('score', scores[-1])