def monte_carlo_demo():
    """Compare Monte Carlo control, policy iteration, and SARSA on the snake env.

    Each algorithm is run after re-seeding NumPy with the same seed (101) so
    the evaluation rollouts are comparable.  For each algorithm the evaluated
    return and the learned policy table are printed.
    """
    np.random.seed(101)
    env = SnakeEnv(10, [3, 6])

    # Monte Carlo control (epsilon = 0.5) on a model-free agent.
    agent = ModelFreeAgent(env)
    mc = MonteCarlo(0.5)
    with timer('Timer Monte Carlo Iter'):
        mc.monte_carlo_opt(agent, env)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)

    # Model-based policy iteration as a baseline.
    np.random.seed(101)
    agent2 = TableAgent(env)
    pi_algo = PolicyIteration()
    with timer('Timer PolicyIter'):
        pi_algo.policy_iteration(agent2)
    print('return_pi={}'.format(eval_game(env, agent2)))
    print(agent2.pi)

    # SARSA (epsilon = 0.5).  The original reused the name `mc` and the
    # 'Timer Monte Carlo Iter' label here, which mislabeled the timing output.
    np.random.seed(101)
    agent3 = ModelFreeAgent(env)
    sarsa_algo = SARSA(0.5)
    with timer('Timer SARSA Iter'):  # fixed label: this section times SARSA
        sarsa_algo.sarsa(agent3, env)
    print('return_pi={}'.format(eval_game(env, agent3)))
    print(agent3.pi)
def monte_carlo_demo():
    """Run policy iteration, value iteration, and Monte Carlo control in turn.

    Before each algorithm, NumPy is re-seeded with 0 and a fresh 10-ladder
    snake environment is built, so the three printed returns are comparable.
    """
    # --- Policy iteration ---------------------------------------------------
    np.random.seed(0)
    board = SnakeEnv(10, [3, 6])
    pi_agent = TableAgent(board)
    pi_solver = PolicyIteration()
    with timer('Timer PolicyIter'):
        pi_solver.policy_iteration(pi_agent)
    print('PolicyIteration:return_pi={}'.format(eval_game(board, pi_agent)))
    print(pi_agent.pi)

    # --- Value iteration ----------------------------------------------------
    np.random.seed(0)
    board = SnakeEnv(10, [3, 6])
    vi_agent = TableAgent(board)
    vi_solver = ValueIteration()
    vi_solver.value_iteration(vi_agent)
    print('ValueIteration:return_pi={}'.format(eval_game(board, vi_agent)))
    print(vi_agent.pi)

    # --- Monte Carlo control ------------------------------------------------
    np.random.seed(0)
    board = SnakeEnv(10, [3, 6])
    mc_agent = ModelFreeAgent(board)
    mc_solver = MonteCarlo()
    with timer('Timer Monte Carlo Iter'):
        mc_solver.monte_carlo_opt(mc_agent, board)
    print('MonteCarlo:return_pi={}'.format(eval_game(board, mc_agent)))
    print(mc_agent.pi)
def test_easy(n_rounds=10000):
    """Average the return of three fixed policies on a ladder-free snake env.

    Args:
        n_rounds: number of evaluation games to average over.  Defaults to
            10000, preserving the original hard-coded behavior.
    """
    np.random.seed(0)
    sum_opt = 0
    sum_0 = 0
    sum_1 = 0
    env = SnakeEnv(0, [3, 6])  # 0 ladders: plain board
    for _ in range(n_rounds):
        sum_opt += eval_game(env, policy_ref)
        sum_0 += eval_game(env, policy_0)
        sum_1 += eval_game(env, policy_1)
    # float() keeps true division regardless of interpreter, matching the
    # original `/ 10000.0` expressions.
    print('opt avg={}'.format(sum_opt / float(n_rounds)))
    print('0 avg={}'.format(sum_0 / float(n_rounds)))
    print('1 avg={}'.format(sum_1 / float(n_rounds)))
def policy_iteration_demo2():
    """Evaluate fixed policies, a hand-built mix, then the policy-iteration result."""
    board = SnakeEnv(10, [3, 6])
    player = TableAgent(board)

    # Policy 0 for every state (always the first action).
    player.pi[:] = 0
    print('return3={}'.format(eval_game(board, player)))

    # Policy 1 for every state (always the second action).
    player.pi[:] = 1
    print('return6={}'.format(eval_game(board, player)))

    # Hand-built ensemble: action 1 everywhere except the last three states.
    player.pi[97:100] = 0
    print('return_ensemble={}'.format(eval_game(board, player)))

    # Solve with policy iteration and evaluate the optimized policy.
    solver = PolicyIteration()
    solver.policy_iteration(player)
    print('return_pi={}'.format(eval_game(board, player)))
    print(player.pi)
def first_easy(n_games=10000):
    """Average the return of three fixed policies over many games (no ladders).

    Args:
        n_games: number of games to play per policy.  Defaults to 10000,
            matching the original hard-coded `countNum`.
    """
    total_ref = 0
    total_0 = 0
    total_1 = 0
    env = SnakeEnv(0, [3, 6])  # 0 ladders: plain board
    for _ in range(n_games):
        total_ref += eval_game(env, policy_ref)
        total_0 += eval_game(env, policy_0)
        total_1 += eval_game(env, policy_1)
    print('policy_ref avg={}'.format(total_ref / n_games))
    print('policy_0 avg={}'.format(total_0 / n_games))
    print('policy_1 avg={}'.format(total_1 / n_games))
def monte_carlo_demo():
    """Run Monte Carlo control, then policy iteration, on the same 10-ladder env."""
    board = SnakeEnv(10, [3, 6])

    # Monte Carlo control on a model-free agent.
    mc_agent = ModelFreeAgent(board)
    mc_solver = MonteCarlo()
    with timer('Timer Monte Carlo Iter'):
        mc_solver.monte_carlo_opt(mc_agent, board)
    print('return_pi={}'.format(eval_game(board, mc_agent)))
    print(mc_agent.pi)

    # Policy iteration on a tabular (model-based) agent.
    dp_agent = TableAgent(board)
    dp_solver = PolicyIteration()
    with timer('Timer PolicyIter'):
        dp_solver.policy_iteration(dp_agent)
    print('return_pi={}'.format(eval_game(board, dp_agent)))
    print(dp_agent.pi)
def policy_iteration_demo1():
    """Solve a ladder-free snake env with policy iteration and print the result."""
    board = SnakeEnv(0, [3, 6])
    player = TableAgent(board)
    solver = PolicyIteration()
    solver.policy_iteration(player)
    print('return_pi={}'.format(eval_game(board, player)))
    print(player.pi)
def policy_iteration_demo1():
    """Run policy iteration on a ladder-free snake env and print the result.

    Converted the Python 2 `print` statements to `print()` calls for
    consistency with the rest of the file.
    """
    env = SnakeEnv(0, [3, 6])  # 0 means ladders are not considered
    agent = TableAgent(env)  # tabular agent
    pi_algo = PolicyIteration()  # policy-iteration solver
    pi_algo.policy_iteration(agent)  # compute the updated value function / policy
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
def value_iteration_demo():
    """Solve the 10-ladder snake env with value iteration and print the result."""
    np.random.seed(0)
    board = SnakeEnv(10, [3, 6])
    player = TableAgent(board)
    solver = ValueIteration()
    solver.value_iteration(player)
    print('return_pi={}'.format(eval_game(board, player)))
    print(player.pi)
def policy_iteration_demo():
    """Solve the 10-ladder snake env with timed policy iteration and print the result."""
    np.random.seed(0)
    board = SnakeEnv(10, [3, 6])
    player = TableAgent(board)
    solver = PolicyIterationWithTimer()
    solver.policy_iteration(player)
    print('return_pi={}'.format(eval_game(board, player)))
    print(player.pi)
def generalized_iteration_demo():
    """Solve the 10-ladder snake env with generalized policy iteration, timing the run."""
    np.random.seed(0)
    board = SnakeEnv(10, [3, 6])
    player = TableAgent(board)
    solver = GeneralizedPolicyIteration()
    with timer('Timer GeneralizedIter'):
        solver.generalized_policy_iteration(player)
    print('return_pi={}'.format(eval_game(board, player)))
def monte_carlo_demo2():
    """Run Monte Carlo control (epsilon = 0.5) on the 10-ladder snake env, timed."""
    board = SnakeEnv(10, [3, 6])
    player = ModelFreeAgent(board)
    solver = MonteCarlo(0.5)
    with timer('Timer Monte Carlo Iter'):
        solver.monte_carlo_opt(player, board)
    print('return_pi={}'.format(eval_game(board, player)))
    print(player.pi)
def value_iteration_demo():
    """Solve the 10-ladder snake env with timed value iteration and print the return.

    Converted the Python 2 `print` statement to a `print()` call for
    consistency with the rest of the file.
    """
    np.random.seed(0)
    env = SnakeEnv(10, [3, 6])
    agent = TableAgent(env)
    pi_algo = ValueIteration()
    with timer('Timer ValueIter'):
        pi_algo.value_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))