def main(unused_argv):
    begin = time.time()
    env = Go()
    agents = [agent.Random_Rollout_MCTS_Agent(n_playout=100), agent.RandomAgent(1)]
    ret = []
    for ep in range(NUM_EPISODES):
        time_step = env.reset()
        print('start ep: %d' % ep)
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            if player_id == 0:
                agent_output = agents[player_id].step(time_step, env)
            else:
                agent_output = agents[player_id].step(time_step)
            action_list = agent_output.action
            time_step = env.step(action_list)
        # Episode is over, step all agents with final info state.
        agents[0].step(time_step, env)
        agents[1].step(time_step)
        ret.append(time_step.rewards[0])
        print('end')
    print(np.mean(ret))
    # print(ret)
    print('Time elapsed:', time.time() - begin)
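# Hedged helper sketch (not part of the original script): converts the mean of
# time_step.rewards[0] printed above into a win rate, assuming the Go environment
# returns +1 for a player-0 win and -1 for a loss.
def win_rate_from_mean_reward(mean_reward):
    """Maps a mean reward in [-1, 1] to a win rate in [0, 1]."""
    return (mean_reward + 1.0) / 2.0

# Example: a printed mean reward of 0.6 would correspond to an 80% win rate.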
def init_agents(sess, info_state_size, num_actions, hidden_layers_sizes, **kwargs):
    agents = [
        DQN(sess, 0, info_state_size, num_actions, hidden_layers_sizes, **kwargs),
        agent.RandomAgent(1)
    ]
    sess.run(tf.global_variables_initializer())
    return agents
def init_agents(sess, info_state_size, num_actions, cnn_parameters, hidden_layers_sizes, **kwargs):
    if use_dqn():
        Algorithm = DQN(sess, 0, info_state_size**0.5, num_actions,
                        cnn_parameters, hidden_layers_sizes, **kwargs)
    else:
        Algorithm = PolicyGradient(sess, 0, info_state_size**0.5, num_actions,
                                   cnn_parameters, hidden_layers_sizes, **kwargs)
    agents = [Algorithm, agent.RandomAgent(1)]
    sess.run(tf.global_variables_initializer())
    return agents
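# Hedged usage sketch for the init_agents variants above. The environment and flag
# names (Go, FLAGS.output_channels, FLAGS.hidden_layers_sizes) mirror the main()
# functions below; the helper itself is illustrative and not the repo's actual entry point.
def example_init_agents_usage():
    env = Go()
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    num_cnn_layer = len(FLAGS.output_channels)
    cnn_parameters = [FLAGS.output_channels,
                      [3] * num_cnn_layer,                         # kernel shapes
                      [1] * num_cnn_layer,                         # strides
                      ["SAME"] * (num_cnn_layer - 1) + ["VALID"]]  # paddings
    with tf.Session() as sess:
        agents = init_agents(sess, env.state_size, env.action_size,
                             cnn_parameters, hidden_layers_sizes, batch_size=128)
        # agents[0] is the learning agent (DQN or PolicyGradient), agents[1] plays randomly.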
def main(unused_argv):
    begin = time.time()
    env = Go()
    info_state_size = env.state_size
    num_actions = env.action_size
    agentR = agent.RandomAgent(0)
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
        "epsilon_decay_duration": int(0.6 * FLAGS.num_train_episodes),
        "epsilon_start": 0.8,
        "epsilon_end": 0.001,
        "learning_rate": 1e-3,
        "learn_every": FLAGS.learn_every,
        "batch_size": 128,
        "max_global_gradient_norm": 10,
    }
    ret = [0]
    max_len = 2000
    with tf.Session() as sess:
        dqn = DQN(sess, 0, info_state_size, num_actions, hidden_layers_sizes, **kwargs)
        dqn.restore("saved_model/10000")
        for ep in range(10):
            print("start mcts train ep" + str(ep))
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == 0:  # use MCTS guided by the restored DQN
                    root = Node(None, env, time_step, None, 0, 0)
                    mcts = MCTS(root, dqn, random_value_net, random_rollout_net, env, time_limit=5)
                    action_list = mcts.start()
                else:
                    agent_output = agentR.step(time_step).action
                    action_list = agent_output  # take the random agent's action
                # print(action_list, player_id)
                time_step = env.step(action_list)
            print(time_step.rewards)
    print('Time elapsed:', time.time() - begin)
def main(unused_argv): begin = time.time() env = Go() info_state_size = env.state_size num_actions = env.action_size hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes] kwargs = { "pi_learning_rate": 1e-2, "critic_learning_rate": 1e-1, "batch_size": 128, "entropy_cost": 0.5, "max_global_gradient_norm": 20, } import agent.agent as agent ret = [0] max_len = 2000 with tf.Session() as sess: # agents = [DQN(sess, _idx, info_state_size, # num_actions, hidden_layers_sizes, **kwargs) for _idx in range(2)] agents = [PolicyGradient(sess, 0, info_state_size, num_actions, hidden_layers_sizes, **kwargs), agent.RandomAgent(1)] sess.run(tf.global_variables_initializer()) for ep in range(FLAGS.num_train_episodes): if (ep + 1) % FLAGS.eval_every == 0: losses = agents[0].loss logging.info("Episodes: {}: Losses: {}, Rewards: {}".format(ep+1, losses, np.mean(ret))) with open('log_pg_{}'.format(os.environ.get('BOARD_SIZE')), 'a+') as log_file: log_file.writelines("{}, {}\n".format(ep+1, np.mean(ret))) time_step = env.reset() # a go.Position object while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) action_list = agent_output.action time_step = env.step(action_list) for agent in agents: agent.step(time_step) if len(ret) < max_len: ret.append(time_step.rewards[0]) else: ret[ep % max_len] = time_step.rewards[0] ret = [] for ep in range(FLAGS.num_eval): time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] if player_id == 0: agent_output = agents[player_id].step(time_step, is_evaluation=True) else: agent_output = agents[player_id].step(time_step) action_list = agent_output.action time_step = env.step(action_list) # Episode is over, step all agents with final info state. # for agent in agents: agents[0].step(time_step, is_evaluation=True) agents[1].step(time_step) ret.append(time_step.rewards[0]) print(np.mean(ret)) print('Time elapsed:', time.time()-begin)
def main(unused_argv): begin = time.time() env = Go() info_state_size = env.state_size num_actions = env.action_size hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes] kwargs = { "replay_buffer_capacity": FLAGS.replay_buffer_capacity, "epsilon_decay_duration": int(0.6 * FLAGS.num_train_episodes), "epsilon_start": 0.8, "epsilon_end": 0.001, "learning_rate": 1e-3, "learn_every": FLAGS.learn_every, "batch_size": 128, "max_global_gradient_norm": 10, } import agent.agent as agent ret = [0] max_len = 2000 with tf.Session() as sess: # agents = [DQN(sess, _idx, info_state_size, # num_actions, hidden_layers_sizes, **kwargs) for _idx in range(2)] # for self play agents = [ agent.RandomAgent(1), DQN(sess, 1, info_state_size, num_actions, hidden_layers_sizes, **kwargs) ] sess.run(tf.global_variables_initializer()) # train the agent for ep in range(FLAGS.num_train_episodes): if (ep + 1) % FLAGS.save_every == 0: if not os.path.exists("saved_model/random_vs_dqn"): os.mkdir('saved_model/random_vs_dqn') agents[1].save(checkpoint_root='saved_model/random_vs_dqn', checkpoint_name='random_vs_dqn_{}'.format(ep + 1)) print('saved %d' % (ep + 1)) time_step = env.reset() # a go.Position object while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) action_list = agent_output.action # print(action_list) time_step = env.step(action_list) for agent in agents: agent.step(time_step) if len(ret) < max_len: ret.append(time_step.rewards[0]) else: ret[ep % max_len] = time_step.rewards[0] # evaluated the trained agent agents[1].restore("saved_model/random_vs_dqn/random_vs_dqn_10000") ret = [] for ep in range(FLAGS.num_eval): time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] if player_id == 0: agent_output = agents[player_id].step(time_step) else: agent_output = agents[player_id].step( time_step, is_evaluation=True, add_transition_record=False) action_list = agent_output.action time_step = env.step(action_list) # Episode is over, step all agents with final info state. # for agent in agents: agents[0].step(time_step) agents[1].step(time_step, is_evaluation=True, add_transition_record=False) ret.append(time_step.rewards[0]) print(np.mean(ret)) # print(ret) print('Time elapsed:', time.time() - begin)
def main(unused_argv):
    begin = time.time()
    env = Go()
    ret = [0]
    max_len = 2000  # rolling-window size, matching the other training scripts
    policy_function = [
        'saved_model/dqn_vs_random/10000',
        'saved_model/random_vs_dqn/random_vs_dqn_10000'
    ]
    value_function = 'saved_model/dqn_vs_random/10000'
    agents = [
        agent.Net_MCTS_Agent(value_function, policy_function, n_playout=50),
        agent.RandomAgent(1)
    ]
    for ep in range(NUM_TRAIN):
        if (ep + 1) % NUM_SAVE_EVERY == 0:
            if not os.path.exists("saved_model/net_mcts_vs_random"):
                os.mkdir('saved_model/net_mcts_vs_random')
            agents[0].mcts._policy_fn[0].save(
                checkpoint_root='saved_model/net_mcts_vs_random',
                checkpoint_name='_policy_fn_0_{}'.format(ep + 1))
            agents[0].mcts._policy_fn[1].save(
                checkpoint_root='saved_model/net_mcts_vs_random',
                checkpoint_name='_policy_fn_1_{}'.format(ep + 1))
            agents[0].mcts._value_fn.save(
                checkpoint_root='saved_model/net_mcts_vs_random',
                checkpoint_name='_value_fn_{}'.format(ep + 1))
        time_step = env.reset()  # a new env
        print('start ep: %d' % ep)
        while not time_step.last():  # play until the game is over
            cur_player = time_step.observations["current_player"]
            state = time_step.observations["info_state"][cur_player]
            player_id = time_step.observations["current_player"]
            if player_id == 0:
                agent_output = agents[player_id].step(time_step, env)
            else:
                agent_output = agents[player_id].step(time_step)
            action_list = agent_output.action
            time_step = env.step(action_list)
        print('end')
        agents[0].step(time_step, env)
        agents[1].step(time_step)
        if len(ret) < max_len:
            ret.append(time_step.rewards[0])
        else:
            ret[ep % max_len] = time_step.rewards[0]

    # evaluate the trained mcts agent
    ret = []
    for ep in range(NUM_EVAL):
        print('eval ep: %d' % ep)
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            if player_id == 0:
                agent_output = agents[player_id].step(time_step, env)
            else:
                agent_output = agents[player_id].step(time_step)
            action_list = agent_output.action
            time_step = env.step(action_list)
        # Episode is over, step all agents with final info state.
        agents[0].step(time_step, env)
        agents[1].step(time_step)
        ret.append(time_step.rewards[0])
    print(np.mean(ret))
    print(ret)
    print('Time elapsed:', time.time() - begin)
def main(unused_argv):
    begin = time.time()
    env = Go()
    info_state_size = env.state_size
    num_actions = env.action_size
    num_cnn_layer = len(FLAGS.output_channels)
    kernel_shapes = [3 for _ in range(num_cnn_layer)]
    strides = [1 for _ in range(num_cnn_layer)]
    paddings = ["SAME" for _ in range(num_cnn_layer - 1)]
    paddings.append("VALID")
    cnn_parameters = [FLAGS.output_channels, kernel_shapes, strides, paddings]
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "pi_learning_rate": 3e-4,
        "critic_learning_rate": 1e-3,
        "batch_size": 128,
        "entropy_cost": 0.5,
        "max_global_gradient_norm": 20,
    }
    import agent.agent as agent
    ret = [0]
    max_len = 2000
    with tf.Session() as sess:
        # agents = [DQN(sess, _idx, info_state_size,
        #               num_actions, hidden_layers_sizes, **kwargs) for _idx in range(2)]
        agents = [
            PolicyGradient(sess, 0, info_state_size**0.5, num_actions,
                           cnn_parameters, hidden_layers_sizes, **kwargs),
            agent.RandomAgent(1)
        ]
        sess.run(tf.global_variables_initializer())
        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                losses = agents[0].loss
                logging.info("Episodes: {}: Losses: {}, Rewards: {}".format(
                    ep + 1, losses, np.mean(ret)))
                with open('log_pg_{}'.format(os.environ.get('BOARD_SIZE')), 'a+') as log_file:
                    log_file.writelines("{}, {}\n".format(ep + 1, np.mean(ret)))
            if (ep + 1) % FLAGS.save_every == 0:
                agents[0].save(checkpoint_root='saved_model',
                               checkpoint_name='{}'.format(ep + 1))
            time_step = env.reset()  # a go.Position object
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = agent_output.action
                time_step = env.step(action_list)
            for agent in agents:
                agent.step(time_step)
            if len(ret) < max_len:
                ret.append(time_step.rewards[0])
            else:
                ret[ep % max_len] = time_step.rewards[0]
        ret = []
        agents[0].restore("saved_model/10000")
        for ep in range(FLAGS.num_eval):
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == 0:
                    agent_output = agents[player_id].step(time_step, is_evaluation=True)
                    print(agents[0].policy_fn(time_step, player_id))
                else:
                    agent_output = agents[player_id].step(time_step)
                action_list = agent_output.action
                time_step = env.step(action_list)
            # Episode is over, step all agents with final info state.
            # for agent in agents:
            agents[0].step(time_step, is_evaluation=True)
            agents[1].step(time_step)
            ret.append(time_step.rewards[0])
        print(np.mean(ret))
    print('Time elapsed:', time.time() - begin)
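# Hedged sketch: the spatial sizes implied by cnn_parameters above for a hypothetical
# 9x9 board (the board side is info_state_size ** 0.5 in the code). With 3x3 kernels
# and stride 1, "SAME" padding preserves the 9x9 map in every layer except the last,
# whose "VALID" padding shrinks it to 7x7.
def conv_out_size(in_size, kernel=3, stride=1, padding="SAME"):
    if padding == "SAME":
        return -(-in_size // stride)          # ceil division
    return (in_size - kernel) // stride + 1   # VALID

side = 9                                      # hypothetical board side
for pad in ["SAME", "SAME", "VALID"]:         # e.g. three CNN layers configured
    side = conv_out_size(side, padding=pad)
    print(side)                               # prints 9, 9, 7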