def train(agents, env, ret, max_len, begin):
    """Training loop that marks the rival agent's steps via ``is_rival``."""
    logging.info("Train on " + fmt_hyperparameters())
    global_ep = restore_model(agents)
    # global_ep = restore_model(agents, "./used_model/38000")
    try:
        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                prt_logs(global_ep + ep, agents, ret, begin)
            if (ep + 1) % FLAGS.save_every == 0:
                save_model(global_ep + ep, agents)

            time_step = env.reset()  # a go.Position object
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(
                    time_step, is_rival=(player_id == RIVAL_AGENT))
                action_list = agent_output.action
                time_step = env.step(action_list)
            # Episode is over: step every agent with the final time step.
            for agent in agents:
                agent.step(time_step)
            # Sliding window of the last `max_len` episode rewards.
            if len(ret) < max_len:
                ret.append(time_step.rewards[0])
            else:
                ret[ep % max_len] = time_step.rewards[0]
    except KeyboardInterrupt:
        # Save a checkpoint before exiting on Ctrl-C.
        save_model(global_ep + ep, agents)
def train(agents, env, ret, max_len, begin):
    """Simpler train() variant: every agent is stepped the same way."""
    global_ep = restore_model(agents)
    # global_ep = restore_model(agents, "./used_model/38000")
    try:
        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                prt_logs(global_ep + ep, agents, ret, begin)
            if (ep + 1) % FLAGS.save_every == 0:
                save_model(global_ep + ep, agents)

            time_step = env.reset()  # a go.Position object
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = agent_output.action
                time_step = env.step(action_list)
            # Episode is over: step every agent with the final time step.
            for agent in agents:
                agent.step(time_step)
            # Sliding window of the last `max_len` episode rewards.
            if len(ret) < max_len:
                ret.append(time_step.rewards[0])
            else:
                ret[ep % max_len] = time_step.rewards[0]
    except KeyboardInterrupt:
        save_model(global_ep + ep, agents)
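# --- Sketch of the helpers the train() variants above assume ----------------
# save_model, restore_model, and prt_logs are not defined in this file. The
# minimal versions below are an illustration only, assuming each learning
# agent exposes the save()/restore() interface used in the main() functions
# further down and that checkpoints are named after the episode count (as the
# commented-out "./used_model/38000" path suggests). Directory layout and
# names are hypothetical; the module-level imports (os, time, numpy, logging)
# the rest of the file already needs are reused here.

def save_model(ep, agents, root="saved_model"):
    os.makedirs(root, exist_ok=True)
    for idx, ag in enumerate(agents):
        if hasattr(ag, "save"):  # e.g. RandomAgent has nothing to checkpoint
            ag.save(checkpoint_root=root,
                    checkpoint_name="agent{}_{}".format(idx, ep))


def restore_model(agents, path=None):
    """Restores agents from `path` and returns the episode count it encodes."""
    if path is None:
        return 0
    for ag in agents:
        if hasattr(ag, "restore"):
            ag.restore(path)
    return int(os.path.basename(path))


def prt_logs(ep, agents, ret, begin):
    losses = [ag.loss for ag in agents if hasattr(ag, "loss")]
    logging.info("Episodes: {}: Losses: {}, Rewards: {}, Elapsed: {:.0f}s".format(
        ep + 1, losses, np.mean(ret), time.time() - begin))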
def main(unused_argv): begin = time.time() env = Go() info_state_size = env.state_size num_actions = env.action_size hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes] kwargs = { "pi_learning_rate": 1e-2, "critic_learning_rate": 1e-1, "batch_size": 128, "entropy_cost": 0.5, "max_global_gradient_norm": 20, } import agent.agent as agent ret = [0] max_len = 2000 with tf.Session() as sess: # agents = [DQN(sess, _idx, info_state_size, # num_actions, hidden_layers_sizes, **kwargs) for _idx in range(2)] agents = [PolicyGradient(sess, 0, info_state_size, num_actions, hidden_layers_sizes, **kwargs), agent.RandomAgent(1)] sess.run(tf.global_variables_initializer()) for ep in range(FLAGS.num_train_episodes): if (ep + 1) % FLAGS.eval_every == 0: losses = agents[0].loss logging.info("Episodes: {}: Losses: {}, Rewards: {}".format(ep+1, losses, np.mean(ret))) with open('log_pg_{}'.format(os.environ.get('BOARD_SIZE')), 'a+') as log_file: log_file.writelines("{}, {}\n".format(ep+1, np.mean(ret))) time_step = env.reset() # a go.Position object while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) action_list = agent_output.action time_step = env.step(action_list) for agent in agents: agent.step(time_step) if len(ret) < max_len: ret.append(time_step.rewards[0]) else: ret[ep % max_len] = time_step.rewards[0] ret = [] for ep in range(FLAGS.num_eval): time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] if player_id == 0: agent_output = agents[player_id].step(time_step, is_evaluation=True) else: agent_output = agents[player_id].step(time_step) action_list = agent_output.action time_step = env.step(action_list) # Episode is over, step all agents with final info state. # for agent in agents: agents[0].step(time_step, is_evaluation=True) agents[1].step(time_step) ret.append(time_step.rewards[0]) print(np.mean(ret)) print('Time elapsed:', time.time()-begin)
def main(unused_argv): begin = time.time() env = Go() info_state_size = env.state_size num_actions = env.action_size hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes] kwargs = { "replay_buffer_capacity": FLAGS.replay_buffer_capacity, "epsilon_decay_duration": int(0.6 * FLAGS.num_train_episodes), "epsilon_start": 0.8, "epsilon_end": 0.001, "learning_rate": 1e-3, "learn_every": FLAGS.learn_every, "batch_size": 128, "max_global_gradient_norm": 10, } import agent.agent as agent ret = [0] max_len = 2000 with tf.Session() as sess: # agents = [DQN(sess, _idx, info_state_size, # num_actions, hidden_layers_sizes, **kwargs) for _idx in range(2)] # for self play agents = [ agent.RandomAgent(1), DQN(sess, 1, info_state_size, num_actions, hidden_layers_sizes, **kwargs) ] sess.run(tf.global_variables_initializer()) # train the agent for ep in range(FLAGS.num_train_episodes): if (ep + 1) % FLAGS.save_every == 0: if not os.path.exists("saved_model/random_vs_dqn"): os.mkdir('saved_model/random_vs_dqn') agents[1].save(checkpoint_root='saved_model/random_vs_dqn', checkpoint_name='random_vs_dqn_{}'.format(ep + 1)) print('saved %d' % (ep + 1)) time_step = env.reset() # a go.Position object while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) action_list = agent_output.action # print(action_list) time_step = env.step(action_list) for agent in agents: agent.step(time_step) if len(ret) < max_len: ret.append(time_step.rewards[0]) else: ret[ep % max_len] = time_step.rewards[0] # evaluated the trained agent agents[1].restore("saved_model/random_vs_dqn/random_vs_dqn_10000") ret = [] for ep in range(FLAGS.num_eval): time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] if player_id == 0: agent_output = agents[player_id].step(time_step) else: agent_output = agents[player_id].step( time_step, is_evaluation=True, add_transition_record=False) action_list = agent_output.action time_step = env.step(action_list) # Episode is over, step all agents with final info state. # for agent in agents: agents[0].step(time_step) agents[1].step(time_step, is_evaluation=True, add_transition_record=False) ret.append(time_step.rewards[0]) print(np.mean(ret)) # print(ret) print('Time elapsed:', time.time() - begin)
def main(unused_argv):
    begin = time.time()
    env = Go()
    info_state_size = env.state_size
    num_actions = env.action_size

    # Convolutional front end: 3x3 kernels, stride 1, SAME padding everywhere
    # except the final VALID layer.
    num_cnn_layer = len(FLAGS.output_channels)
    kernel_shapes = [3 for _ in range(num_cnn_layer)]
    strides = [1 for _ in range(num_cnn_layer)]
    paddings = ["SAME" for _ in range(num_cnn_layer - 1)]
    paddings.append("VALID")
    cnn_parameters = [FLAGS.output_channels, kernel_shapes, strides, paddings]

    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "pi_learning_rate": 3e-4,
        "critic_learning_rate": 1e-3,
        "batch_size": 128,
        "entropy_cost": 0.5,
        "max_global_gradient_norm": 20,
    }

    import agent.agent as agent_lib
    ret = [0]
    max_len = 2000

    with tf.Session() as sess:
        # agents = [DQN(sess, _idx, info_state_size, num_actions,
        #               hidden_layers_sizes, **kwargs) for _idx in range(2)]
        agents = [
            # Cast the square-root side length to int for the CNN input.
            PolicyGradient(sess, 0, int(info_state_size ** 0.5), num_actions,
                           cnn_parameters, hidden_layers_sizes, **kwargs),
            agent_lib.RandomAgent(1),
        ]
        sess.run(tf.global_variables_initializer())

        # Train the CNN policy-gradient agent (player 0) against the random agent.
        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                losses = agents[0].loss
                logging.info("Episodes: {}: Losses: {}, Rewards: {}".format(
                    ep + 1, losses, np.mean(ret)))
                with open('log_pg_{}'.format(os.environ.get('BOARD_SIZE')),
                          'a+') as log_file:
                    log_file.write("{}, {}\n".format(ep + 1, np.mean(ret)))
            if (ep + 1) % FLAGS.save_every == 0:
                agents[0].save(checkpoint_root='saved_model',
                               checkpoint_name='{}'.format(ep + 1))

            time_step = env.reset()  # a go.Position object
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = agent_output.action
                time_step = env.step(action_list)
            # Episode is over: step all agents with the final info state.
            for agent in agents:
                agent.step(time_step)
            # Sliding window of the last `max_len` episode rewards.
            if len(ret) < max_len:
                ret.append(time_step.rewards[0])
            else:
                ret[ep % max_len] = time_step.rewards[0]

        # Evaluate the trained agent; the hard-coded checkpoint assumes
        # num_train_episodes was 10000.
        ret = []
        agents[0].restore("saved_model/10000")
        for ep in range(FLAGS.num_eval):
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == 0:
                    agent_output = agents[player_id].step(time_step,
                                                          is_evaluation=True)
                    print(agents[0].policy_fn(time_step, player_id))
                else:
                    agent_output = agents[player_id].step(time_step)
                action_list = agent_output.action
                time_step = env.step(action_list)
            # Episode is over: step both agents with the final info state.
            agents[0].step(time_step, is_evaluation=True)
            agents[1].step(time_step)
            ret.append(time_step.rewards[0])
        print(np.mean(ret))

    print('Time elapsed:', time.time() - begin)
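# --- Sketch of the RandomAgent interface -------------------------------------
# agent.agent.RandomAgent is imported but not defined here. The training and
# evaluation loops only require that step() returns an object with an `action`
# field and that terminal time steps are accepted. The sketch below is
# illustrative: the "legal_actions" observation key is an assumption (only
# "current_player" is actually read by the loops above), and the real class
# may differ.
import collections
import random

StepOutput = collections.namedtuple("StepOutput", ["action", "probs"])


class RandomAgent(object):

    def __init__(self, player_id):
        self.player_id = player_id

    def step(self, time_step, **unused_kwargs):
        if time_step.last():
            # Nothing to play on the terminal step.
            return None
        cur = time_step.observations["current_player"]
        legal = time_step.observations["legal_actions"][cur]
        return StepOutput(action=random.choice(legal), probs=None)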