def parrallelized(self, state: KalahEnvironment) -> Move:
    # Root parallelisation: requires `from multiprocessing import Process, Queue`.
    valid_moves = state.get_valid_moves()
    if len(valid_moves) == 1:
        return valid_moves[0]
    root = Node(state=KalahEnvironment.clone(state))
    process_list = []
    ensemble_count = 24
    output_queue = Queue(ensemble_count)
    # Launch independent searches from the same root state; daemon processes die with the parent.
    for _ in range(ensemble_count):
        worker_proc = Process(target=self.find_next_move, args=(root.state, output_queue))
        worker_proc.daemon = True
        process_list.append(worker_proc)
        worker_proc.start()
    for worker in process_list:
        worker.join()
    options = []
    rewards = []
    for _ in range(ensemble_count):
        output = output_queue.get()
        options.append(output[0])
        rewards.append(output[1])
    # Return the move proposed by the search that reported the highest reward.
    return options[rewards.index(max(rewards))]
def find_next_move(self, state: KalahEnvironment, output_queue):
    valid_moves = state.get_valid_moves()
    if len(valid_moves) == 1:
        # Forced move: report it on the queue so the caller is never left waiting.
        output_queue.put([valid_moves[0], 0])
        return
    root = Node(state=KalahEnvironment.clone(state))
    start_time = datetime.datetime.utcnow()
    # Run selection / simulation / backpropagation until the time budget is exhausted.
    while datetime.datetime.utcnow() - start_time < self.run_duration:
        node = self.policies.select(root)
        ending_state = self.policies.simulate(node)
        self.policies.backpropagate(node, ending_state)
    chosen = utilities.select_child(root)
    output_queue.put([chosen.move, chosen.reward])
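# The in-tree selection step and the final utilities.select_child(root) call both compare
# child statistics. The sketch below is a generic UCB1-style helper, assuming Node exposes
# `visits` and `reward` as in the Node class further down; the project's actual
# utilities.select_child may simply pick greedily (exploration constant 0).
import math

def select_child_sketch(node, exploration=0.0):
    # Score each child by mean reward plus an optional UCB1 exploration bonus.
    def score(child):
        if child.visits == 0:
            return float('inf')
        exploit = child.reward / child.visits
        explore = exploration * math.sqrt(math.log(max(node.visits, 1)) / child.visits)
        return exploit + explore

    # exploration=0.0 reduces to a greedy choice over average reward.
    return max(node.children, key=score)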
def evaluate(self, env: KalahEnvironment) -> (float, float):
    # Flip the board when it is north's turn so the network always sees the
    # position from the side to play.
    flip_board = env.side_to_play == Side.NORTH
    state = env.board.get_board_image(flipped=flip_board)
    mask = env.get_mask()
    dist, _, value = self.network.evaluate_move(state=state, mask=mask)
    return dist, float(value)
def expand(self, parent: Node) -> Node:
    # Pick an untried move at random, apply it to a cloned state and attach
    # the resulting node as a new child of the parent.
    child_expansion_move = choice(tuple(parent.unexplored_moves))
    child_state = KalahEnvironment.clone(parent.state)
    child_state.do_move(child_expansion_move)
    child_node = Node(state=child_state, move=child_expansion_move, parent=parent)
    parent.put_child(child_node)
    return child_node
def __init__(self, state: KalahEnvironment, move: Move = None, parent=None):
    self.parent = parent
    self.state = state
    self.children = []
    self.unexplored_moves = set(state.get_valid_moves())
    self.visits = 0
    self.reward = 0
    self.move = move
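# Node.put_child and Node.update are called by expand() and backpropagate() but are not
# shown here. A minimal sketch of what they might look like, assuming put_child also marks
# the move as explored and update accumulates visit/reward statistics; the real
# implementations may differ.
def put_child(self, child):
    # Register the expanded child and remove its move from the untried set.
    self.children.append(child)
    self.unexplored_moves.discard(child.move)

def update(self, reward: float):
    # Accumulate the statistics used by the selection policy.
    self.visits += 1
    self.reward += reward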
def backpropagate(self, leaf: Node, final_state: KalahEnvironment):
    # Walk from the expanded leaf back up to the root, updating every node on the path.
    node = leaf
    while node is not None:
        # Reward each node from the perspective of the player who moved into it;
        # the root has no parent, so fall back to its own side to play.
        side = node.parent.state.side_to_play if node.parent is not None else node.state.side_to_play
        node.update(final_state.get_reward_for_winning(side))
        node = node.parent
def __init__(self, env, task):
    """
    An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
    Below, we will have a modest amount of complexity due to the way TensorFlow handles data
    parallelism. But overall, we'll define the model, specify its inputs, and describe how the
    policy gradient step should be computed.
    """
    self.env = env
    self.task = task

    # Performance statistics
    self.episodes_reward = []
    self.episodes_length = []
    self.episodes_mean_value = []
    self.wins = 0
    self.games = 0

    worker_device = "/job:worker/task:{}/cpu:0".format(task)
    with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)):
        with tf.variable_scope("global"):
            # self.network = LSTMPolicy(env.observation_space.shape, env.action_space.n)
            # The OpenAI LSTM policy above is replaced by our own actor-critic network.
            self.network = ACNetwork(state_shape=[2, 8, 1], num_act=7)
            self.global_step = tf.get_variable(
                "global_step", [], tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False)

    with tf.device(worker_device):
        with tf.variable_scope("local"):
            # self.local_network = pi = LSTMPolicy(env.observation_space.shape, env.action_space.n)
            # The OpenAI LSTM policy above is replaced by our own actor-critic network.
            self.local_network = pi = self.network
            pi.global_step = self.global_step

        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        self.action_one_hot = tf.one_hot(self.action, 7, dtype=tf.float32)
        self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
        self.advantage = tf.placeholder(shape=[None], dtype=tf.float32)

        log_prob_tf = tf.nn.log_softmax(pi.logits)
        prob_tf = tf.nn.softmax(pi.logits)
        act_log_prob = tf.reduce_sum(log_prob_tf * self.action_one_hot, [1])

        # Loss of the value function
        self.value_loss = 0.5 * tf.reduce_sum(
            tf.square(self.target_v - tf.reshape(pi.value, [-1])))
        self.entropy = -tf.reduce_sum(prob_tf * log_prob_tf)
        self.policy_loss = -tf.reduce_sum(act_log_prob * self.advantage)
        self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

        # Get gradients from the local network using the local losses and clip them
        # to avoid exploding gradients.
        self.gradients = tf.gradients(self.loss, pi.vars)
        grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, 100.0)

        # Operation for downloading the weights from the parameter server (ps)
        # onto the worker's local model.
        self.down_sync = tf.group(
            *[v1.assign(v2) for v1, v2 in zip(pi.vars, self.network.vars)])

        # Training operation which applies the gradients on the parameter-server
        # network (up sync).
        optimiser = tf.train.RMSPropOptimizer(learning_rate=0.0007)
        grads_and_global_vars = list(zip(grads, self.network.vars))
        inc_step = self.global_step.assign_add(tf.shape(self.action)[0])
        self.train_op = tf.group(
            *[optimiser.apply_gradients(grads_and_global_vars), inc_step])

        # 20 represents the number of "local steps": the number of timesteps we run
        # the policy before we update the parameters. The larger local steps is, the
        # lower the variance in our policy gradient estimates; on the other hand, we
        # get less frequent parameter updates, which slows down learning. In this code,
        # we found that making local steps much smaller than 20 makes the algorithm
        # more difficult to tune and to get to work.
        self.env_runner = RunnerThread(KalahEnvironment(), pi)
        episode_size = tf.to_float(tf.shape(pi.value)[0])

        if use_tf12_api:
            tf.summary.scalar("model/policy_loss", self.policy_loss / episode_size)
            tf.summary.scalar("model/value_loss", self.value_loss / episode_size)
            tf.summary.scalar("model/entropy", self.entropy / episode_size)
            tf.summary.scalar("model/grad_global_norm", self.grad_norms)
            tf.summary.scalar("model/var_global_norm", tf.global_norm(pi.vars))
            self.summary_op = tf.summary.merge_all()
        else:
            tf.scalar_summary("model/policy_loss", self.policy_loss / episode_size)
            tf.scalar_summary("model/value_loss", self.value_loss / episode_size)
            tf.scalar_summary("model/entropy", self.entropy / episode_size)
            tf.scalar_summary("model/grad_global_norm", self.grad_norms)
            tf.scalar_summary("model/var_global_norm", tf.global_norm(pi.vars))
            self.summary_op = tf.merge_all_summaries()

        self.summary_writer = None
        self.local_steps = 0
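# The target_v and advantage placeholders are fed from rollouts collected by the
# RunnerThread. The helper below is a generic sketch of how such targets are commonly
# derived (discounted returns plus one-step advantages); the names `rewards`, `values`
# and `bootstrap_value` are assumptions about what a rollout contains, and the project's
# actual rollout processing may differ.
import numpy as np

def compute_targets_sketch(rewards, values, bootstrap_value, gamma=0.99):
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(list(values) + [bootstrap_value], dtype=np.float32)

    # Discounted returns, bootstrapped from the value estimate of the last state.
    returns = np.zeros_like(rewards)
    running = bootstrap_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running

    # One-step advantage estimate: r_t + gamma * V(s_{t+1}) - V(s_t).
    advantages = rewards + gamma * values[1:] - values[:-1]
    return returns, advantages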
def main():
    policies = MonteCarloA3CPolicies()
    mcts = MCTS(run_duration=20, policies=policies)
    state = KalahEnvironment()
    run(mcts, state)
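# The run(mcts, state) driver invoked by main() is not shown. A plausible minimal game
# loop, assuming KalahEnvironment exposes an is_over()-style terminal check (hypothetical
# name) alongside do_move(); the real driver likely also alternates agents and handles logging.
def run_sketch(mcts, state):
    # Play moves suggested by the search until the game ends.
    while not state.is_over():  # is_over() is an assumed helper, not confirmed by the source
        move = mcts.parrallelized(state)
        state.do_move(move)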
def run(args, server):
    env = KalahEnvironment()
    trainer = A3C(env, args.task)

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [v for v in tf.all_variables() if not v.name.startswith("local")]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()

    saver = FastSaver(variables_to_save)
    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train_result_a3c_final')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)
    # Alternative setup via tf.train.MonitoredTrainingSession with a Scaffold and a
    # SummarySaverHook (not used).

    num_global_steps = 400000000

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. " +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        # Restore the latest checkpoint from a hard-coded directory, then pull the
        # parameter-server weights onto the local model.
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint_dir="tmp/logs-2/train")
        saver.restore(sess=sess, save_path=checkpoint_path.model_checkpoint_path)
        sess.run(trainer.down_sync)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
            trainer.play(sess, RandomAgent(), summary_writer)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
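# run(args, server) expects a distributed TensorFlow server handle for this worker.
# A minimal sketch of how such a server could be built from a cluster specification
# before calling run(); the host/port values and the helper name are placeholders.
import tensorflow as tf

def make_worker_server_sketch(task_index,
                              ps_hosts=("127.0.0.1:12222",),
                              worker_hosts=("127.0.0.1:12223", "127.0.0.1:12224")):
    # Describe the parameter-server and worker jobs, then start this worker's server.
    cluster = tf.train.ClusterSpec({"ps": list(ps_hosts), "worker": list(worker_hosts)})
    return tf.train.Server(cluster, job_name="worker", task_index=task_index)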
def sample(self, env: KalahEnvironment) -> (int, float):
    flip_board = env.side_to_play == Side.NORTH
    state = env.board.get_board_image(flipped=flip_board)
    mask = env.get_mask()
    return self.network.sample(state=state, mask=mask)