def selfplay(self):
    data_node = tf.placeholder(tf.float32, shape=(1, INPUT_WIDTH, INPUT_WIDTH, INPUT_DEPTH))

    # Policy network being trained, restored from the supervised-learning checkpoint.
    slnet = SLNetwork()
    slnet.declare_layers(num_hidden_layer=8)
    this_logits = slnet.model(data_node)
    saver = tf.train.Saver(max_to_keep=MAX_NUM_MODEL_TO_KEEP)
    print(this_logits.name)
    sl_model = os.path.join(MODELS_DIR, SLMODEL_NAME)
    sess = tf.Session()
    saver.restore(sess, sl_model)

    # Opponent network: an identical copy living in its own variable scope.
    with tf.variable_scope("other_nn_player"):
        slnet2 = SLNetwork()
        slnet2.declare_layers(num_hidden_layer=8)
        other_logits = slnet2.model(data_node)

    # Use the non-scoped variable names as keys so the scoped opponent copies
    # can be restored from the same SL checkpoint.
    var_dict2 = {slnet.input_layer.weight.op.name: slnet2.input_layer.weight,
                 slnet.input_layer.bias.op.name: slnet2.input_layer.bias}
    for i in range(slnet2.num_hidden_layer):
        var_dict2[slnet.conv_layer[i].weight.op.name] = slnet2.conv_layer[i].weight
        var_dict2[slnet.conv_layer[i].bias.op.name] = slnet2.conv_layer[i].bias
    saver2 = tf.train.Saver(var_list=var_dict2)
    otherSess = tf.Session()
    saver2.restore(otherSess, sl_model)

    batch_game_size = 128
    batch_reward_node = tf.placeholder(dtype=np.float32, shape=(PG_STATE_BATCH_SIZE,))
    batch_data_node = tf.placeholder(dtype=np.float32, shape=(PG_STATE_BATCH_SIZE, INPUT_WIDTH, INPUT_WIDTH, INPUT_DEPTH))
    batch_label_node = tf.placeholder(shape=(PG_STATE_BATCH_SIZE,), dtype=np.int32)

    batch_rewards = np.ndarray(dtype=np.float32, shape=(PG_STATE_BATCH_SIZE,))
    batch_data = np.ndarray(dtype=np.float32, shape=(PG_STATE_BATCH_SIZE, INPUT_WIDTH, INPUT_WIDTH, INPUT_DEPTH))
    batch_labels = np.ndarray(shape=(PG_STATE_BATCH_SIZE,), dtype=np.int32)
    game_rewards = np.ndarray(shape=(batch_game_size,), dtype=np.float32)

    # Policy-gradient loss: cross entropy of the chosen moves, weighted by the
    # signed, discounted game rewards.
    tf.get_variable_scope().reuse_variables()
    logit = slnet.model(batch_data_node)
    entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logit, batch_label_node)
    loss = tf.reduce_mean(tf.mul(batch_reward_node, entropy))
    opt = tf.train.GradientDescentOptimizer(FLAGS.alpha / batch_game_size)
    opt_op = opt.minimize(loss)

    win1 = 0
    win2 = 0
    ite = 0
    g_step = 0
    if not os.path.exists(MODELS_DIR):
        os.makedirs(MODELS_DIR)
        print("no SL model? creating pg model dir")
    else:
        for f in os.listdir(MODELS_DIR):
            if f.startswith(PGMODEL_NAME):
                try:
                    os.remove(os.path.join(MODELS_DIR, f))
                except OSError as e:
                    print(e.strerror)
        print("removing old models in pg dir")

    while ite < FLAGS.max_iterations:
        start_batch_time = time.time()
        games, _tmp1, _tmp2 = self.play_one_batch_games(sess, otherSess, this_logits, other_logits,
                                                        data_node, batch_game_size, game_rewards)
        win1 += _tmp1
        win2 += _tmp2
        data_tool = data_util(games, PG_STATE_BATCH_SIZE, batch_data, batch_labels)
        data_tool.disable_symmetry_checking()
        offset1 = 0
        offset2 = 0
        nepoch = 0
        while nepoch < 1:
            o1, o2, next_epoch = data_tool.prepare_batch(offset1, offset2)
            if next_epoch:
                nepoch += 1
            k = 0
            batch_rewards.fill(0)
            new_o1, new_o2 = 0, 0
            # Assign each position in the batch its discounted, sign-alternating
            # share of the final game reward.
            while k < PG_STATE_BATCH_SIZE:
                for i in range(offset1, batch_game_size):
                    R = game_rewards[i]
                    sign = 1 if offset2 % 2 == 0 else -1
                    for j in range(offset2, len(games[i]) - 1):
                        batch_rewards[k] = R * sign * (FLAGS.gamma ** (len(games[i]) - 2 - j))
                        sign = -sign
                        k += 1
                        if k >= PG_STATE_BATCH_SIZE:
                            new_o1 = i
                            new_o2 = j + 1
                            break
                    offset2 = 0
                    if k >= PG_STATE_BATCH_SIZE:
                        break
                if k < PG_STATE_BATCH_SIZE:
                    offset1, offset2 = 0, 0
            assert new_o1 == o1 and new_o2 == o2
            offset1, offset2 = o1, o2
            sess.run(opt_op, feed_dict={batch_data_node: batch_data,
                                        batch_label_node: batch_labels,
                                        batch_reward_node: batch_rewards})
        print("time cost for one batch of %d games" % PG_GAME_BATCH_SIZE, time.time() - start_batch_time)
        ite += 1
        if ite % FLAGS.frequency == 0:
            # Periodically snapshot the learner and restore the opponent from a
            # model chosen by select_model.
            saver.save(sess, os.path.join(MODELS_DIR, PGMODEL_NAME), global_step=g_step)
            pg_model = self.select_model(MODELS_DIR)
            saver2.restore(otherSess, pg_model)
            g_step += 1
            print("Replace opponent with new model,", g_step)
            print(pg_model)
        ite += 1
    print("In total, this win", win1, "opponent win", win2)
    otherSess.close()
    sess.close()
class NNAgent(object):

    def __init__(self, model_location, name, is_value_net=False):
        self.model_path = model_location
        self.agent_name = name
        self.is_value_net = is_value_net
        self.initialize_game()

    def initialize_game(self):
        self.game_state = []
        self.boardtensor = np.zeros(dtype=np.float32, shape=(1, INPUT_WIDTH, INPUT_WIDTH, INPUT_DEPTH))
        make_empty_board_tensor(self.boardtensor)
        self.load_model()

    def load_model(self):
        self.data_node = tf.placeholder(tf.float32, shape=(1, INPUT_WIDTH, INPUT_WIDTH, INPUT_DEPTH))
        self.sess = tf.Session()
        if self.is_value_net:
            from valuenet import ValueNet
            self.vnet = ValueNet()
            self.keep_prob_node = tf.placeholder(tf.float32)
            self.value = self.vnet.model(self.data_node, keep_prob_node=self.keep_prob_node)
            self.position_values = np.ndarray(dtype=np.float32, shape=(BOARD_SIZE**2,))
        else:
            self.net = SLNetwork()
            self.net.declare_layers(num_hidden_layer=5)
            self.logit = self.net.model(self.data_node)
        saver = tf.train.Saver()
        saver.restore(self.sess, self.model_path)

    def reinitialize(self):
        self.game_state = []
        make_empty_board_tensor(self.boardtensor)

    # 0-black player, 1-white player
    def play_move(self, intplayer, intmove):
        update_tensor(self.boardtensor, intplayer, intmove)
        self.game_state.append(intmove)

    def generate_move(self, intplayer=None):
        if self.is_value_net:
            # Value net: try every empty position, evaluate the resulting state,
            # then select among the position values with a softmax.
            s = list(self.game_state)
            empty_positions = [i for i in range(BOARD_SIZE**2) if i not in s]
            self.position_values.fill(0.0)
            for intmove in empty_positions:
                update_tensor(self.boardtensor, intplayer, intmove)
                v = self.sess.run(self.value,
                                  feed_dict={self.data_node: self.boardtensor,
                                             self.keep_prob_node: 1.0})  # dropout off at inference
                undo_update_tensor(self.boardtensor, intplayer, intmove)
                self.position_values[intmove] = v
            im = softmax_selection(self.position_values, self.game_state, temperature=0.1)
            # im = max_selection(self.position_values, self.game_state)
            return im
        else:
            # Policy net: sample a move from the network's logits for the current board.
            logits = self.sess.run(self.logit, feed_dict={self.data_node: self.boardtensor})
            intmove = softmax_selection(logits, self.game_state)
            # intmove = max_selection(logits, self.game_state)
            raw_move = intmove_to_raw(intmove)
            assert ord('a') <= ord(raw_move[0]) <= ord('z') and 0 <= int(raw_move[1:]) < BOARD_SIZE**2
            return raw_move

    def close_all(self):
        self.sess.close()
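# Rough usage sketch for NNAgent (not part of the original code). The checkpoint
# path, the agent name and the commented-out raw_to_intmove helper are hypothetical;
# a real driver would convert the raw move back to its integer encoding before
# replaying it with play_move.
if __name__ == "__main__":
    model_path = "/path/to/sl_model.ckpt"  # hypothetical checkpoint location
    agent = NNAgent(model_path, name="sl_player", is_value_net=False)

    agent.reinitialize()
    # Ask the policy network for a move on the current (empty) board.
    raw_move = agent.generate_move()
    print("agent suggests:", raw_move)
    # Replay the move on the agent's own board tensor to keep its state in sync:
    # agent.play_move(0, raw_to_intmove(raw_move))  # raw_to_intmove: assumed inverse of intmove_to_raw

    agent.close_all()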