def train_value_network(self, rpc, sample_num=1000, max_time_steps=225, epochs=20, batch_size=32):
    """
    Train the value network on self-play samples.
    :param rpc: RPC client serving the policy DL / policy RL networks
    :param sample_num: number of fresh samples to generate (0 reuses the stored sample file)
    :return:
    """
    model_params = self.param_unserierlize(init_params={"global_step": 0, "global_epoch": 0})
    if sample_num > 0:  # create sample
        start_time = time.time()
        sample_file = "data/value_net_phase_%d_samples_%d.pkl" % (self.phase, sample_num)
        sample_games = sampling_for_value_network(rpc, sample_num, sample_file, max_time_steps=max_time_steps)
        elapsed_time = int((time.time() - start_time) * 1000)
        logger.info("sampling for value network, samples=%d, time=%d(ms)" % (sample_num, elapsed_time))
        cPickle.dump(sample_games, open(sample_file, "wb"), protocol=2)
        logger.info("save sample file: %s" % sample_file)
        model_params["sample_file"] = sample_file
        self.param_serierlize(model_params)
    else:  # load old sample
        if 'sample_file' not in model_params:
            logger.error("not found sample file", to_exit=True)
        sample_games = cPickle.load(open(model_params["sample_file"], 'rb'))
    epoch_step, train_step = model_params["global_epoch"], model_params["global_step"]
    while epoch_step < (model_params["global_epoch"] + epochs):
        start_time = time.time()
        epoch_step += 1
        random.shuffle(sample_games)
        avg_loss = 0.0
        for idx in xrange(0, len(sample_games), batch_size):
            end_idx = min(len(sample_games), idx + batch_size)
            mini_samples = sample_games[idx: end_idx]
            # transform sample data
            mini_states = [sampled_game.get_states(player_plane=True) for sampled_game, _ in mini_samples]
            mini_rewards = [sampled_reward for _, sampled_reward in mini_samples]
            fetch_status = self.fit(mini_states, mini_rewards, fetch_info=True)
            _, train_step, loss = fetch_status
            avg_loss += loss
            train_step = int(train_step)
            if train_step % 20 == 0:
                elapsed_time = int((time.time() - start_time) * 1000)
                logger.info("train value network, phase=%d, epoch=%d, step=%d, loss=%.7f, time=%d(ms)" %
                            (self.phase, epoch_step, train_step, loss, elapsed_time))
                start_time = time.time()
        # float division so the number of mini-batches is rounded up correctly
        avg_loss /= math.ceil(len(sample_games) / float(batch_size))
        logger.info("train value network, phase=%d, epoch=%d, avg_loss=%.6f" % (self.phase, epoch_step, avg_loss))
        if epoch_step % 5 == 0:
            # save model
            model_params["global_step"] = train_step
            model_params["global_epoch"] = epoch_step
            self.param_serierlize(model_params)
            model_file = self.save_model("value_net_phase_%d" % self.phase, global_step=model_params["global_step"])
            logger.info("save value network model, file=%s" % model_file)
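# A minimal usage sketch (not from the original source) of how train_value_network above might
# be driven from a script. It assumes train_value_network is a method of ValueNetwork and reuses
# ModelRPC and load_model (defined below); treat it as illustrative only.
def train_value_network_example(args):
    rpc = ModelRPC(args)                        # RPC client serving the policy networks
    value_net = load_model(args, "value_net")   # build + restore the value network (see load_model below)
    # sample_num > 0 triggers fresh self-play sampling; sample_num=0 reuses the stored sample file
    value_net.train_value_network(rpc, sample_num=1000, max_time_steps=225, epochs=20, batch_size=32)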
def load_model(args, model_type, model_file=None):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    pattern_features = args.pattern_features
    if model_type == "policy_dl":
        model = PolicyDLNetwork(policy_planes, corpus, args, filters=args.policy_dl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_dl_models_dir,
                                device="gpu", gpu=args.policy_dl_gpu,
                                optimizer=args.policy_dl_optimizer,
                                learn_rate=args.policy_dl_learn_rate,
                                distributed_train=False)
    elif model_type == "policy_rollout":
        model = PolicyRolloutModel(policy_planes, patterns, args,
                                   board_size=args.board_size,
                                   model_dir=args.policy_rollout_models_dir,
                                   device="cpu",
                                   optimizer=args.policy_rollout_optimizer,
                                   learn_rate=args.policy_rollout_learn_rate,
                                   distributed_train=False)
    elif model_type == "policy_rl":
        model = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase,
                                filters=args.policy_rl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_rl_models_dir,
                                device="cpu",
                                optimizer=args.policy_rl_optimizer,
                                learn_rate=args.policy_rl_learn_rate,
                                distributed_train=False)
    elif model_type == "value_net":
        model = ValueNetwork(value_planes, args, phase=args.values_net_phase,
                             filters=args.values_net_filters,
                             board_size=args.board_size,
                             model_dir=args.values_net_models_dir,
                             device="cpu",
                             optimizer=args.values_net_optimizer,
                             learn_rate=args.values_net_learn_rate)
    else:
        logger.error("unsupported model type=%s" % model_type, to_exit=True)
    # init session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                               allow_soft_placement=True,
                                               gpu_options=gpu_options))
    session.run(tf.initialize_all_variables())
    model.set_session(session)
    # restore model
    status = model.restore_model(model_file=model_file)
    if not status and model_type == "policy_rl":
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        model_file = checkpoint.model_checkpoint_path
        logger.info("successful load model file: %s" % model_file)
        model.saver.restore(session, model_file)
    return model
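# Usage sketch (assumed, not from the original source): building and restoring one network through
# load_model above. "policy_dl", "policy_rollout", "policy_rl" and "value_net" are the supported
# model_type strings; parser_argument() is the CLI parser used elsewhere in this code.
def load_value_net_example():
    args = parser_argument().parse_args()
    return load_model(args, "value_net")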
def train_policy_network_rl(args):
    policy_planes = args.policy_planes
    # rpc of value_net
    rpc = ModelRPC(args)
    if args.policy_rl_reset:
        # empty old rl policy network
        if os.path.exists(args.policy_rl_models_dir):
            # os.removedirs(args.policy_rl_models_dir)
            shutil.rmtree(args.policy_rl_models_dir)
        os.makedirs(args.policy_rl_models_dir)
        # read parameters from DL policy network
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            model_file = checkpoint.model_checkpoint_path
        else:
            logger.error("no available policy dl model found", to_exit=True)
    else:
        model_file = None
    # init policy RL network
    policy_rl = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase,
                                filters=args.policy_rl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_rl_models_dir,
                                gpu=args.policy_rl_gpu,
                                optimizer=args.policy_rl_optimizer,
                                learn_rate=args.policy_rl_learn_rate,
                                distributed_train=False)
    # init session
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False, allow_soft_placement=True))
    session.run(tf.initialize_all_variables())
    policy_rl.set_session(session)
    # restore model if it exists
    if model_file is not None:
        policy_rl.saver.restore(session, model_file)
        logger.info("load model file: %s" % model_file)
        policy_rl.save_model("policy_rl", global_step=0)
    else:
        policy_rl.restore_model()
    # train policy rl
    policy_rl.train_policy_network(rpc, batch_games=args.policy_rl_batch_games,
                                   save_step=args.policy_rl_save_step)
def train_policy_network(self, corpus, epochs=20, batch_size=64, save_step=5):
    """
    Supervised training of the DL policy network.
    :param corpus: corpus yielding (game, action) samples; states are array(15, 15, planes),
                   actions are one-hot vectors over the 225 board positions
    :return:
    """
    start_time = time.time()
    params = self.param_unserierlize(init_params={"epoch": 0, "global_step": 0})
    global_epoch, global_step = int(params["epoch"]), int(params["global_step"])
    epochs_step = global_epoch
    while epochs_step < (global_epoch + epochs):
        epochs_step += 1
        average_loss = 0.0
        average_acc = 0.0
        local_step = 0
        corpus.shuffle_datas()
        elapsed_time = 0.0
        for samples in corpus.iterator_fetch_rows(batch_size):
            sample_states = [sample[0].get_states() for sample in samples]
            sample_actions = [one_hot_action(sample[1]) for sample in samples]
            start_time = time.time()
            fetch_status = self.fit(sample_states, sample_actions, fetch_info=True)
            elapsed_time += int((time.time() - start_time) * 1000)
            _, global_step, loss, acc, lr = fetch_status
            # record loss
            local_step += 1
            average_loss += loss
            average_acc += acc
            # record time
            if global_step % 8 == 0:
                logger.info("train policy dl network, epochs=%d, global_step=%d, loss=%.7f, avg_loss=%.7f, "
                            "acc=%.7f, avg_acc=%.7f, lr=%.7f, time=%d(ms)" %
                            (epochs_step, global_step, loss, average_loss / local_step,
                             acc, average_acc / local_step, lr, elapsed_time))
                elapsed_time = 0
        logger.info("train policy dl network, epochs=%d, average_loss=%.7f, average_acc=%.7f" %
                    (epochs_step, average_loss / local_step, average_acc / local_step))
        if epochs_step % save_step == 0:
            # save model
            self.param_serierlize({"epoch": int(epochs_step), "global_step": int(global_step)})
            filename = self.save_model("policy_dl_epoch_%d" % epochs_step, global_step=global_step)
            logger.info("save policy dl model: %s" % filename)
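# `one_hot_action` is used above but not defined in this file. Below is a minimal sketch of what
# it is assumed to do for a 15x15 board (225 possible moves); the real helper may differ.
import numpy as np

def one_hot_action_sketch(action, board_size=15):
    vec = np.zeros(board_size * board_size, dtype=np.float32)
    vec[action] = 1.0  # action is the flattened board index of the move
    return vec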
def import_RenjuNet(self, file_path):
    if not os.path.exists(file_path):
        logger.error("not found file: %s" % file_path, to_exit=True)
    # read xml file
    bs_tree = BeautifulSoup(open(file_path, 'r').read())
    games = bs_tree.find_all("game")
    # insert moves
    game_num = len(games)
    move_count = 0
    step = 0
    for game in games:
        step += 1
        gid = int(game.attrs["id"])
        moves = game.move.text.strip().replace("%20", " ").split(" ")
        if len(self.db.query("select id from renju WHERE gid=?", gid)) > 0:  # skip when gid already exists
            continue
        renju_game = RenjuGame()
        for mid, move in enumerate(moves):
            move = move.strip()
            if move == "":
                continue
            board_stream = board_to_stream(renju_game.board)
            player = renju_game.player
            row = ord(move[0]) - ord('a')
            col = int(move[1:]) - 1
            action = renju_game.transform_action((row, col))
            # insert
            self.db.execute("insert INTO renju (gid, mid, board, player, action) VALUES (?, ?, ?, ?, ?)",
                            gid, mid, board_stream, player, action)
            # do move
            renju_game.do_move((row, col))
        move_count += len(moves)
        if step % 100 == 0:
            print "load games= %d / %d" % (step, game_num)
    logger.info("newly insert games=%d, moves=%d" % (game_num, move_count))
    print "finish import moves"
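# The move decoding used above, isolated as a small worked example (assumed interpretation of the
# RenjuNet notation): a move like "h8" gives row = ord('h') - ord('a') = 7 and col = 8 - 1 = 7,
# i.e. the centre of a 15x15 board.
def decode_move_sketch(move):
    row = ord(move[0]) - ord('a')   # letter -> 0-based row
    col = int(move[1:]) - 1         # number -> 0-based column
    return row, col

assert decode_move_sketch("h8") == (7, 7)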
def sampling_for_value_network(rpc, sample_num, sample_file, max_time_steps=225):
    """
    :param max_time_steps: max time steps in games
    :return:
    """
    sample_games = []
    if os.path.exists(sample_file):
        sample_games = cPickle.load(open(sample_file, 'rb'))
        logger.info("load sample file: %s, samples=%d" % (sample_file, len(sample_games)))
    sample_sets = set()  # used to check unique sample
    game = RenjuGame()
    record_policy_dl_boards = []
    # move step by policy dl
    game.reset_game()
    record_policy_dl_boards.append(game.replicate_game())
    while True:
        action = game.choose_action(rpc.policy_dl_rpc(board_to_stream(game.board), game.get_player_name()))
        if action is None:
            break
        state, _, terminal = game.step_games(action)
        if terminal:
            break
        record_policy_dl_boards.append(game.replicate_game())
    max_time_steps = min(max_time_steps, len(record_policy_dl_boards)) - 1
    # sample game
    while len(sample_games) < sample_num:
        sampled_game = None
        while True:  # loop to find legal sample
            flag_time_step = random.randint(1, max_time_steps)
            recorded_game = record_policy_dl_boards[flag_time_step - 1].replicate_game()
            random_action = recorded_game.random_action()
            if random_action is None:
                break
            random_state, _, terminal = recorded_game.step_games(random_action)
            if not terminal and not str(random_state) in sample_sets:
                sample_sets.add(str(random_state))
                break
        if random_action is None:  # invalid loop
            continue
        # move step by policy rl
        time_step = flag_time_step
        while True:  # simulate game by policy rl
            actions = rpc.policy_rl_rpc(board_to_stream(recorded_game.board), recorded_game.get_player_name())
            action = recorded_game.choose_action(actions)
            if action is None:  # game drawn
                sampled_reward = 0
                break
            state, reward, terminal = recorded_game.step_games(action)
            time_step += 1
            if time_step == (flag_time_step + 1):  # record board
                sampled_game = recorded_game.replicate_game()
            if terminal:  # record value
                sampled_reward = reward
                break
        if sampled_game is not None:
            sample_games.append((sampled_game, sampled_reward))
            logger.info("sample simulate, sample_step=%d, time_step=%d" % (len(sample_games), time_step))
            if len(sample_games) % 100 == 0:
                cPickle.dump(sample_games, open(sample_file, "wb"), protocol=2)
                logger.info("create value network sample, step=%d" % len(sample_games))
    return sample_games
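# `board_to_stream` is called throughout this code but not defined here. The sketch below shows
# the assumed idea -- flattening the 15x15 board into a compact string so it can cross the RPC
# boundary -- but the project's real encoding may well differ.
def board_to_stream_sketch(board):
    # board: 15x15 iterable of cell values (e.g. 0 empty, 1 black, 2 white)
    return "".join(str(int(cell)) for row in board for cell in row)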
def train_policy_network(self, rpc, batch_games=128, save_step=50000, max_model_pools=5,
                         init_epsilon=0.5, final_epsilon=0.05, explore=1000000,
                         action_repeat=20, mini_batch_size=128):
    """
    data set from self-play
    :return:
    """
    game = RenjuGame()
    batch_states, batch_actions, batch_rewards = deque(), deque(), deque()
    mini_batch_states, mini_batch_actions, mini_batch_rewards = \
        [0] * mini_batch_size, [0] * mini_batch_size, [0] * mini_batch_size
    model_pools = []
    params = self.param_unserierlize(init_params={"global_step": 0, "epsilon": init_epsilon})
    global_step_val, epsilon = params["global_step"], params["epsilon"]
    train_step = 0
    while True:
        start_time = time.time()
        # choose policy network for opponent player from model pools
        if train_step % 10 == 0:
            if len(model_pools) > 0:
                model_file = random.choice(model_pools)
            else:
                model_file = None
            rpc.switch_model("policy_rl", model_file=model_file)
        while len(batch_states) < batch_games:
            # opponent_policy = self.load_history_policy_model(model_file)
            black_opponent = random.choice([True, False])
            # reset game
            game.reset_game()
            # simulate game by current parameter
            states, actions, rewards = [], [], []
            state = game.step_games(None)
            while True:  # loop current game
                # self-play, current model V.S. history model
                if random.random() < epsilon:  # random choose action
                    action = game.random_action()
                else:
                    if (black_opponent and game.player == RenjuGame.PLAYER_BLACK) \
                            or (not black_opponent and game.player == RenjuGame.PLAYER_WHITE):
                        action = game.choose_action(
                            rpc.policy_rl_rpc(board_to_stream(game.board), game.get_player_name()))
                    else:  # current player
                        action = game.choose_action(self.predict([state])[0])
                # step game
                state_n, reward_n, terminal_n = game.step_games(action)
                # print "game=", batch_step, ", move=", transform_action(action)
                # store (state, action)
                states.append(state)
                one_hot_act = one_hot_action(action)
                actions.append(one_hot_act)
                # set new states
                state = state_n
                if terminal_n:
                    final_reward = reward_n
                    # logger.info("winner=%s" % ("black" if reward_n > 0 else "white"))
                    break
                # check whether game drawn
                if game.random_action() is None:  # game drawn, equal end, reward=0
                    final_reward = 0
                    logger.info("game drawn, so amazing...")
                    break
            # store (reward)
            for step in xrange(len(states)):
                if step % 2 == 0:
                    rewards.append(final_reward)
                else:
                    rewards.append(-final_reward)
            # store states of ith game
            batch_states.append(states)
            batch_actions.append(actions)
            batch_rewards.append(rewards)
        # fit model by mini batch
        avg_loss, avg_acc = 0.0, 0.0
        for _ in xrange(action_repeat):
            train_step += 1
            for idx in xrange(mini_batch_size):
                game_idx = random.randint(0, len(batch_states) - 1)
                game_time_step_idx = random.randint(0, len(batch_states[game_idx]) - 1)
                mini_batch_states[idx] = batch_states[game_idx][game_time_step_idx]
                mini_batch_actions[idx] = batch_actions[game_idx][game_time_step_idx]
                mini_batch_rewards[idx] = batch_rewards[game_idx][game_time_step_idx]
            _, global_step_val, loss, acc = self.fit(mini_batch_states, mini_batch_actions, mini_batch_rewards,
                                                     fetch_info=True)
            avg_loss += loss
            avg_acc += acc
            # update epsilon
            if epsilon > final_epsilon:
                epsilon -= (init_epsilon - final_epsilon) / explore
        avg_loss /= action_repeat
        avg_acc /= action_repeat
        batch_states.popleft()
        batch_actions.popleft()
        batch_rewards.popleft()
        global_step_val = int(global_step_val)
        elapsed_time = int(time.time() - start_time)
        logger.info("train policy rl network, step=%d, epsilon=%.5f, loss=%.6f, acc=%.6f, time=%d(sec)" %
                    (global_step_val, epsilon, avg_loss, avg_acc, elapsed_time))
        # save model
        if train_step % save_step == 0:
            params["global_step"], params["epsilon"] = global_step_val, epsilon
            self.param_serierlize(params)
            model_file = self.save_model("policy_rl", global_step=global_step_val)
            logger.info("save policy rl model, file=%s" % model_file)
            model_file = model_file[len(self.model_dir):]
            # add history model to pool
            model_pools.append(model_file)
            if len(model_pools) > max_model_pools:  # pop head when model pools exceed
                model_pools.pop(0)
            logger.info("model pools has files: [%s]" % (", ".join(model_pools)))
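# The reward bookkeeping used above, isolated as a sketch: the terminal reward is stored with
# alternating sign, so even-indexed positions get +final_reward and odd-indexed positions get
# -final_reward (assumed intent: label each stored sample from that side's perspective).
def assign_rewards_sketch(num_moves, final_reward):
    return [final_reward if step % 2 == 0 else -final_reward for step in xrange(num_moves)]

# e.g. a 5-move game: assign_rewards_sketch(5, 1) -> [1, -1, 1, -1, 1]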
def train(epochs=200):
    param_file = "%s/param.json" % train_dir
    params = param_unserierlize(param_file, init_params={"epoch": 0, "global_step": 0})
    global_epoch, global_step_val = int(params["epoch"]), int(params["global_step"])
    """Train for a number of steps."""
    with tf.Graph().as_default(), tf.device('/job:ps/task:0/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(global_step_val), trainable=False)
        # Calculate the learning rate schedule.
        num_batchs_per_epochs = corpus.num_batchs_per_epochs(BATCH_SIZE)
        print("num_batches_per_epoch: %d" % num_batchs_per_epochs)
        decay_steps = int(num_batchs_per_epochs * NUM_EPOCHS_PER_DECAY)
        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)
        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)
        # Calculate the gradients for each model tower.
        tower_grads = []
        tower_acc = []
        for i in xrange(len(CLUSTER_CONFIG["worker_hosts"])):
            gpu_device = CLUSTER_CONFIG["worker_hosts"][i][1]
            with tf.device('/job:worker/task:%d/%s' % (i, gpu_device)):
                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                    # Build the loss for this tower; variables are shared across all towers.
                    loss = tower_loss(scope, CLUSTER_CONFIG["worker_hosts"][i][2])
                    # all accuracy
                    tower_acc.append(tf.get_collection('accuracy', scope)[0])
                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()
                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
                    # Calculate the gradients for the batch of data on this tower.
                    grads = opt.compute_gradients(loss)
                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)
        # average accuracy
        accuracy = tf.add_n(tower_acc) / len(tower_acc)
        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)
        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))
        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(tf.histogram_summary(var.op.name + '/gradients', grad))
        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))
        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)
        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())
        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)
        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU implementations.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
        sess = tf.Session("grpc://" + CLUSTER_CONFIG["worker_hosts"][0][0],
                          config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=True,
                                                gpu_options=gpu_options))
        sess.run(init)
        # restore model
        restore_model(sess, train_dir, saver)
        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)
        graph_def = sess.graph.as_graph_def(add_shapes=True)
        summary_writer = tf.train.SummaryWriter(train_dir, graph_def=graph_def)
        avg_loss, avg_acc = [0] * num_batchs_per_epochs, [0] * num_batchs_per_epochs
        epochs_step = global_epoch + 1
        step = 0
        while epochs_step <= (global_epoch + epochs):
            step += 1
            start_time = time.time()
            _, loss_value, acc_value, global_step_val = sess.run([train_op, loss, accuracy, global_step])
            elapsed_time = int((time.time() - start_time) * 1000)
            avg_loss[step % num_batchs_per_epochs] = loss_value
            avg_acc[step % num_batchs_per_epochs] = acc_value
            global_step_val = int(global_step_val)
            if global_step_val % 2 == 0:
                logger.info("train policy dl dist network, epoch=%d, step=%d, loss=%.6f, acc=%.6f, time=%d(ms)" %
                            (epochs_step, step, loss_value, acc_value, elapsed_time))
            if global_step_val % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if step > num_batchs_per_epochs:
                step = step % num_batchs_per_epochs
                epochs_step += 1
                average_loss = sum(avg_loss) / len(avg_loss)
                average_acc = sum(avg_acc) / len(avg_acc)
                logger.info("train policy dl dist network, epochs=%d, average_loss=%.7f, average_acc=%.7f" %
                            (epochs_step, average_loss, average_acc))
            # Save the model checkpoint periodically.
            if step % num_batchs_per_epochs == 0 and epochs_step % 20 == 0:
                param_serierlize(param_file, {"epoch": int(epochs_step), "global_step": int(global_step_val)})
                filename = save_model(sess, train_dir, saver, "policy_dl_epoch_%d" % epochs_step,
                                      global_step=global_step_val)
                logger.info("save policy dl dist model: %s" % filename)
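# `average_gradients` is referenced above but not shown in this file. A sketch along the lines of
# the TensorFlow 0.x CIFAR-10 multi-GPU example (which this training loop appears to follow) is
# given below; the project's own implementation may differ.
import tensorflow as tf

def average_gradients_sketch(tower_grads):
    """Average per-variable gradients across towers; tower_grads is a list of compute_gradients() outputs."""
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars holds ((grad0, var), (grad1, var), ...) for one variable across all towers
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(0, grads), 0)  # TF 0.x concat signature: concat(dim, values)
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads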
def train_policy_network_rl_distribute(args):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    # hosts
    ps_hosts = args.ps_hosts.split(",")
    worker_hosts = args.worker_hosts.split(",")
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    # Create and start a server for the local task.
    server = tf.train.Server(cluster, job_name=args.job_name, task_index=args.task_index)
    if args.job_name == "ps":
        server.join()
    elif args.job_name == "worker":
        if args.policy_rl_reset:
            # empty old rl policy network
            if os.path.exists(args.policy_rl_models_dir):
                # os.removedirs(args.policy_rl_models_dir)
                shutil.rmtree(args.policy_rl_models_dir)
            os.makedirs(args.policy_rl_models_dir)
            # read parameters from DL policy network
            checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                model_file = checkpoint.model_checkpoint_path
            else:
                logger.error("no available policy dl model found", to_exit=True)
        else:
            model_file = None
        # init policy RL network
        policy_rl = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase,
                                    filters=args.policy_rl_filters,
                                    board_size=args.board_size,
                                    model_dir=args.policy_rl_models_dir,
                                    gpu=args.policy_rl_gpu,
                                    optimizer=args.policy_rl_optimizer,
                                    learn_rate=args.policy_rl_learn_rate,
                                    distributed_train=True)
        init_op = tf.initialize_all_variables()
        summary_op = tf.merge_all_summaries()
        sv = tf.train.Supervisor(is_chief=(args.task_index == 0),
                                 logdir=policy_rl.model_dir,
                                 init_op=init_op,
                                 summary_op=summary_op,
                                 saver=policy_rl.saver,
                                 global_step=policy_rl.global_step,
                                 save_model_secs=0)
        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=tf.ConfigProto(allow_soft_placement=True,
                                                                    log_device_placement=True))
        sess.run(init_op)
        # Start queue runners for the input pipelines (if any).
        sv.start_queue_runners(sess)
        policy_rl.set_session(sess)
        if model_file is not None:
            policy_rl.saver.restore(sess, model_file)
            logger.info("load model file: %s" % model_file)
        else:
            policy_rl.restore_model()
        # load value network
        if args.policy_rl_phase > 1:
            value_dl = ValueNetwork(value_planes, phase=args.values_net_phase,
                                    filters=args.values_net_filters,
                                    board_size=args.board_size,
                                    model_dir=args.values_net_models_dir,
                                    gpu=args.values_net_gpu,
                                    optimizer=args.values_net_optimizer,
                                    learn_rate=args.values_net_learn_rate)
        else:
            value_dl = None
        # train policy rl
        policy_rl.train_policy_network(value_dl, epochs=args.policy_rl_epochs,
                                       batch_games=args.policy_rl_batch_games,
                                       save_step=args.policy_rl_save_step)
def train_rl_network(batch_games=128, save_step=10000, max_model_pools=5,
                     init_epsilon=0.5, final_epsilon=0.01, explore=1000000,
                     action_repeat=32, mini_batch_size=64):
    """
    data set from self-play
    :return:
    """
    args = parser_argument().parse_args()
    rpc = ModelRPC(args)
    game = RenjuGame()
    batch_states, batch_actions, batch_rewards = deque(), deque(), deque()
    mini_batch_states, mini_batch_actions, mini_batch_rewards = \
        [0] * mini_batch_size, [0] * mini_batch_size, [0] * mini_batch_size
    model_pools = []
    param_file = "%s/param.json" % train_dir
    params = param_unserierlize(param_file, init_params={"global_step": 0, "epsilon": init_epsilon})
    global_step_val, epsilon = params["global_step"], params["epsilon"]
    # load model
    sess, saver, summary_writer, train_op, loss, accuracy, global_step, lr, tower_feeds, tower_logits = network()
    train_step = 0
    while True:
        start_time = time.time()
        # choose policy network for opponent player from model pools
        if train_step % 10 == 0:
            if len(model_pools) > 0:
                model_file = random.choice(model_pools)
            else:
                model_file = None
            rpc.switch_model("policy_rl", model_file=model_file)
        while len(batch_states) < batch_games:
            # opponent_policy = self.load_history_policy_model(model_file)
            black_opponent = random.choice([True, False])
            # reset game
            game.reset_game()
            # simulate game by current parameter
            states, actions, rewards = [], [], []
            state = game.step_games(None)
            while True:  # loop current game
                # self-play, current model V.S. history model
                if (black_opponent and game.player == RenjuGame.PLAYER_BLACK) \
                        or (not black_opponent and game.player == RenjuGame.PLAYER_WHITE):
                    predict_probs = rpc.policy_rl_rpc(board_to_stream(game.board), game.get_player_name())
                else:  # current player
                    predict_probs = sess.run([tower_logits[0]], feed_dict={tower_feeds[0][0]: [state]})[0][0]
                if random.random() < epsilon:  # random choose action
                    action = game.weighted_choose_action(predict_probs)
                else:
                    action = game.choose_action(predict_probs)
                if action is None:
                    final_reward = 0
                    break
                # step game
                state_n, reward_n, terminal_n = game.step_games(action)
                # store (state, action)
                states.append(state)
                actions.append(action)
                # set new states
                state = state_n
                if terminal_n:
                    final_reward = reward_n
                    break
                # check whether game drawn
                if game.random_action() is None:  # game drawn, equal end, reward=0
                    final_reward = 0
                    logger.info("game drawn, so amazing...")
                    break
            # store (reward)
            for step in xrange(len(states)):
                if step % 2 == 0:
                    rewards.append(final_reward)
                else:
                    rewards.append(-final_reward)
            # store states of ith game
            batch_states.append(states)
            batch_actions.append(actions)
            batch_rewards.append(rewards)
        # fit model by mini batch
        avg_loss, avg_acc = 0.0, 0.0
        for _ in xrange(action_repeat / gpu_num):
            train_step += 1
            feeds = {}
            for gpu_id in xrange(gpu_num):
                for idx in xrange(mini_batch_size):
                    game_idx = random.randint(0, len(batch_states) - 1)
                    game_time_step_idx = random.randint(0, len(batch_states[game_idx]) - 1)
                    mini_batch_states[idx] = batch_states[game_idx][game_time_step_idx]
                    mini_batch_actions[idx] = batch_actions[game_idx][game_time_step_idx]
                    mini_batch_rewards[idx] = batch_rewards[game_idx][game_time_step_idx]
                # copy the lists so each tower keeps its own mini-batch instead of aliasing the last one
                feeds[tower_feeds[gpu_id][0]] = list(mini_batch_states)
                feeds[tower_feeds[gpu_id][1]] = list(mini_batch_actions)
                feeds[tower_feeds[gpu_id][2]] = list(mini_batch_rewards)
            _, global_step_val, loss_val, acc_val = sess.run([train_op, global_step, loss, accuracy],
                                                             feed_dict=feeds)
            avg_loss += loss_val
            avg_acc += acc_val
            # update epsilon
            if epsilon > final_epsilon:
                epsilon -= (init_epsilon - final_epsilon) / explore
        avg_loss /= action_repeat
        avg_acc /= action_repeat
        batch_states.popleft()
        batch_actions.popleft()
        batch_rewards.popleft()
        global_step_val = int(global_step_val)
        elapsed_time = int(time.time() - start_time)
        logger.info("train policy rl network, step=%d, epsilon=%.5f, loss=%.6f, acc=%.6f, time=%d(sec)" %
                    (train_step, epsilon, avg_loss, avg_acc, elapsed_time))
        # save model
        if train_step % save_step == 0:
            params["global_step"], params["epsilon"] = global_step_val, epsilon
            param_serierlize(param_file, params)
            model_file = save_model(sess, train_dir, saver, "policy_rl_step_%d" % train_step,
                                    global_step=global_step_val)
            logger.info("save policy rl model, file=%s" % model_file)
            model_file = model_file[len(train_dir):]
            # add history model to pool
            model_pools.append(model_file)
            if len(model_pools) > max_model_pools:  # pop head when model pools exceed
                model_pools.pop(0)
            logger.info("model pools has files: [%s]" % (", ".join(model_pools)))
def train(epochs=200, predict=False):
    param_file = "%s/param.json" % train_dir
    params = param_unserierlize(param_file, init_params={"epoch": 0, "global_step": 0})
    global_epoch, global_step_val = int(params["epoch"]), int(params["global_step"])
    """Train for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(global_step_val), trainable=False)
        # Calculate the learning rate schedule.
        num_batchs_per_epochs = int(corpus.num_batchs_per_epochs(BATCH_SIZE))
        print("num_batches_per_epoch: %d" % num_batchs_per_epochs)
        decay_steps = int(num_batchs_per_epochs / gpu_num * NUM_EPOCHS_PER_DECAY)
        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)
        # Create an optimizer that performs gradient descent.
        # opt = tf.train.GradientDescentOptimizer(lr)
        opt = tf.train.AdamOptimizer(lr)
        # Calculate the gradients for each model tower.
        tower_grads = []
        tower_acc = []
        tower_feeds = []
        for i in xrange(gpu_num):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                    # Build the loss for this tower; variables are shared across all towers.
                    batch_input = tf.placeholder(tf.float32, [None, 15 * 15 * planes])
                    batch_labels = tf.placeholder(tf.float32, shape=[None])
                    tower_feeds.append((batch_input, batch_labels))
                    loss = tower_loss(scope, batch_input, batch_labels)
                    # all accuracy
                    tower_acc.append(tf.get_collection('accuracy', scope)[0])
                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()
                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
                    # Calculate the gradients for the batch of data on this tower.
                    grads = opt.compute_gradients(loss)
                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)
        # average accuracy
        accuracy = tf.add_n(tower_acc) / len(tower_acc)
        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)
        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))
        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(tf.histogram_summary(var.op.name + '/gradients', grad))
        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))
        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)
        # Create a saver.
        # saver = tf.train.Saver(tf.all_variables())
        saver = tf.train.Saver()
        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)
        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU implementations.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=GPU_MEMERY_ALLOCATE)
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False,
                                                gpu_options=gpu_options))
        sess.run(init)
        # restore model
        restore_model(sess, train_dir, saver)
        if predict:
            return sess, saver
        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)
        graph_def = sess.graph.as_graph_def(add_shapes=True)
        summary_writer = tf.train.SummaryWriter(train_dir, graph_def=graph_def)
        avg_loss, avg_acc = [], []
        epochs_step = global_epoch + 1
        step = 0
        while epochs_step <= (global_epoch + epochs):
            step += gpu_num
            start_time = time.time()
            # _, loss_value, acc_value, global_step_val = sess.run([train_op, loss, accuracy, global_step])
            feeds = {}
            for idx in xrange(gpu_num):
                samples = corpus.next_fetch_rows(BATCH_SIZE)
                feature = np.array([sample[0].get_states(flatten=True) for sample in samples], dtype=np.float32)
                labels = np.array([sample[1] for sample in samples], dtype=np.float32)
                feeds[tower_feeds[idx][0]] = feature
                feeds[tower_feeds[idx][1]] = labels
            _, loss_value, acc_value, global_step_val, learn_rating = sess.run(
                [train_op, loss, accuracy, global_step, lr], feed_dict=feeds)
            elapsed_time = int((time.time() - start_time) * 1000)
            avg_loss.append(loss_value)
            avg_acc.append(acc_value)
            global_step_val = int(global_step_val)
            if global_step_val % 10 == 0:
                logger.info("train policy rollout multi_GPU network, epoch=%d, step=%d, loss=%.6f, acc=%.6f, "
                            "lr=%.6f, time=%d(ms)" %
                            (epochs_step, step, loss_value, acc_value, learn_rating, elapsed_time))
            # if global_step_val % 100 == 0:
            #     summary_str = sess.run(summary_op)
            #     summary_writer.add_summary(summary_str, step)
            if step > num_batchs_per_epochs:
                step = step % num_batchs_per_epochs
                epochs_step += 1
                average_loss = sum(avg_loss) / len(avg_loss)
                average_acc = sum(avg_acc) / len(avg_acc)
                avg_loss, avg_acc = [], []
                logger.info("train policy rollout multi_GPU network, epochs=%d, average_loss=%.7f, average_acc=%.7f" %
                            (epochs_step, average_loss, average_acc))
                # Save the model checkpoint periodically.
                if epochs_step % 5 == 0:
                    param_serierlize(param_file, {"epoch": int(epochs_step), "global_step": int(global_step_val)})
                    filename = save_model(sess, train_dir, saver, "policy_rollout_epoch_%d" % epochs_step,
                                          global_step=global_step_val)
                    logger.info("save policy rollout multi_GPU model: %s" % filename)
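# The learning-rate schedule used by both train() functions, written out as a plain-Python sketch
# so the staircase behaviour is explicit: with staircase=True the rate drops by
# LEARNING_RATE_DECAY_FACTOR once every decay_steps global steps. The constants below are only
# example values, not the project's settings.
def staircase_lr_sketch(initial_lr, decay_factor, decay_steps, global_step):
    return initial_lr * (decay_factor ** (global_step // decay_steps))

# e.g. initial_lr=0.1, decay_factor=0.5, decay_steps=1000:
# steps 0..999 -> 0.1, steps 1000..1999 -> 0.05, steps 2000..2999 -> 0.025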