Example #1
 def train_value_network(self, rpc, sample_num=1000, max_time_steps=225,
                         epochs=20, batch_size=32):
     """
     :param policy_dl: policy network of deep learning
     :param policy_rl: policy network of reinforcement learning
     :return:
     """
     model_params = self.param_unserierlize(init_params={"global_step": 0, "global_epoch": 0})
     if sample_num > 0:  # create sample
         start_time = time.time()
         sample_file = "data/value_net_phase_%d_samples_%d.pkl" % (self.phase, sample_num)
         sample_games = sampling_for_value_network(rpc, sample_num, sample_file, max_time_steps=max_time_steps)
         elapsed_time = int((time.time() - start_time) * 1000)
         logger.info("sampling for value network, samples=%d, time=%d(ms)" % (sample_num, elapsed_time))
         cPickle.dump(sample_games, open(sample_file, "wb"), protocol=2)
         logger.info("save sample file: %s" % sample_file)
         model_params["sample_file"] = sample_file
         self.param_serierlize(model_params)
     else:  # load old sample
         if 'sample_file' not in model_params:
             logger.error("not found sample file", to_exit=True)
         sample_games = cPickle.load(open(model_params["sample_file"], 'rb'))
     epoch_step, train_step = model_params["global_epoch"], model_params["global_step"]
     while epoch_step < (model_params["global_epoch"] + epochs):
         start_time = time.time()
         epoch_step += 1
         random.shuffle(sample_games)
         avg_loss = 0.0
         for idx in xrange(0, len(sample_games), batch_size):
             end_idx = min(len(sample_games), idx + batch_size)
             mini_samples = sample_games[idx: end_idx]
             # transform sample data
             mini_states = [sampled_game.get_states(player_plane=True) for sampled_game, _ in mini_samples]
             mini_rewards = [sampled_reward for _, sampled_reward in mini_samples]
             fetch_status = self.fit(mini_states, mini_rewards, fetch_info=True)
             _, train_step, loss = fetch_status
             avg_loss += loss
             train_step = int(train_step)
             if train_step % 20 == 0:
                 elapsed_time = int((time.time() - start_time) * 1000)
                 logger.info(
                     "train value network, phase=%d, epoch=%d, step=%d, loss=%.7f, time=%d(ms)" %
                     (self.phase, epoch_step, train_step, loss, elapsed_time))
                 start_time = time.time()
         avg_loss /= math.ceil(len(sample_games) / float(batch_size))  # float division so the mini-batch count is not truncated under Python 2
         logger.info("train value network, phase=%d, epoch=%d, avg_loss=%.6f" % (self.phase, epoch_step, avg_loss))
         if epoch_step % 5 == 0:  # save model
             model_params["global_step"] = train_step
             model_params["global_epoch"] = epoch_step
             self.param_serierlize(model_params)
             model_file = self.save_model("value_net_phase_%d" % self.phase, global_step=model_params["global_step"])
             logger.info("save value network model, file=%s" % model_file)
Example #2
def load_model(args, model_type, model_file=None):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    pattern_features = args.pattern_features
    if model_type == "policy_dl":
        model = PolicyDLNetwork(policy_planes, corpus, args, filters=args.policy_dl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_dl_models_dir, device="gpu", gpu=args.policy_dl_gpu,
                                optimizer=args.policy_dl_optimizer,
                                learn_rate=args.policy_dl_learn_rate,
                                distributed_train=False,
                                )
    elif model_type == "policy_rollout":
        model = PolicyRolloutModel(policy_planes, patterns, args,
                                   board_size=args.board_size,
                                   model_dir=args.policy_rollout_models_dir, device="cpu",
                                   optimizer=args.policy_rollout_optimizer,
                                   learn_rate=args.policy_rollout_learn_rate,
                                   distributed_train=False,
                                   )
    elif model_type == "policy_rl":
        model = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase, filters=args.policy_rl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_rl_models_dir, device="cpu",
                                optimizer=args.policy_rl_optimizer, learn_rate=args.policy_rl_learn_rate,
                                distributed_train=False,
                                )
    elif model_type == "value_net":
        model = ValueNetwork(value_planes, args, phase=args.values_net_phase, filters=args.values_net_filters,
                             board_size=args.board_size,
                             model_dir=args.values_net_models_dir, device="cpu",
                             optimizer=args.values_net_optimizer, learn_rate=args.values_net_learn_rate,
                             )
    else:
        logger.error("unsupported model type=%s" % model_type, to_exit=True)
    # init session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                               allow_soft_placement=True,
                                               gpu_options=gpu_options))
    session.run(tf.initialize_all_variables())
    model.set_session(session)
    # restore model
    status = model.restore_model(model_file=model_file)
    if not status and model_type == "policy_rl":
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        model_file = checkpoint.model_checkpoint_path
        logger.info("successful load model file: %s" % model_file)
        model.saver.restore(session, model_file)
    return model
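
A hypothetical call sketch for the loader above. Note the fallback path: when restore_model finds nothing for a "policy_rl" model, the code seeds it from the latest policy DL checkpoint instead.

# Hypothetical usage; args is the parsed command-line namespace used throughout these examples.
value_net = load_model(args, "value_net")
policy_rl = load_model(args, "policy_rl")  # may restore weights from the policy DL checkpoint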
Example #3
def train_policy_network_rl(args):
    policy_planes = args.policy_planes
    # rpc of value_net
    rpc = ModelRPC(args)
    if args.policy_rl_reset:
        # empty old rl policy network
        if os.path.exists(args.policy_rl_models_dir):
            # os.removedirs(args.policy_rl_models_dir)
            shutil.rmtree(args.policy_rl_models_dir)
        os.makedirs(args.policy_rl_models_dir)
        # read parameters from DL policy network
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            model_file = checkpoint.model_checkpoint_path
        else:
            logger.error("not found policy dl model avaliable", to_exit=True)
    else:
        model_file = None
    # init policy RL network
    policy_rl = PolicyRLNetwork(
        policy_planes,
        args,
        phase=args.policy_rl_phase,
        filters=args.policy_rl_filters,
        board_size=args.board_size,
        model_dir=args.policy_rl_models_dir,
        gpu=args.policy_rl_gpu,
        optimizer=args.policy_rl_optimizer,
        learn_rate=args.policy_rl_learn_rate,
        distributed_train=False,
    )
    # init session
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                               allow_soft_placement=True))
    session.run(tf.initialize_all_variables())
    policy_rl.set_session(session)
    # restore model if exist
    if model_file is not None:
        policy_rl.saver.restore(session, model_file)
        logger.info("load model file: %s" % model_file)
        policy_rl.save_model("policy_rl", global_step=0)
    else:
        policy_rl.restore_model()
    # train policy rl
    policy_rl.train_policy_network(rpc,
                                   batch_games=args.policy_rl_batch_games,
                                   save_step=args.policy_rl_save_step)
Example #4
 def train_policy_network(self,
                          corpus,
                          epochs=20,
                          batch_size=64,
                          save_step=5):
     """
     :param states: [array(15, 15, planes)]
     :param actions:  [one_hot_list(0~225),]
     :return:
     """
     start_time = time.time()
     params = self.param_unserierlize(init_params={
         "epoch": 0,
         "global_step": 0
     })
     global_epoch, global_step = int(params["epoch"]), int(
         params["global_step"])
     epochs_step = global_epoch
     while epochs_step < (global_epoch + epochs):
         epochs_step += 1
         average_loss = 0.0
         average_acc = 0.0
         local_step = 0
         corpus.shuffle_datas()
         elapsed_time = 0.0
         for samples in corpus.iterator_fetch_rows(batch_size):
             sample_states = [sample[0].get_states() for sample in samples]
             sample_actions = [
                 one_hot_action(sample[1]) for sample in samples
             ]
             start_time = time.time()
             fetch_status = self.fit(sample_states,
                                     sample_actions,
                                     fetch_info=True)
             elapsed_time += int((time.time() - start_time) * 1000)
             _, global_step, loss, acc, lr = fetch_status
             # record loss
             local_step += 1
             average_loss += loss
             average_acc += acc
             # record time
             if global_step % 8 == 0:
                 logger.info(
                     "train policy dl network, epochs=%d, global_step=%d, loss=%.7f, avg_loss=%.7f, acc=%.7f, avg_acc=%.7f, lr=%.7f, time=%d(ms)"
                     % (epochs_step, global_step, loss,
                        average_loss / local_step, acc,
                        average_acc / local_step, lr, elapsed_time))
                 elapsed_time = 0
         logger.info(
             "train policy dl network, epochs=%d, average_loss=%.7f, average_acc=%.7f"
             % (epochs_step, average_loss / local_step,
                average_acc / local_step))
         if epochs_step % save_step == 0:  # save model
             self.param_serierlize({
                 "epoch": int(epochs_step),
                 "global_step": int(global_step)
             })
             filename = self.save_model("policy_dl_epoch_%d" % epochs_step,
                                        global_step=global_step)
             logger.info("save policy dl model: %s" % filename)
Example #5
def train_policy_network_rl(args):
    policy_planes = args.policy_planes
    # rpc of value_net
    rpc = ModelRPC(args)
    if args.policy_rl_reset:
        # empty old rl policy network
        if os.path.exists(args.policy_rl_models_dir):
            # os.removedirs(args.policy_rl_models_dir)
            shutil.rmtree(args.policy_rl_models_dir)
        os.makedirs(args.policy_rl_models_dir)
        # read parameters from DL policy network
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            model_file = checkpoint.model_checkpoint_path
        else:
            logger.error("not found policy dl model avaliable", to_exit=True)
    else:
        model_file = None
    # init policy RL network
    policy_rl = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase, filters=args.policy_rl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_rl_models_dir, gpu=args.policy_rl_gpu,
                                optimizer=args.policy_rl_optimizer, learn_rate=args.policy_rl_learn_rate,
                                distributed_train=False,
                                )
    # init session
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False, allow_soft_placement=True))
    session.run(tf.initialize_all_variables())
    policy_rl.set_session(session)
    # restore model if exist
    if model_file is not None:
        policy_rl.saver.restore(session, model_file)
        logger.info("load model file: %s" % model_file)
        policy_rl.save_model("policy_rl", global_step=0)
    else:
        policy_rl.restore_model()
    # train policy rl
    policy_rl.train_policy_network(rpc,
                                   batch_games=args.policy_rl_batch_games,
                                   save_step=args.policy_rl_save_step)
Example #6
 def import_RenjuNet(self, file_path):
     if not os.path.exists(file_path):
         logger.error("not found file: %s" % file_path, to_exit=True)
     # read xml file
     bs_tree = BeautifulSoup(open(file_path, 'r').read())
     games = bs_tree.find_all("game")
     # insert moves
     game_num = len(games)
     move_count = 0
     step = 0
     for game in games:
         step += 1
         gid = int(game.attrs["id"])
         moves = game.move.text.strip().replace("%20", " ").split(" ")
         if len(self.db.query("select id from renju WHERE gid=?", gid)) > 0:  # when gid exists
             continue
         renju_game = RenjuGame()
         for mid, move in enumerate(moves):
             move = move.strip()
             if move == "":
                 continue
             board_stream = board_to_stream(renju_game.board)
             player = renju_game.player
             row = ord(move[0]) - ord('a')
             col = int(move[1:]) - 1
             action = renju_game.transform_action((row, col))
             # insert
             self.db.execute("insert INTO renju (gid, mid, board, player, action) VALUES (?, ?, ?, ?, ?)",
                             gid, mid, board_stream, player, action)
             # do move
             renju_game.do_move((row, col))
         move_count += len(moves)
         if step % 100 == 0:
             print "load games= %d / %d" % (step, game_num)
     logger.info("newly insert games=%d, moves=%d" % (game_num, move_count))
     print "finish import moves"
Example #7
 def train_policy_network(self, corpus, epochs=20, batch_size=64, save_step=5):
     """
     :param states: [array(15, 15, planes)]
     :param actions:  [one_hot_list(0~225),]
     :return:
     """
     start_time = time.time()
     params = self.param_unserierlize(init_params={"epoch": 0, "global_step": 0})
     global_epoch, global_step = int(params["epoch"]), int(params["global_step"])
     epochs_step = global_epoch
     while epochs_step < (global_epoch + epochs):
         epochs_step += 1
         average_loss = 0.0
         average_acc = 0.0
         local_step = 0
         corpus.shuffle_datas()
         elapsed_time = 0.0
         for samples in corpus.iterator_fetch_rows(batch_size):
             sample_states = [sample[0].get_states() for sample in samples]
             sample_actions = [one_hot_action(sample[1]) for sample in samples]
             start_time = time.time()
             fetch_status = self.fit(sample_states, sample_actions, fetch_info=True)
             elapsed_time += int((time.time() - start_time) * 1000)
             _, global_step, loss, acc, lr = fetch_status
             # record loss
             local_step += 1
             average_loss += loss
             average_acc += acc
             # record time
             if global_step % 8 == 0:
                 logger.info("train policy dl network, epochs=%d, global_step=%d, loss=%.7f, avg_loss=%.7f, acc=%.7f, avg_acc=%.7f, lr=%.7f, time=%d(ms)" %
                         (epochs_step, global_step, loss, average_loss / local_step, acc, average_acc / local_step, lr, elapsed_time))
                 elapsed_time = 0
         logger.info("train policy dl network, epochs=%d, average_loss=%.7f, average_acc=%.7f" %
                     (epochs_step, average_loss / local_step, average_acc / local_step))
         if epochs_step % save_step == 0:  # save model
             self.param_serierlize({"epoch": int(epochs_step), "global_step": int(global_step)})
             filename = self.save_model("policy_dl_epoch_%d" % epochs_step, global_step=global_step)
             logger.info("save policy dl model: %s" % filename)
Example #8
def sampling_for_value_network(rpc, sample_num, sample_file, max_time_steps=225):
    """
    :param max_steps: max time steps in games
    :return:
    """
    sample_games = []
    if os.path.exists(sample_file):
        sample_games = cPickle.load(open(sample_file, 'rb'))
        logger.info("load sample file: %s, samples=%d" % (sample_file, len(sample_games)))
    sample_sets = set()  # used to check unique sample
    game = RenjuGame()
    record_policy_dl_boards = []
    # move step by policy dl
    game.reset_game()
    record_policy_dl_boards.append(game.replicate_game())
    while True:
        action = game.choose_action(
            rpc.policy_dl_rpc(board_to_stream(game.board), game.get_player_name()))
        if action is None:
            break
        state, _, terminal = game.step_games(action)
        if terminal:
            break
        record_policy_dl_boards.append(game.replicate_game())
    max_time_steps = min(max_time_steps, len(record_policy_dl_boards)) - 1
    # sample game
    while len(sample_games) < sample_num:
        sampled_game = None
        while True:  # loop to find legal sample
            flag_time_step = random.randint(1, max_time_steps)
            recorded_game = record_policy_dl_boards[flag_time_step - 1].replicate_game()
            random_action = recorded_game.random_action()
            if random_action is None:
                break
            random_state, _, terminal = recorded_game.step_games(random_action)
            if not terminal and not str(random_state) in sample_sets:
                sample_sets.add(str(random_state))
                break
        if random_action is None:  # invalid loop
            continue
        # move step by policy rl
        time_step = flag_time_step
        while True:  # simulate game by policy rl
            actions = rpc.policy_rl_rpc(board_to_stream(recorded_game.board), recorded_game.get_player_name())
            action = recorded_game.choose_action(actions)
            if action is None:  # game drawn
                sampled_reward = 0
                break
            state, reward, terminal = recorded_game.step_games(action)
            time_step += 1
            if time_step == (flag_time_step + 1):  # record board
                sampled_game = recorded_game.replicate_game()
            if terminal:  # record value
                sampled_reward = reward
                break
        if sampled_game is not None:
            sample_games.append((sampled_game, sampled_reward))
            logger.info("sample simulate, sample_step=%d, time_step=%d" % (len(sample_games), time_step))
        if len(sample_games) % 100 == 0:
            cPickle.dump(sample_games, open(sample_file, "wb"), protocol=2)
            logger.info("create value network sample, step=%d" % len(sample_games))
    return sample_games
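
Each element appended to sample_games is a (sampled_game, sampled_reward) pair: the position reached one move after the injected random move, paired with the final outcome of the policy-RL rollout. Below is a minimal inspection sketch, assuming a sample file written by the function above; the file name is illustrative.

import cPickle

samples = cPickle.load(open("data/value_net_phase_1_samples_1000.pkl", "rb"))
print("samples=%d, positive rewards=%d, draws=%d" % (
    len(samples),
    sum(1 for _, r in samples if r > 0),
    sum(1 for _, r in samples if r == 0)))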
Example #9
def sampling_for_value_network(rpc,
                               sample_num,
                               sample_file,
                               max_time_steps=225):
    """
    :param max_steps: max time steps in games
    :return:
    """
    sample_games = []
    if os.path.exists(sample_file):
        sample_games = cPickle.load(open(sample_file, 'rb'))
        logger.info("load sample file: %s, samples=%d" %
                    (sample_file, len(sample_games)))
    sample_sets = set()  # used to check unique sample
    game = RenjuGame()
    record_policy_dl_boards = []
    # move step by policy dl
    game.reset_game()
    record_policy_dl_boards.append(game.replicate_game())
    while True:
        action = game.choose_action(
            rpc.policy_dl_rpc(board_to_stream(game.board),
                              game.get_player_name()))
        if action is None:
            break
        state, _, terminal = game.step_games(action)
        if terminal:
            break
        record_policy_dl_boards.append(game.replicate_game())
    max_time_steps = min(max_time_steps, len(record_policy_dl_boards)) - 1
    # sample game
    while len(sample_games) < sample_num:
        sampled_game = None
        while True:  # loop to find legal sample
            flag_time_step = random.randint(1, max_time_steps)
            recorded_game = record_policy_dl_boards[flag_time_step -
                                                    1].replicate_game()
            random_action = recorded_game.random_action()
            if random_action is None:
                break
            random_state, _, terminal = recorded_game.step_games(random_action)
            if not terminal and not str(random_state) in sample_sets:
                sample_sets.add(str(random_state))
                break
        if random_action is None:  # invalid loop
            continue
        # move step by policy rl
        time_step = flag_time_step
        while True:  # simulate game by policy rl
            actions = rpc.policy_rl_rpc(board_to_stream(recorded_game.board),
                                        recorded_game.get_player_name())
            action = recorded_game.choose_action(actions)
            if action is None:  # game drawn
                sampled_reward = 0
                break
            state, reward, terminal = recorded_game.step_games(action)
            time_step += 1
            if time_step == (flag_time_step + 1):  # record board
                sampled_game = recorded_game.replicate_game()
            if terminal:  # record value
                sampled_reward = reward
                break
        if sampled_game is not None:
            sample_games.append((sampled_game, sampled_reward))
            logger.info("sample simulate, sample_step=%d, time_step=%d" %
                        (len(sample_games), time_step))
        if len(sample_games) % 100 == 0:
            cPickle.dump(sample_games, open(sample_file, "wb"), protocol=2)
            logger.info("create value network sample, step=%d" %
                        len(sample_games))
    return sample_games
Example #10
 def train_value_network(self,
                         rpc,
                         sample_num=1000,
                         max_time_steps=225,
                         epochs=20,
                         batch_size=32):
     """
     :param policy_dl: policy network of deep learning
     :param policy_rl: policy network of reinforcement learning
     :return:
     """
     model_params = self.param_unserierlize(init_params={
         "global_step": 0,
         "global_epoch": 0
     })
     if sample_num > 0:  # create sample
         start_time = time.time()
         sample_file = "data/value_net_phase_%d_samples_%d.pkl" % (
             self.phase, sample_num)
         sample_games = sampling_for_value_network(
             rpc, sample_num, sample_file, max_time_steps=max_time_steps)
         elapsed_time = int((time.time() - start_time) * 1000)
         logger.info("sampling for value network, samples=%d, time=%d(ms)" %
                     (sample_num, elapsed_time))
         cPickle.dump(sample_games, open(sample_file, "wb"), protocol=2)
         logger.info("save sample file: %s" % sample_file)
         model_params["sample_file"] = sample_file
         self.param_serierlize(model_params)
     else:  # load old sample
         if 'sample_file' not in model_params:
             logger.error("not found sample file", to_exit=True)
         sample_games = cPickle.load(open(model_params["sample_file"],
                                          'rb'))
     epoch_step, train_step = model_params["global_epoch"], model_params[
         "global_step"]
     while epoch_step < (model_params["global_epoch"] + epochs):
         start_time = time.time()
         epoch_step += 1
         random.shuffle(sample_games)
         avg_loss = 0.0
         for idx in xrange(0, len(sample_games), batch_size):
             end_idx = min(len(sample_games), idx + batch_size)
             mini_samples = sample_games[idx:end_idx]
             # transform sample data
             mini_states = [
                 sampled_game.get_states(player_plane=True)
                 for sampled_game, _ in mini_samples
             ]
             mini_rewards = [
                 sampled_reward for _, sampled_reward in mini_samples
             ]
             fetch_status = self.fit(mini_states,
                                     mini_rewards,
                                     fetch_info=True)
             _, train_step, loss = fetch_status
             avg_loss += loss
             train_step = int(train_step)
             if train_step % 20 == 0:
                 elapsed_time = int((time.time() - start_time) * 1000)
                 logger.info(
                     "train value network, phase=%d, epoch=%d, step=%d, loss=%.7f, time=%d(ms)"
                     % (self.phase, epoch_step, train_step, loss,
                        elapsed_time))
                 start_time = time.time()
         avg_loss /= math.ceil(len(sample_games) / float(batch_size))  # float division so the mini-batch count is not truncated under Python 2
         logger.info(
             "train value network, phase=%d, epoch=%d, avg_loss=%.6f" %
             (self.phase, epoch_step, avg_loss))
         if epoch_step % 5 == 0:  # save model
             model_params["global_step"] = train_step
             model_params["global_epoch"] = epoch_step
             self.param_serierlize(model_params)
             model_file = self.save_model(
                 "value_net_phase_%d" % self.phase,
                 global_step=model_params["global_step"])
             logger.info("save value network model, file=%s" % model_file)
Example #11
    def train_policy_network(self,
                             rpc,
                             batch_games=128,
                             save_step=50000,
                             max_model_pools=5,
                             init_epsilon=0.5,
                             final_epsilon=0.05,
                             explore=1000000,
                             action_repeat=20,
                             mini_batch_size=128):
        """
            data set from self-play
        :return:
        """
        game = RenjuGame()
        batch_states, batch_actions, batch_rewards = deque(), deque(), deque()
        mini_batch_states = [0] * mini_batch_size
        mini_batch_actions = [0] * mini_batch_size
        mini_batch_rewards = [0] * mini_batch_size
        model_pools = []
        params = self.param_unserierlize(init_params={
            "global_step": 0,
            "epsilon": init_epsilon
        })
        global_step_val, epsilon = params["global_step"], params["epsilon"]
        train_step = 0
        while True:
            start_time = time.time()
            # choose policy network for opponent player from model pools
            if train_step % 10 == 0:
                if len(model_pools) > 0:
                    model_file = random.choice(model_pools)
                else:
                    model_file = None
                rpc.switch_model("policy_rl", model_file=model_file)
            while len(batch_states) < batch_games:
                # opponent_policy = self.load_history_policy_model(model_file)
                black_opponent = random.choice([True, False])
                # reset game
                game.reset_game()
                # simulate game by current parameter
                states, actions, rewards = [], [], []
                state = game.step_games(None)
                while True:  # loop current game
                    # self-play, current model V.S. history model
                    if random.random() < epsilon:  # random choose action
                        action = game.random_action()
                    else:
                        if (black_opponent and game.player == RenjuGame.PLAYER_BLACK) \
                                or (not black_opponent and game.player == RenjuGame.PLAYER_WHITE):
                            action = game.choose_action(
                                rpc.policy_rl_rpc(board_to_stream(game.board),
                                                  game.get_player_name()))
                        else:  # current player
                            action = game.choose_action(
                                self.predict([state])[0])
                    # step game
                    state_n, reward_n, terminal_n = game.step_games(action)
                    # print "game=", batch_step, ", move=", transform_action(action)
                    # store (state, action)
                    states.append(state)
                    one_hot_act = one_hot_action(action)
                    actions.append(one_hot_act)
                    # set new states
                    state = state_n
                    if terminal_n:
                        final_reward = reward_n
                        # logger.info("winner=%s" % ("black" if reward_n > 0 else "white"))
                        break
                    # check whether game drawn
                    if game.random_action() is None:  # game drawn, equal end, reward=0
                        final_reward = 0
                        logger.info("game drawn, so amazing...")
                        break
                # store (reward)
                for step in xrange(len(states)):
                    if step % 2 == 0:
                        rewards.append(final_reward)
                    else:
                        rewards.append(-final_reward)
                # store states of ith game
                batch_states.append(states)
                batch_actions.append(actions)
                batch_rewards.append(rewards)
            # fit model by mini batch
            avg_loss, avg_acc = 0.0, 0.0
            for _ in xrange(action_repeat):
                train_step += 1
                for idx in xrange(mini_batch_size):
                    game_idx = random.randint(0, len(batch_states) - 1)
                    game_time_step_idx = random.randint(
                        0,
                        len(batch_states[game_idx]) - 1)
                    mini_batch_states[idx] = batch_states[game_idx][
                        game_time_step_idx]
                    mini_batch_actions[idx] = batch_actions[game_idx][
                        game_time_step_idx]
                    mini_batch_rewards[idx] = batch_rewards[game_idx][
                        game_time_step_idx]
                _, global_step_val, loss, acc = self.fit(mini_batch_states,
                                                         mini_batch_actions,
                                                         mini_batch_rewards,
                                                         fetch_info=True)
                avg_loss += loss
                avg_acc += acc
                # update epsilon
                if epsilon > final_epsilon:
                    epsilon -= (init_epsilon - final_epsilon) / explore
            avg_loss /= action_repeat
            avg_acc /= action_repeat
            batch_states.popleft()
            batch_actions.popleft()
            batch_rewards.popleft()

            global_step_val = int(global_step_val)
            elapsed_time = int(time.time() - start_time)
            logger.info(
                "train policy rl network, step=%d, epsilon=%.5f, loss=%.6f, acc=%.6f, time=%d(sec)"
                % (global_step_val, epsilon, avg_loss, avg_acc, elapsed_time))
            # save model
            if train_step % save_step == 0:
                params["global_step"], params[
                    "epsilon"] = global_step_val, epsilon
                self.param_serierlize(params)
                model_file = self.save_model("policy_rl",
                                             global_step=global_step_val)
                logger.info("save policy dl model, file=%s" % model_file)
                model_file = model_file[len(self.model_dir):]
                # add history model to pool
                model_pools.append(model_file)
                if len(model_pools) > max_model_pools:  # pop head when model pools exceed
                    model_pools.pop(0)
                logger.info("model pools has files: [%s]" %
                            (", ".join(model_pools)))
Example #12
def train(epochs=200):
    param_file = "%s/param.json" % train_dir
    params = param_unserierlize(param_file, init_params={"epoch": 0, "global_step": 0})
    global_epoch, global_step_val = int(params["epoch"]), int(params["global_step"])
    """Train for a number of steps."""
    with tf.Graph().as_default(), tf.device('/job:ps/task:0/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(global_step_val), trainable=False)

        # Calculate the learning rate schedule.
        num_batchs_per_epochs = corpus.num_batchs_per_epochs(BATCH_SIZE)
        print("num_batches_per_epoch: %d" % num_batchs_per_epochs)
        decay_steps = int(num_batchs_per_epochs * NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)

        # Calculate the gradients for each model tower.
        tower_grads = []
        tower_acc = []
        for i in xrange(len(CLUSTER_CONFIG["worker_hosts"])):
            gpu_device = CLUSTER_CONFIG["worker_hosts"][i][1]
            with tf.device('/job:worker/task:%d/%s' % (i, gpu_device)):
                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                    # all towers.
                    loss = tower_loss(scope, CLUSTER_CONFIG["worker_hosts"][i][2])

                    # all accuracy
                    tower_acc.append(tf.get_collection('accuracy', scope)[0])

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

                    # Calculate the gradients for the batch of data on this CIFAR tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

        # average accuracy
        accuracy = tf.add_n(tower_acc) / len(tower_acc)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)


        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.histogram_summary(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
        sess = tf.Session("grpc://" + CLUSTER_CONFIG["worker_hosts"][0][0], config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=True,
            gpu_options=gpu_options))
        sess.run(init)

        # restore model
        restore_model(sess, train_dir, saver)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        graph_def = sess.graph.as_graph_def(add_shapes=True)
        summary_writer = tf.train.SummaryWriter(train_dir,
                                                graph_def=graph_def)

        avg_loss, avg_acc = [0] * num_batchs_per_epochs, [0] * num_batchs_per_epochs
        epochs_step = global_epoch + 1
        step = 0
        while epochs_step <= (global_epoch + epochs):
            step += 1
            start_time = time.time()
            _, loss_value, acc_value, global_step_val = sess.run([train_op, loss, accuracy, global_step])
            elapsed_time = int((time.time() - start_time) * 1000)

            avg_loss[step % num_batchs_per_epochs] = loss_value
            avg_acc[step % num_batchs_per_epochs] = acc_value

            global_step_val = int(global_step_val)
            if global_step_val % 2 == 0:
                logger.info("train policy dl dist network, epoch=%d, step=%d, loss=%.6f, acc=%.6f, time=%d(ms)" % (
                    epochs_step, step, loss_value, acc_value, elapsed_time))

            if global_step_val % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if step > num_batchs_per_epochs:
                step = step % num_batchs_per_epochs
                epochs_step += 1
                average_loss = sum(avg_loss) / len(avg_loss)
                average_acc = sum(avg_acc) / len(avg_acc)

                logger.info("train policy dl dist network, epochs=%d, average_loss=%.7f, average_acc=%.7f" %
                            (epochs_step, average_loss, average_acc))
            # Save the model checkpoint periodically.
            if step % num_batchs_per_epochs == 0 and epochs_step % 20 == 0:
                param_serierlize(param_file, {"epoch": int(epochs_step), "global_step": int(global_step_val)})
                filename = save_model(sess, train_dir, saver,
                                      "policy_dl_epoch_%d" % epochs_step,
                                      global_step=global_step_val)
                logger.info("save policy dl dist model: %s" % filename)
Example #13
def train_policy_network_rl_distribute(args):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    # hosts
    ps_hosts = args.ps_hosts.split(",")
    worker_hosts = args.worker_hosts.split(",")
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=args.job_name,
                             task_index=args.task_index)
    if args.job_name == "ps":
        server.join()
    elif args.job_name == "worker":
        if args.policy_rl_reset:
            # empty old rl policy network
            if os.path.exists(args.policy_rl_models_dir):
                # os.removedirs(args.policy_rl_models_dir)
                shutil.rmtree(args.policy_rl_models_dir)
            os.makedirs(args.policy_rl_models_dir)
            # read parameters from DL policy network
            checkpoint = tf.train.get_checkpoint_state(
                args.policy_dl_models_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                model_file = checkpoint.model_checkpoint_path
            else:
                logger.error("not found policy dl model avaliable",
                             to_exit=True)
        else:
            model_file = None
        # init policy RL network
        policy_rl = PolicyRLNetwork(
            policy_planes,
            args,
            phase=args.policy_rl_phase,
            filters=args.policy_rl_filters,
            board_size=args.board_size,
            model_dir=args.policy_rl_models_dir,
            gpu=args.policy_rl_gpu,
            optimizer=args.policy_rl_optimizer,
            learn_rate=args.policy_rl_learn_rate,
            distributed_train=True,
        )
        init_op = tf.initialize_all_variables()
        summary_op = tf.merge_all_summaries()

        sv = tf.train.Supervisor(is_chief=(args.task_index == 0),
                                 logdir=policy_rl.model_dir,
                                 init_op=init_op,
                                 summary_op=summary_op,
                                 saver=policy_rl.saver,
                                 global_step=policy_rl.global_step,
                                 save_model_secs=0)
        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=tf.ConfigProto(
                                                  allow_soft_placement=True,
                                                  log_device_placement=True))
        sess.run(init_op)
        # Start queue runners for the input pipelines (if any).
        sv.start_queue_runners(sess)
        policy_rl.set_session(sess)
        if model_file is not None:
            policy_rl.saver.restore(sess, model_file)
            logger.info("load model file: %s" % model_file)
        else:
            policy_rl.restore_model()
        # load value network
        if args.policy_rl_phase > 1:
            value_dl = ValueNetwork(
                value_planes,
                phase=args.values_net_phase,
                filters=args.values_net_filters,
                board_size=args.board_size,
                model_dir=args.values_net_models_dir,
                gpu=args.values_net_gpu,
                optimizer=args.values_net_optimizer,
                learn_rate=args.values_net_learn_rate,
            )
        else:
            value_dl = None
        # train policy rl
        policy_rl.train_policy_network(value_dl,
                                       epochs=args.policy_rl_epochs,
                                       batch_games=args.policy_rl_batch_games,
                                       save_step=args.policy_rl_save_step)
Example #14
def load_model(args, model_type, model_file=None):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    pattern_features = args.pattern_features
    if model_type == "policy_dl":
        model = PolicyDLNetwork(
            policy_planes,
            corpus,
            args,
            filters=args.policy_dl_filters,
            board_size=args.board_size,
            model_dir=args.policy_dl_models_dir,
            device="gpu",
            gpu=args.policy_dl_gpu,
            optimizer=args.policy_dl_optimizer,
            learn_rate=args.policy_dl_learn_rate,
            distributed_train=False,
        )
    elif model_type == "policy_rollout":
        model = PolicyRolloutModel(
            policy_planes,
            patterns,
            args,
            board_size=args.board_size,
            model_dir=args.policy_rollout_models_dir,
            device="cpu",
            optimizer=args.policy_rollout_optimizer,
            learn_rate=args.policy_rollout_learn_rate,
            distributed_train=False,
        )
    elif model_type == "policy_rl":
        model = PolicyRLNetwork(
            policy_planes,
            args,
            phase=args.policy_rl_phase,
            filters=args.policy_rl_filters,
            board_size=args.board_size,
            model_dir=args.policy_rl_models_dir,
            device="cpu",
            optimizer=args.policy_rl_optimizer,
            learn_rate=args.policy_rl_learn_rate,
            distributed_train=False,
        )
    elif model_type == "value_net":
        model = ValueNetwork(
            value_planes,
            args,
            phase=args.values_net_phase,
            filters=args.values_net_filters,
            board_size=args.board_size,
            model_dir=args.values_net_models_dir,
            device="cpu",
            optimizer=args.values_net_optimizer,
            learn_rate=args.values_net_learn_rate,
        )
    else:
        logger.error("unsupported model type=%s" % model_type, to_exit=True)
    # init session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                               allow_soft_placement=True,
                                               gpu_options=gpu_options))
    session.run(tf.initialize_all_variables())
    model.set_session(session)
    # restore model
    status = model.restore_model(model_file=model_file)
    if not status and model_type == "policy_rl":
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        model_file = checkpoint.model_checkpoint_path
        logger.info("successful load model file: %s" % model_file)
        model.saver.restore(session, model_file)
    return model
Example #15
def train_policy_network_rl_distribute(args):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    # hosts
    ps_hosts = args.ps_hosts.split(",")
    worker_hosts = args.worker_hosts.split(",")
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=args.job_name,
                             task_index=args.task_index)
    if args.job_name == "ps":
        server.join()
    elif args.job_name == "worker":
        if args.policy_rl_reset:
            # empty old rl policy network
            if os.path.exists(args.policy_rl_models_dir):
                # os.removedirs(args.policy_rl_models_dir)
                shutil.rmtree(args.policy_rl_models_dir)
            os.makedirs(args.policy_rl_models_dir)
            # read parameters from DL policy network
            checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                model_file = checkpoint.model_checkpoint_path
            else:
                logger.error("not found policy dl model avaliable", to_exit=True)
        else:
            model_file = None
        # init policy RL network
        policy_rl = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase, filters=args.policy_rl_filters,
                                    board_size=args.board_size,
                                    model_dir=args.policy_rl_models_dir, gpu=args.policy_rl_gpu,
                                    optimizer=args.policy_rl_optimizer, learn_rate=args.policy_rl_learn_rate,
                                    distributed_train=True,
                                    )
        init_op = tf.initialize_all_variables()
        summary_op = tf.merge_all_summaries()

        sv = tf.train.Supervisor(is_chief=(args.task_index == 0),
                                 logdir=policy_rl.model_dir,
                                 init_op=init_op,
                                 summary_op=summary_op,
                                 saver=policy_rl.saver,
                                 global_step=policy_rl.global_step,
                                 save_model_secs=0)
        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=tf.ConfigProto(allow_soft_placement=True,
                                                                    log_device_placement=True)
                                              )
        sess.run(init_op)
        # Start queue runners for the input pipelines (if any).
        sv.start_queue_runners(sess)
        policy_rl.set_session(sess)
        if model_file is not None:
            policy_rl.saver.restore(sess, model_file)
            logger.info("load model file: %s" % model_file)
        else:
            policy_rl.restore_model()
        # load value network
        if args.policy_rl_phase > 1:
            value_dl = ValueNetwork(value_planes, phase=args.values_net_phase, filters=args.values_net_filters,
                                    board_size=args.board_size,
                                    model_dir=args.values_net_models_dir, gpu=args.values_net_gpu,
                                    optimizer=args.values_net_optimizer, learn_rate=args.values_net_learn_rate,
                                    )
        else:
            value_dl = None
        # train policy rl
        policy_rl.train_policy_network(value_dl, epochs=args.policy_rl_epochs,
                                       batch_games=args.policy_rl_batch_games,
                                       save_step=args.policy_rl_save_step)
Example #16
def train_rl_network(batch_games=128,
                     save_step=10000,
                     max_model_pools=5,
                     init_epsilon=0.5,
                     final_epsilon=0.01,
                     explore=1000000,
                     action_repeat=32,
                     mini_batch_size=64):
    """
        data set from self-play
    :return:
    """
    args = parser_argument().parse_args()
    rpc = ModelRPC(args)
    game = RenjuGame()
    batch_states, batch_actions, batch_rewards = deque(), deque(), deque()
    mini_batch_states = [0] * mini_batch_size
    mini_batch_actions = [0] * mini_batch_size
    mini_batch_rewards = [0] * mini_batch_size
    model_pools = []
    param_file = "%s/param.json" % train_dir
    params = param_unserierlize(param_file,
                                init_params={
                                    "global_step": 0,
                                    "epsilon": init_epsilon
                                })
    global_step_val, epsilon = params["global_step"], params["epsilon"]
    # load model
    sess, saver, summary_writer, train_op, loss, accuracy, global_step, lr, tower_feeds, tower_logits = network()
    train_step = 0
    while True:
        start_time = time.time()
        # choose policy network for opponent player from model pools
        if train_step % 10 == 0:
            if len(model_pools) > 0:
                model_file = random.choice(model_pools)
            else:
                model_file = None
            rpc.switch_model("policy_rl", model_file=model_file)
        while len(batch_states) < batch_games:
            # opponent_policy = self.load_history_policy_model(model_file)
            black_opponent = random.choice([True, False])
            # reset game
            game.reset_game()
            # simulate game by current parameter
            states, actions, rewards = [], [], []
            state = game.step_games(None)
            while True:  # loop current game
                # self-play, current model V.S. history model
                if (black_opponent and game.player == RenjuGame.PLAYER_BLACK) \
                        or (not black_opponent and game.player == RenjuGame.PLAYER_WHITE):
                    predict_probs = rpc.policy_rl_rpc(
                        board_to_stream(game.board), game.get_player_name())
                else:  # current player
                    predict_probs = sess.run(
                        [tower_logits[0]],
                        feed_dict={tower_feeds[0][0]: [state]})[0][0]
                if random.random() < epsilon:  # random choose action
                    action = game.weighted_choose_action(predict_probs)
                else:
                    action = game.choose_action(predict_probs)
                if action is None:
                    final_reward = 0
                    break
                # step game
                state_n, reward_n, terminal_n = game.step_games(action)
                # store (state, action)
                states.append(state)
                actions.append(action)
                # set new states
                state = state_n
                if terminal_n:
                    final_reward = reward_n
                    break
                # check whether game drawn
                if game.random_action() is None:  # game drawn, equal end, reward=0
                    final_reward = 0
                    logger.info("game drawn, so amazing...")
                    break
            # store (reward)
            for step in xrange(len(states)):
                if step % 2 == 0:
                    rewards.append(final_reward)
                else:
                    rewards.append(-final_reward)
            # store states of ith game
            batch_states.append(states)
            batch_actions.append(actions)
            batch_rewards.append(rewards)
        # fit model by mini batch
        avg_loss, avg_acc = 0.0, 0.0
        for _ in xrange(action_repeat / gpu_num):
            train_step += 1
            feeds = {}
            for gpu_id in xrange(gpu_num):
                for idx in xrange(mini_batch_size):
                    game_idx = random.randint(0, len(batch_states) - 1)
                    game_time_step_idx = random.randint(
                        0,
                        len(batch_states[game_idx]) - 1)
                    mini_batch_states[idx] = batch_states[game_idx][
                        game_time_step_idx]
                    mini_batch_actions[idx] = batch_actions[game_idx][
                        game_time_step_idx]
                    mini_batch_rewards[idx] = batch_rewards[game_idx][
                        game_time_step_idx]
                feeds[tower_feeds[gpu_id][0]] = mini_batch_states
                feeds[tower_feeds[gpu_id][1]] = mini_batch_actions
                feeds[tower_feeds[gpu_id][2]] = mini_batch_rewards
            _, global_step_val, loss_val, acc_val = sess.run(
                [train_op, global_step, loss, accuracy], feed_dict=feeds)
            avg_loss += loss_val
            avg_acc += acc_val
            # update epsilon
            if epsilon > final_epsilon:
                epsilon -= (init_epsilon - final_epsilon) / explore
        avg_loss /= action_repeat
        avg_acc /= action_repeat
        batch_states.popleft()
        batch_actions.popleft()
        batch_rewards.popleft()

        global_step_val = int(global_step_val)
        elapsed_time = int(time.time() - start_time)
        logger.info(
            "train policy rl network, step=%d, epsilon=%.5f, loss=%.6f, acc=%.6f, time=%d(sec)"
            % (train_step, epsilon, avg_loss, avg_acc, elapsed_time))
        # save model
        if train_step % save_step == 0:
            params["global_step"], params["epsilon"] = global_step_val, epsilon
            param_serierlize(param_file, params)
            model_file = save_model(sess,
                                    train_dir,
                                    saver,
                                    "policy_rl_step_%d" % train_step,
                                    global_step=global_step_val)
            logger.info("save policy rl model, file=%s" % model_file)
            model_file = model_file[len(train_dir):]
            # add history model to pool
            model_pools.append(model_file)
            if len(model_pools) > max_model_pools:  # pop head when model pools exceed
                model_pools.pop(0)
            logger.info("model pools has files: [%s]" %
                        (", ".join(model_pools)))
Example #17
def train_rl_network(batch_games=128, save_step=10000,
                     max_model_pools=5, init_epsilon=0.5, final_epsilon=0.01, explore=1000000,
                     action_repeat=32, mini_batch_size=64):
    """
        data set from self-play
    :return:
    """
    args = parser_argument().parse_args()
    rpc = ModelRPC(args)
    game = RenjuGame()
    batch_states, batch_actions, batch_rewards = deque(), deque(), deque()
    mini_batch_states = [0] * mini_batch_size
    mini_batch_actions = [0] * mini_batch_size
    mini_batch_rewards = [0] * mini_batch_size
    model_pools = []
    param_file = "%s/param.json" % train_dir
    params = param_unserierlize(param_file, init_params={"global_step": 0, "epsilon": init_epsilon})
    global_step_val, epsilon = params["global_step"], params["epsilon"]
    # load model
    sess, saver, summary_writer, train_op, loss, accuracy, global_step, lr, tower_feeds, tower_logits = network()
    train_step = 0
    while True:
        start_time = time.time()
        # choose policy network for opponent player from model pools
        if train_step % 10 == 0:
            if len(model_pools) > 0:
                model_file = random.choice(model_pools)
            else:
                model_file = None
            rpc.switch_model("policy_rl", model_file=model_file)
        while len(batch_states) < batch_games:
            # opponent_policy = self.load_history_policy_model(model_file)
            black_opponent = random.choice([True, False])
            # reset game
            game.reset_game()
            # simulate game by current parameter
            states, actions, rewards = [], [], []
            state = game.step_games(None)
            while True:  # loop current game
                # self-play, current model V.S. history model
                if (black_opponent and game.player == RenjuGame.PLAYER_BLACK) \
                        or (not black_opponent and game.player == RenjuGame.PLAYER_WHITE):
                    predict_probs = rpc.policy_rl_rpc(board_to_stream(game.board), game.get_player_name())
                else:  # current player
                    predict_probs = sess.run([tower_logits[0]], feed_dict={tower_feeds[0][0]: [state]})[0][0]
                if random.random() < epsilon:  # random choose action
                    action = game.weighted_choose_action(predict_probs)
                else:
                    action = game.choose_action(predict_probs)
                if action is None:
                    final_reward = 0
                    break
                # step game
                state_n, reward_n, terminal_n = game.step_games(action)
                # store (state, action)
                states.append(state)
                actions.append(action)
                # set new states
                state = state_n
                if terminal_n:
                    final_reward = reward_n
                    break
                # check whether game drawn
                if game.random_action() is None:  # game drawn, no winner, reward=0
                    final_reward = 0
                    logger.info("game drawn, so amazing...")
                    break
            # store (reward)
            for step in xrange(len(states)):
                if step % 2 == 0:
                    rewards.append(final_reward)
                else:
                    rewards.append(-final_reward)
            # store states of ith game
            batch_states.append(states)
            batch_actions.append(actions)
            batch_rewards.append(rewards)
        # fit model by mini batch
        avg_loss, avg_acc = 0.0, 0.0
        for _ in xrange(action_repeat / gpu_num):
            train_step += 1
            feeds = {}
            for gpu_id in xrange(gpu_num):
                for idx in xrange(mini_batch_size):
                    game_idx = random.randint(0, len(batch_states) - 1)
                    game_time_step_idx = random.randint(0, len(batch_states[game_idx]) - 1)
                    mini_batch_states[idx] = batch_states[game_idx][game_time_step_idx]
                    mini_batch_actions[idx] = batch_actions[game_idx][game_time_step_idx]
                    mini_batch_rewards[idx] = batch_rewards[game_idx][game_time_step_idx]
                feeds[tower_feeds[gpu_id][0]] = mini_batch_states
                feeds[tower_feeds[gpu_id][1]] = mini_batch_actions
                feeds[tower_feeds[gpu_id][2]] = mini_batch_rewards
            _, global_step_val, loss_val, acc_val = sess.run([train_op, global_step, loss, accuracy], feed_dict=feeds)
            avg_loss += loss_val
            avg_acc += acc_val
            # update epsilon
            if epsilon > final_epsilon:
                epsilon -= (init_epsilon - final_epsilon) / explore
        avg_loss /= action_repeat
        avg_acc /= action_repeat
        batch_states.popleft()
        batch_actions.popleft()
        batch_rewards.popleft()

        global_step_val = int(global_step_val)
        elapsed_time = int(time.time() - start_time)
        logger.info(
            "train policy rl network, step=%d, epsilon=%.5f, loss=%.6f, acc=%.6f, time=%d(sec)" %
            (train_step, epsilon, avg_loss, avg_acc, elapsed_time))
        # save model
        if train_step % save_step == 0:
            params["global_step"], params["epsilon"] = global_step_val, epsilon
            param_serierlize(param_file, params)
            model_file = save_model(sess, train_dir, saver,
                                    "policy_rl_step_%d" % train_step,
                                    global_step=global_step_val)
            logger.info("save policy rl model, file=%s" % model_file)
            model_file = model_file[len(train_dir):]
            # add history model to pool
            model_pools.append(model_file)
            if len(model_pools) > max_model_pools:  # pop the oldest model when the pool exceeds its limit
                model_pools.pop(0)
            logger.info("model pool has files: [%s]" % (", ".join(model_pools)))
Exemple #18
0
def train(epochs=200, predict=False):
    param_file = "%s/param.json" % train_dir
    params = param_unserierlize(param_file, init_params={"epoch": 0, "global_step": 0})
    global_epoch, global_step_val = int(params["epoch"]), int(params["global_step"])
    """Train for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. Each call
        # processes gpu_num batches, one per tower.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(global_step_val), trainable=False)

        # Calculate the learning rate schedule.
        num_batchs_per_epochs = int(corpus.num_batchs_per_epochs(BATCH_SIZE))
        print("num_batches_per_epoch: %d" % num_batchs_per_epochs)
        decay_steps = int(num_batchs_per_epochs / gpu_num * NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        # opt = tf.train.GradientDescentOptimizer(lr)
        opt = tf.train.AdamOptimizer(lr)

        # Calculate the gradients for each model tower.
        tower_grads = []
        tower_acc = []
        tower_feeds = []
        for i in xrange(gpu_num):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                    # all towers.
                    batch_input = tf.placeholder(tf.float32, [None, 15*15*planes])
                    batch_labels = tf.placeholder(tf.float32, shape=[None])
                    tower_feeds.append((batch_input, batch_labels))
                    loss = tower_loss(scope, batch_input, batch_labels)

                    # all accuracy
                    tower_acc.append(tf.get_collection('accuracy', scope)[0])

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

                    # Calculate the gradients for the batch of data on this tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

        # average accuracy
        accuracy = tf.add_n(tower_acc) / len(tower_acc)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)


        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.histogram_summary(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        # saver = tf.train.Saver(tf.all_variables())
        saver = tf.train.Saver()

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=GPU_MEMERY_ALLOCATE)
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            gpu_options=gpu_options)
        )
        sess.run(init)

        # restore model
        restore_model(sess, train_dir, saver)
        if predict:
            return sess, saver

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        graph_def = sess.graph.as_graph_def(add_shapes=True)
        summary_writer = tf.train.SummaryWriter(train_dir,
                                                graph_def=graph_def)

        avg_loss, avg_acc = [], []
        epochs_step = global_epoch + 1
        step = 0
        while epochs_step <= (global_epoch + epochs):
            step += gpu_num
            start_time = time.time()
            # _, loss_value, acc_value, global_step_val = sess.run([train_op, loss, accuracy, global_step])
            feeds = {}
            for idx in xrange(gpu_num):
                samples = corpus.next_fetch_rows(BATCH_SIZE)
                feature = np.array([sample[0].get_states(flatten=True) for sample in samples], dtype=np.float32)
                labels = np.array([sample[1] for sample in samples], dtype=np.float32)
                feeds[tower_feeds[idx][0]] = feature
                feeds[tower_feeds[idx][1]] = labels
            _, loss_value, acc_value, global_step_val, learn_rating = sess.run(
                [train_op, loss, accuracy, global_step, lr],
                feed_dict=feeds)
            elapsed_time = int((time.time() - start_time) * 1000)

            avg_loss.append(loss_value)
            avg_acc.append(acc_value)

            global_step_val = int(global_step_val)
            if global_step_val % 10 == 0:
                logger.info(
                    "train policy rollout multi_GPU network, epoch=%d, step=%d, loss=%.6f, acc=%.6f, lr=%.6f, time=%d(ms)" % (
                        epochs_step, step, loss_value, acc_value, learn_rating, elapsed_time))

            # if global_step_val % 100 == 0:
            #     summary_str = sess.run(summary_op)
            #     summary_writer.add_summary(summary_str, step)
            if step > num_batchs_per_epochs:
                step = step % num_batchs_per_epochs
                epochs_step += 1
                average_loss = sum(avg_loss) / len(avg_loss)
                average_acc = sum(avg_acc) / len(avg_acc)
                avg_loss, avg_acc = [], []

                logger.info("train policy rollout multi_GPU network, epochs=%d, average_loss=%.7f, average_acc=%.7f" %
                            (epochs_step, average_loss, average_acc))
                # Save the model checkpoint periodically.
                if epochs_step % 5 == 0:
                    param_serierlize(param_file, {"epoch": int(epochs_step), "global_step": int(global_step_val)})
                    filename = save_model(sess, train_dir, saver,
                                          "policy_rollout_epoch_%d" % epochs_step,
                                          global_step=global_step_val)
                    logger.info("save policy rollout multi_GPU model: %s" % filename)
Exemple #19
0
def train(epochs=200):
    param_file = "%s/param.json" % train_dir
    params = param_unserierlize(param_file, init_params={"epoch": 0, "global_step": 0})
    global_epoch, global_step_val = int(params["epoch"]), int(params["global_step"])
    """Train for a number of steps."""
    with tf.Graph().as_default(), tf.device('/job:ps/task:0/cpu:0'):
        # Create a variable to count the number of train() calls. Each call
        # processes one batch per worker tower.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(global_step_val),
            trainable=False)

        # Calculate the learning rate schedule.
        num_batchs_per_epochs = corpus.num_batchs_per_epochs(BATCH_SIZE)
        print("num_batches_per_epoch: %d" % num_batchs_per_epochs)
        decay_steps = int(num_batchs_per_epochs * NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)

        # Calculate the gradients for each model tower.
        tower_grads = []
        tower_acc = []
        for i in xrange(len(CLUSTER_CONFIG["worker_hosts"])):
            gpu_device = CLUSTER_CONFIG["worker_hosts"][i][1]
            with tf.device('/job:worker/task:%d/%s' % (i, gpu_device)):
                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                    # all towers.
                    loss = tower_loss(scope,
                                      CLUSTER_CONFIG["worker_hosts"][i][2])

                    # all accuracy
                    tower_acc.append(tf.get_collection('accuracy', scope)[0])

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)

                    # Calculate the gradients for the batch of data on this tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

        # average accuracy
        accuracy = tf.add_n(tower_acc) / len(tower_acc)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.histogram_summary(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
        sess = tf.Session("grpc://" + CLUSTER_CONFIG["worker_hosts"][0][0],
                          config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=True,
                                                gpu_options=gpu_options))
        sess.run(init)

        # restore model
        restore_model(sess, train_dir, saver)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        graph_def = sess.graph.as_graph_def(add_shapes=True)
        summary_writer = tf.train.SummaryWriter(train_dir, graph_def=graph_def)

        avg_loss, avg_acc = [0] * num_batchs_per_epochs, [0] * num_batchs_per_epochs
        epochs_step = global_epoch + 1
        step = 0
        while epochs_step <= (global_epoch + epochs):
            step += 1
            start_time = time.time()
            _, loss_value, acc_value, global_step_val = sess.run(
                [train_op, loss, accuracy, global_step])
            elapsed_time = int((time.time() - start_time) * 1000)

            avg_loss[step % num_batchs_per_epochs] = loss_value
            avg_acc[step % num_batchs_per_epochs] = acc_value

            global_step_val = int(global_step_val)
            if global_step_val % 2 == 0:
                logger.info(
                    "train policy dl dist network, epoch=%d, step=%d, loss=%.6f, acc=%.6f, time=%d(ms)"
                    % (epochs_step, step, loss_value, acc_value, elapsed_time))

            if global_step_val % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if step > num_batchs_per_epochs:
                step = step % num_batchs_per_epochs
                epochs_step += 1
                average_loss = sum(avg_loss) / len(avg_loss)
                average_acc = sum(avg_acc) / len(avg_acc)

                logger.info(
                    "train policy dl dist network, epochs=%d, average_loss=%.7f, average_acc=%.7f"
                    % (epochs_step, average_loss, average_acc))
            # Save the model checkpoint periodically.
            if step % num_batchs_per_epochs == 0 and epochs_step % 20 == 0:
                param_serierlize(param_file, {
                    "epoch": int(epochs_step),
                    "global_step": int(global_step_val)
                })
                filename = save_model(sess,
                                      train_dir,
                                      saver,
                                      "policy_dl_epoch_%d" % epochs_step,
                                      global_step=global_step_val)
                logger.info("save policy dl dist model: %s" % filename)