Code example #1
File: validation.py  Project: JRConti/TapnSwap-RL
    def delete_temp(self, old_model, temp_model):
        """
    Delete temporary files when 2 versions of the same model exist
    (trained for different lengths of time). Keep the best model and
    delete the rest.

    Parameters
    ----------
    old_model: string
      Model filename before training.
    temp_model: string
      Temporary model filename (after training).

    Return
    ------
    use_training: boolean
      False only if the old model wins against the new temp model.
      True otherwise.
    """

        use_training = True
        # Several versions of same model
        if (old_model is not None) and (temp_model == old_model + '_temp'):
            # Confront them
            agent1 = RLAgent()
            agent1.load_model(old_model)
            agent2 = RLAgent()
            agent2.load_model(temp_model)
            results = compare_agents(agent1,
                                     agent2,
                                     n_games=10,
                                     time_limit=100,
                                     verbose=False)

            # Keep best
            if results[3] >= results[2]:
                # More trained agent is the best
                os.remove('Models/' + old_model + '.csv')
                os.remove('Models/data/count_' + old_model + '.csv')
                os.rename(r'Models/' + temp_model + '.csv',
                          r'Models/' + old_model + '.csv')
                os.rename(r'Models/data/count_' + temp_model + '.csv',
                          r'Models/data/count_' + old_model + '.csv')
            else:
                # Less trained agent is the best
                os.remove('Models/' + temp_model + '.csv')
                os.remove('Models/data/count_' + temp_model + '.csv')
                use_training = False
        return use_training
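
Stripped of the file handling, the decision rule above reduces to a comparison of the two scores returned by compare_agents; here is a minimal, self-contained sketch with made-up numbers (only the '_temp' naming convention and the meaning of indices 2 and 3 come from the code above):

# Sketch of the decision rule with hypothetical compare_agents output.
old_model, temp_model = 'greedy0_2_vsRandom', 'greedy0_2_vsRandom_temp'
results = [10, 10, 4, 6]   # made-up: results[2] = old agent's score, results[3] = retrained agent's score
if results[3] >= results[2]:
    kept, removed = temp_model, old_model   # retrained agent wins: it replaces the old files
else:
    kept, removed = old_model, temp_model   # old agent wins: the temporary files are deleted
print('keep Models/{}.csv, remove Models/{}.csv'.format(kept, removed))
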
Code example #2
def getAgentsAndAdjudicator():
    agents = [None, None]

    idx = np.random.randint(2)

    agents[idx] = RLAgent(idx + 1, None)
    agents[1 - idx], adjudicator = getOpponentRandomly(2 - idx)

    # print("RL Agent: (%d, %d) Opponent: (%d, %d)" %(idx, idx+1, 1-idx, 2-idx))
    return agents, adjudicator, idx + 1
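
A call site might look like the following; the variable names and the print are illustrative assumptions, only the return signature comes from the function above:

# Illustrative call: the RL agent is randomly seated as player 1 or 2,
# and the third return value is that seat number.
agents, adjudicator, rl_player_number = getAgentsAndAdjudicator()
print('RL agent plays as player', rl_player_number)
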
Code example #3
File: main.py  Project: G-Flicker/HelloWorld
def main(args, prt):
    config = tf.ConfigProto()  # TensorFlow session configuration object
    config.gpu_options.allow_growth = True  # allocate GPU memory on demand instead of reserving it all up front
    sess = tf.Session(config=config)

    # Load the task-specific components
    DataGenerator, Env, reward_func, AttentionActor, AttentionCritic = \
        load_task_specific_components(args['task_name'])

    dataGen = DataGenerator(args)
    dataGen.reset()
    env = Env(args)

    # Create an RL (reinforcement learning) agent
    agent = RLAgent(args,
                    prt,
                    env,
                    dataGen,
                    reward_func,
                    AttentionActor,
                    AttentionCritic,
                    is_train=args['is_train'])
    agent.Initialize(sess)

    # Train or evaluate
    start_time = time.time()  # record the start time to measure total runtime
    if args['is_train']:  # training mode
        prt.print_out('Training started')
        train_time_beg = time.time()  # start time of the current logging window
        for step in range(args['n_train']):
            summary = agent.run_train_step()
            _, _, actor_loss_val, critic_loss_val, actor_gra_and_var_val, critic_gra_and_var_val, \
                R_val, v_val, logprobs_val, probs_val, actions_val, idxs_val = summary

            if step % args['save_interval'] == 0:
                agent.saver.save(sess, args['model_dir'] + '/model.ckpt',
                                 global_step=step)  # sess is the TensorFlow session created above

            if step % args['log_interval'] == 0:
                train_time_end = time.time() - train_time_beg
                prt.print_out('Train step: {} -- time: {} -- train reward: {} -- value: {}'
                              .format(step,
                                      time.strftime("%H:%M:%S", time.gmtime(train_time_end)),
                                      np.mean(R_val), np.mean(v_val)))
                prt.print_out('    actor loss: {} -- critic loss: {}'
                              .format(np.mean(actor_loss_val), np.mean(critic_loss_val)))
                train_time_beg = time.time()
            if step % args['test_interval'] == 0:
                agent.inference(args['infer_type'])

    else:  # otherwise, run inference / evaluation
        prt.print_out('Evaluation started')
        agent.inference(args['infer_type'])

    prt.print_out('Total time: {}'.format(
        time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))))  # report total runtime when finished
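
The args mapping is consumed like a plain dict; as a rough sketch, the keys referenced in main() are listed below. Every value is a placeholder for illustration, not the project's real defaults, and DataGenerator, Env and RLAgent read additional task-specific keys not shown here:

# Keys read by main(); all values are placeholders.
example_args = {
    'task_name': 'vrp10',      # hypothetical task identifier for load_task_specific_components
    'is_train': True,          # True: train, False: run inference only
    'n_train': 100000,         # number of training steps
    'save_interval': 1000,     # checkpoint every N steps
    'log_interval': 200,       # print training stats every N steps
    'test_interval': 1000,     # run inference every N steps
    'infer_type': 'batch',     # hypothetical inference mode passed to agent.inference
    'model_dir': './model',    # directory for model.ckpt checkpoints
}
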
Code example #4
def main():
    env = Environment(config2)
    from agent import actions
    state_size = (config2.vision_range*2 + 1)**2 * config2.color_num_dims + config2.scent_num_dims + len(actions)
    agent = RLAgent(env, state_size=state_size)

    optimizer = optim.Adam(agent.policy.parameters(),
        lr=agent_config['learning_rate'])

    setup_output_dir()
    train(agent, env, [0, 1, 2, 3], optimizer)
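
As a worked example of the state_size formula above (the values are made up, not config2's actual settings):

# With vision_range=5, color_num_dims=3, scent_num_dims=3 and 4 actions:
vision_range, color_num_dims, scent_num_dims, num_actions = 5, 3, 3, 4
state_size = (vision_range * 2 + 1) ** 2 * color_num_dims + scent_num_dims + num_actions
print(state_size)  # (11 ** 2) * 3 + 3 + 4 = 370
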
Code example #5
File: test.py  Project: amalnanavati/NELQlearning
def main():
    env = Environment(config2)
    state_size = (config2.vision_range * 2 +
                  1)**2 * config2.color_num_dims + config2.scent_num_dims
    agent = RLAgent(env, state_size=state_size)
    agent._load("outputs/models/NELQ_190000")
    # ._load("NELQ.model")

    # optimizer = optim.Adam(agent.policy.parameters())
    # print list(agent.policy.parameters())
    test(agent, env)
Code example #6
File: test.py  Project: clairvoyant/Tic-Tac-Toe
import torch
import random
from game import Game
from agent import RLAgent
from stupid_ai import StupidAI

game = Game()
agent = RLAgent()
stupid_ai = StupidAI()
agent.state_values = torch.load('./model.pth')


def check_board():
    win_or_tie = True
    if game.who_wins() == -1:  # Human win
        print('==================> You win!')
        game.clear()
    elif game.who_wins() == 1:  # AI win
        print('==================> You lose..')
        game.clear()
    elif game.who_wins() == 2:  # Tie
        print('==================> Tie!')
        game.clear()
    else:
        win_or_tie = False
    return win_or_tie


while True:
    x, y = stupid_ai.human_move()
    if game.board[y, x] != 0:
Code example #7
File: validation.py  Project: JRConti/TapnSwap-RL
    def tournament(self, change_opp=False):
        """
    Method to rank the different models obtained after training.
    For each value of the epsilon factor of an RL Agent declared in
    the init method, this method enters the corresponding model into
    a tournament. Each model plays 10 games against every other
    model, and the pairwise scores are stored in a CSV file. A TXT
    file is also generated from the CSV file: it ranks the models,
    alongside each model's total score against all others.

    Parameter
    ---------
    change_opp: boolean
      Set to True to also include agents trained against mixed
      opponents in the tournament.

    Outputs
    -------
    Tournament report: CSV file
      Located at: 'Models/results/(self.tournament_name).csv'.
      File storing the results of each confrontation between 2 
      agents.
    Tournament ranking: TXT file
      Located at: 'Models/results/(self.tournament_name).txt'.
      File ranking the agents using the results of the tournament,
      with total score of each agent displayed.
    """

        n_players = len(self.epsilon_values) * (
            (int(self.random_training) + int(self.self_training)) *
            (1 + int(change_opp)))

        print('-----------------------------')
        print('TOURNAMENT with {} agents'.format(n_players))
        print('-----------------------------\n')

        # Initialization of scores: some rows and columns are
        # only used for saving configurations of models
        # (epsilon, opponents, change of opponent).
        scores = -np.ones((n_players + 3, n_players + 3))

        # List of opponent kinds
        training_ways = []
        if self.random_training:
            training_ways.append('Random')
            if change_opp:
                training_ways.append('RandomvsSelf')
        if self.self_training:
            training_ways.append('Self')
            if change_opp:
                training_ways.append('SelfvsRandom')

        # List of players
        players = [[epsilon, training_way] for epsilon in self.epsilon_values
                   for training_way in training_ways]

        for idx1, player1 in enumerate(players):
            epsilon1 = player1[0]
            training_way1 = player1[1]
            filename = ('greedy' + str(epsilon1)[0] + '_' + str(epsilon1)[2:] +
                        '_vs' + training_way1)

            # Load first agent
            agent1 = RLAgent()
            agent1.load_model(filename)

            # Save config of agent1
            scores[idx1 + 3, 0] = epsilon1
            # 0: RANDOM | 1: SELF
            scores[idx1 + 3, 1] = (int(training_way1 == 'Self') +
                                   int(training_way1 == 'SelfvsRandom'))
            # -1: nothing | 0: Random vs Self | 1: Self vs Random
            scores[idx1 + 3,
                   2] = -1 + (2 * int(training_way1 == 'SelfvsRandom') +
                              int(training_way1 == 'RandomvsSelf'))

            for idx2, player2 in enumerate(players):
                epsilon2 = player2[0]
                training_way2 = player2[1]
                filename = ('greedy' + str(epsilon2)[0] + '_' +
                            str(epsilon2)[2:] + '_vs' + training_way2)

                # Load second agent
                agent2 = RLAgent()
                agent2.load_model(filename)

                # Save config of agent2
                scores[0, idx2 + 3] = epsilon2
                scores[1, idx2 + 3] = (int(training_way2 == 'Self') +
                                       int(training_way2 == 'SelfvsRandom'))
                scores[2, idx2 +
                       3] = -1 + (2 * int(training_way2 == 'SelfvsRandom') +
                                  int(training_way2 == 'RandomvsSelf'))

                print('Current match:')
                print('Player1: epsilon = {}, trained vs {}'.format(
                    epsilon1, training_way1))
                print('Player2: epsilon = {}, trained vs {}'.format(
                    epsilon2, training_way2))

                results = compare_agents(agent1,
                                         agent2,
                                         n_games=10,
                                         time_limit=100,
                                         verbose=False)

                # Score of agent1
                scores[idx1 + 3, idx2 + 3] = results[2]
                # Score of agent2
                scores[idx2 + 3, idx1 + 3] = results[3]

                print('------')

        # Update tournament file name
        name = self.tournament_name[:-1]
        nbr = int(self.tournament_name[-1])
        nbr += 1
        self.tournament_name = name + str(nbr)

        # Save tournament
        np.savetxt(str('Models/results/' + self.tournament_name + '.csv'),
                   scores,
                   delimiter=',')

        # Rank players
        self.tournament_ranking(self.tournament_name, self.tournament_name)

        print('Results of tournament are stored in {}.csv and {}.txt\n'.format(
            self.tournament_name, self.tournament_name))
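
Each participant is loaded from a CSV file whose name encodes its epsilon value and its training opponent; here is a quick, self-contained sketch of that naming scheme (the Models/ prefix comes from the other examples in this project):

# Filename convention used when loading tournament participants.
epsilon, training_way = 0.2, 'Random'
filename = 'greedy' + str(epsilon)[0] + '_' + str(epsilon)[2:] + '_vs' + training_way
print(filename)  # -> greedy0_2_vsRandom, i.e. Models/greedy0_2_vsRandom.csv
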
Code example #8
File: game.py  Project: ahmad-asadi/TicTacToeTDL
                    rl_agent.win()
                else:
                    # print("Draw!")
                    rl_agent.lose()
                break

            board.print()

            action = int(input())-1
            board.update_state(player="O", action=action)
            done = board.check()
            if done != 2:
                # if done == -1:
                #     print("Game Over!")
                # elif done == 0:
                #     print("Draw!")
                rl_agent.lose()
                break

            board.print()
        board.print()
    print("V: ", rl_agent.wins, ", L: ", rl_agent.loses," D: ", experiments-rl_agent.loses-rl_agent.wins)

board = Board(print_map=False)
rl_agent = RLAgent(_id="X")
opponent = Agent(_id="O")

train(board, rl_agent, opponent)
rl_agent.another_opponent()
test(board, rl_agent)
Code example #9
def train(config):
    # train env
    print('Setting up TextWorld environment...')
    batch_size = config['training']['scheduling']['batch_size']
    env_id = gym_textworld.make_batch(env_id=config['general']['env_id'],
                                      batch_size=batch_size,
                                      parallel=True)
    env = gym.make(env_id)
    env.seed(config['general']['random_seed'])

    # valid and test env
    run_test = config['general']['run_test']
    if run_test:
        test_batch_size = config['training']['scheduling']['test_batch_size']
        # valid
        valid_env_name = config['general']['valid_env_id']

        valid_env_id = gym_textworld.make_batch(env_id=valid_env_name,
                                                batch_size=test_batch_size,
                                                parallel=True)
        valid_env = gym.make(valid_env_id)
        valid_env.seed(config['general']['random_seed'])

        # test
        test_env_name_list = config['general']['test_env_id']
        assert isinstance(test_env_name_list, list)

        test_env_id_list = [
            gym_textworld.make_batch(env_id=item,
                                     batch_size=test_batch_size,
                                     parallel=True)
            for item in test_env_name_list
        ]
        test_env_list = [
            gym.make(test_env_id) for test_env_id in test_env_id_list
        ]
        for i in range(len(test_env_list)):
            test_env_list[i].seed(config['general']['random_seed'])
    print('Done.')

    # Set the random seed manually for reproducibility.
    np.random.seed(config['general']['random_seed'])
    torch.manual_seed(config['general']['random_seed'])
    if torch.cuda.is_available():
        if not config['general']['use_cuda']:
            logger.warning(
                "WARNING: CUDA device detected but 'use_cuda: false' found in config.yaml"
            )
        else:
            torch.backends.cudnn.deterministic = True
            torch.cuda.manual_seed(config['general']['random_seed'])
    else:
        config['general']['use_cuda'] = False  # Disable CUDA.
    revisit_counting = config['general']['revisit_counting']
    replay_batch_size = config['general']['replay_batch_size']
    replay_memory_capacity = config['general']['replay_memory_capacity']
    replay_memory_priority_fraction = config['general'][
        'replay_memory_priority_fraction']

    word_vocab = dict2list(env.observation_space.id2w)
    word2id = {}
    for i, w in enumerate(word_vocab):
        word2id[w] = i

    # collect all nouns
    verb_list = ["go", "take"]
    object_name_list = ["east", "west", "north", "south", "coin"]
    verb_map = [word2id[w] for w in verb_list if w in word2id]
    noun_map = [word2id[w] for w in object_name_list if w in word2id]
    agent = RLAgent(
        config,
        word_vocab,
        verb_map,
        noun_map,
        replay_memory_capacity=replay_memory_capacity,
        replay_memory_priority_fraction=replay_memory_priority_fraction)

    init_learning_rate = config['training']['optimizer']['learning_rate']

    exp_dir = get_experiment_dir(config)
    summary = SummaryWriter(exp_dir)

    parameters = filter(lambda p: p.requires_grad, agent.model.parameters())
    if config['training']['optimizer']['step_rule'] == 'sgd':
        optimizer = torch.optim.SGD(parameters, lr=init_learning_rate)
    elif config['training']['optimizer']['step_rule'] == 'adam':
        optimizer = torch.optim.Adam(parameters, lr=init_learning_rate)

    log_every = 100
    reward_avg = SlidingAverage('reward avg', steps=log_every)
    step_avg = SlidingAverage('step avg', steps=log_every)
    loss_avg = SlidingAverage('loss avg', steps=log_every)

    # save & reload checkpoint only in 0th agent
    best_avg_reward = -10000
    best_avg_step = 10000

    # step penalty
    discount_gamma = config['general']['discount_gamma']
    provide_prev_action = config['general']['provide_prev_action']

    # epsilon greedy
    epsilon_anneal_epochs = config['general']['epsilon_anneal_epochs']
    epsilon_anneal_from = config['general']['epsilon_anneal_from']
    epsilon_anneal_to = config['general']['epsilon_anneal_to']

    # counting reward
    revisit_counting_lambda_anneal_epochs = config['general'][
        'revisit_counting_lambda_anneal_epochs']
    revisit_counting_lambda_anneal_from = config['general'][
        'revisit_counting_lambda_anneal_from']
    revisit_counting_lambda_anneal_to = config['general'][
        'revisit_counting_lambda_anneal_to']

    epsilon = epsilon_anneal_from
    revisit_counting_lambda = revisit_counting_lambda_anneal_from
    for epoch in range(config['training']['scheduling']['epoch']):

        agent.model.train()
        obs, infos = env.reset()
        agent.reset(infos)
        print_command_string, print_rewards = [[] for _ in infos
                                               ], [[] for _ in infos]
        print_interm_rewards = [[] for _ in infos]
        print_rc_rewards = [[] for _ in infos]

        dones = [False] * batch_size
        rewards = None
        avg_loss_in_this_game = []

        new_observation_strings = agent.get_observation_strings(infos)
        if revisit_counting:
            agent.reset_binarized_counter(batch_size)
            revisit_counting_rewards = agent.get_binarized_count(
                new_observation_strings)

        current_game_step = 0
        prev_actions = ["" for _ in range(batch_size)
                        ] if provide_prev_action else None
        input_description, description_id_list = agent.get_game_step_info(
            obs, infos, prev_actions)

        while not all(dones):

            v_idx, n_idx, chosen_strings, state_representation = agent.generate_one_command(
                input_description, epsilon=epsilon)
            obs, rewards, dones, infos = env.step(chosen_strings)
            new_observation_strings = agent.get_observation_strings(infos)
            if provide_prev_action:
                prev_actions = chosen_strings
            # counting
            if revisit_counting:
                revisit_counting_rewards = agent.get_binarized_count(
                    new_observation_strings, update=True)
            else:
                revisit_counting_rewards = [0.0 for _ in range(batch_size)]
            agent.revisit_counting_rewards.append(revisit_counting_rewards)
            revisit_counting_rewards = [
                float(format(item, ".3f")) for item in revisit_counting_rewards
            ]

            for i in range(len(infos)):
                print_command_string[i].append(chosen_strings[i])
                print_rewards[i].append(rewards[i])
                print_interm_rewards[i].append(infos[i]["intermediate_reward"])
                print_rc_rewards[i].append(revisit_counting_rewards[i])
            if type(dones) is bool:
                dones = [dones] * batch_size
            agent.rewards.append(rewards)
            agent.dones.append(dones)
            agent.intermediate_rewards.append(
                [info["intermediate_reward"] for info in infos])
            # compute rewards, and push into replay memory
            rewards_np, rewards, mask_np, mask = agent.compute_reward(
                revisit_counting_lambda=revisit_counting_lambda,
                revisit_counting=revisit_counting)

            curr_description_id_list = description_id_list
            input_description, description_id_list = agent.get_game_step_info(
                obs, infos, prev_actions)

            for b in range(batch_size):
                if mask_np[b] == 0:
                    continue
                if replay_memory_priority_fraction == 0.0:
                    # vanilla replay memory
                    agent.replay_memory.push(curr_description_id_list[b],
                                             v_idx[b], n_idx[b], rewards[b],
                                             mask[b], dones[b],
                                             description_id_list[b],
                                             new_observation_strings[b])
                else:
                    # prioritized replay memory
                    is_prior = rewards_np[b] > 0.0
                    agent.replay_memory.push(is_prior,
                                             curr_description_id_list[b],
                                             v_idx[b], n_idx[b], rewards[b],
                                             mask[b], dones[b],
                                             description_id_list[b],
                                             new_observation_strings[b])

            if current_game_step > 0 and current_game_step % config["general"][
                    "update_per_k_game_steps"] == 0:
                policy_loss = agent.update(replay_batch_size,
                                           discount_gamma=discount_gamma)
                if policy_loss is None:
                    continue
                loss = policy_loss
                # Backpropagate
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                torch.nn.utils.clip_grad_norm(
                    agent.model.parameters(),
                    config['training']['optimizer']['clip_grad_norm'])
                optimizer.step()  # apply gradients
                avg_loss_in_this_game.append(to_np(policy_loss))
            current_game_step += 1

        agent.finish()
        avg_loss_in_this_game = np.mean(avg_loss_in_this_game)
        reward_avg.add(agent.final_rewards.mean())
        step_avg.add(agent.step_used_before_done.mean())
        loss_avg.add(avg_loss_in_this_game)
        # annealing
        if epoch < epsilon_anneal_epochs:
            epsilon -= (epsilon_anneal_from -
                        epsilon_anneal_to) / float(epsilon_anneal_epochs)
        if epoch < revisit_counting_lambda_anneal_epochs:
            revisit_counting_lambda -= (
                revisit_counting_lambda_anneal_from -
                revisit_counting_lambda_anneal_to
            ) / float(revisit_counting_lambda_anneal_epochs)

        # Tensorboard logging #
        # (1) Log some numbers
        if (epoch + 1
            ) % config["training"]["scheduling"]["logging_frequency"] == 0:
            summary.add_scalar('avg_reward', reward_avg.value, epoch + 1)
            summary.add_scalar('curr_reward', agent.final_rewards.mean(),
                               epoch + 1)
            summary.add_scalar('curr_interm_reward',
                               agent.final_intermediate_rewards.mean(),
                               epoch + 1)
            summary.add_scalar('curr_counting_reward',
                               agent.final_counting_rewards.mean(), epoch + 1)
            summary.add_scalar('avg_step', step_avg.value, epoch + 1)
            summary.add_scalar('curr_step', agent.step_used_before_done.mean(),
                               epoch + 1)
            summary.add_scalar('loss_avg', loss_avg.value, epoch + 1)
            summary.add_scalar('curr_loss', avg_loss_in_this_game, epoch + 1)

        msg = 'E#{:03d}, R={:.3f}/{:.3f}/IR{:.3f}/CR{:.3f}, S={:.3f}/{:.3f}, L={:.3f}/{:.3f}, epsilon={:.4f}, lambda_counting={:.4f}'
        msg = msg.format(epoch, np.mean(reward_avg.value),
                         agent.final_rewards.mean(),
                         agent.final_intermediate_rewards.mean(),
                         agent.final_counting_rewards.mean(),
                         np.mean(step_avg.value),
                         agent.step_used_before_done.mean(),
                         np.mean(loss_avg.value), avg_loss_in_this_game,
                         epsilon, revisit_counting_lambda)
        if (epoch + 1
            ) % config["training"]["scheduling"]["logging_frequency"] == 0:
            print("=========================================================")
            for prt_cmd, prt_rew, prt_int_rew, prt_rc_rew in zip(
                    print_command_string, print_rewards, print_interm_rewards,
                    print_rc_rewards):
                print("------------------------------")
                print(prt_cmd)
                print(prt_rew)
                print(prt_int_rew)
                print(prt_rc_rew)
        print(msg)
        # test on a different set of games
        if run_test and (epoch + 1) % config["training"]["scheduling"][
                "logging_frequency"] == 0:
            valid_R, valid_IR, valid_S = test(config, valid_env, agent,
                                              test_batch_size, word2id)
            summary.add_scalar('valid_reward', valid_R, epoch + 1)
            summary.add_scalar('valid_interm_reward', valid_IR, epoch + 1)
            summary.add_scalar('valid_step', valid_S, epoch + 1)

            # save & reload checkpoint by best valid performance
            model_checkpoint_path = config['training']['scheduling'][
                'model_checkpoint_path']
            if valid_R > best_avg_reward or (valid_R == best_avg_reward
                                             and valid_S < best_avg_step):
                best_avg_reward = valid_R
                best_avg_step = valid_S
                torch.save(agent.model.state_dict(), model_checkpoint_path)
                print("========= saved checkpoint =========")
                for test_id in range(len(test_env_list)):
                    R, IR, S = test(config, test_env_list[test_id], agent,
                                    test_batch_size, word2id)
                    summary.add_scalar('test_reward_' + str(test_id), R,
                                       epoch + 1)
                    summary.add_scalar('test_interm_reward_' + str(test_id),
                                       IR, epoch + 1)
                    summary.add_scalar('test_step_' + str(test_id), S,
                                       epoch + 1)
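
For orientation, train(config) reads roughly the following nested keys (normally supplied via the project's config.yaml); every value below is a placeholder for illustration, not the project's actual defaults:

# Skeleton of the config structure consumed by train(); values are placeholders.
example_config = {
    'general': {
        'env_id': 'some_textworld_env_id',        # hypothetical gym_textworld env id
        'valid_env_id': 'some_valid_env_id',
        'test_env_id': ['some_test_env_id'],
        'random_seed': 42,
        'run_test': True,
        'use_cuda': False,
        'revisit_counting': True,
        'replay_batch_size': 32,
        'replay_memory_capacity': 500000,
        'replay_memory_priority_fraction': 0.25,
        'discount_gamma': 0.5,
        'provide_prev_action': True,
        'epsilon_anneal_epochs': 1000,
        'epsilon_anneal_from': 1.0,
        'epsilon_anneal_to': 0.2,
        'revisit_counting_lambda_anneal_epochs': 1000,
        'revisit_counting_lambda_anneal_from': 1.0,
        'revisit_counting_lambda_anneal_to': 0.0,
        'update_per_k_game_steps': 4,
    },
    'training': {
        'scheduling': {
            'batch_size': 10,
            'test_batch_size': 10,
            'epoch': 10000,
            'logging_frequency': 100,
            'model_checkpoint_path': 'saved_models/model.pt',
        },
        'optimizer': {
            'learning_rate': 0.001,
            'step_rule': 'adam',      # 'sgd' or 'adam'
            'clip_grad_norm': 5,
        },
    },
}
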
Code example #10
File: train.py  Project: JRConti/TapnSwap-RL
def train(n_epochs,
          epsilon,
          gamma,
          load_model,
          filename,
          random_opponent,
          n_games_test,
          freq_test,
          n_skip_games=int(0),
          verbose=False):
    """
  Train 2 agents by making them play and learn together. Save the
  learned Q-function into a CSV file. It is possible to have 1 of
  the agents play (against either the user or a Random Agent) during
  training, as often as desired. It is also possible to continue
  training an already trained model.

  Parameters
  ----------
  n_epochs: int
    Number of games used for training.
  epsilon: float (in [0,1])
    Fraction of greedy decisions during training of the 2 RL Agents.
  gamma: float (in [0,1])
    Factor weighting the significance of early actions relative to
    later ones for the 2 RL Agents.
  load_model: string
    CSV filename in which the learned Q-function of an agent is
    stored. If load_model = 'model', the function loads the model
    './Models/model.csv'. If load_model is not None, the previous
    parameters epsilon and gamma are used for a second training.
  filename: string
    Name of the CSV file that will store the learned Q-function 
    of one of the agents. The path to CSV file is 
    then ./Models/filename.csv. The counter of state-action
    pairs is also stored at ./Models/data/count_filename.csv for
    future training.
  random_opponent: boolean
    If set to True, the function trains 1 RL Agent by making it
    play against a Random Agent. Otherwise, the RL agent is
    trained by playing against another version of itself.
  n_games_test: int
    Number of games one of the RL Agents plays against a Random Agent
    for testing. If set to 0, the RL Agents will not be tested by a 
    Random Agent. 
  freq_test: int
    Number of epochs after which one of the RL Agents plays n_games_test
    games against a Random Agent. If set to 1000, one of the RL Agents
    is tested against a Random Agent every 1000 epochs of training.
    If set to 0, test occurs at the last epoch of training only.
    If set to -1, none of the agents is tested during training.
  n_skip_games: int 
    Number of epochs after which the user can choose to play 
    against one of the learning agents. If set to 1000, the user can
    choose to play against one agent every 1000 games. If set to 0,
    the user can choose to play against one agent at the last epoch
    only. If set to -1, no choice is offered and the user cannot test
    any agent.
  verbose: boolean
    If set to True, each game action during training has a 
    written explanation.

  Return
  ------
  learning_results: list
    Only meaningful when n_games_test > 0 (otherwise an empty list is
    returned). List of the test results against a Random Agent,
    collected every freq_test epochs. Each test result is a list:
    [current epoch, score of RL Agent, number of finished games,
    n_games_test].
  """

    # Learning agent
    agent1 = RLAgent(epsilon, gamma)
    if load_model is not None:
        agent1.load_model(load_model)

    # Choose opponent
    if random_opponent:
        agent2 = RandomAgent()
        time_limit = None
        print('Training vs Random')
    else:
        agent2 = RLAgent(epsilon, gamma)
        if load_model is not None:
            agent2.load_model(load_model)
        time_limit = None
        print('Training vs Self')

    start_idx = 0
    scores = [0, 0]

    # If the user only confronts the agent at the last epoch
    # or if no confrontation
    if n_skip_games in [-1, 0]:
        n_skip_games = n_epochs - n_skip_games

    # Boolean for game between the user and agent1 preceding a game
    # between agent1 and agent2
    play_checkpoint_usr = False

    # If there is a test of agent1 at the last epoch only or no test
    if freq_test in [-1, 0]:
        freq_test = n_epochs - freq_test

    # Number of games between agent1 and a Random Agent for testing
    n_games_test_mem = n_games_test
    learning_results = []

    # Start training
    print('Training epoch:')
    for epoch in range(1, n_epochs + 1):

        if epoch % (n_epochs // 10) == 0:
            print(epoch, '/', n_epochs)

        #Update boolean for playing with user
        play_checkpoint_usr = bool(epoch % n_skip_games == 0)
        if play_checkpoint_usr:
            # Print training status
            print('Number of games: ', epoch)
            print('Scores: ', scores)
            # Ask user to play
            play = int(input('Play ? (1 Yes | 0 No)\n'))
            play_checkpoint_usr = bool(play)

        # Update boolean for test
        n_games_test = int(epoch % freq_test == 0) * n_games_test_mem

        # Start game
        game_over, winner, test_results = game_2Agents(
            agent1,
            agent2,
            start_idx=start_idx,
            train=True,
            time_limit=time_limit,
            n_games_test=n_games_test,
            play_checkpoint_usr=play_checkpoint_usr,
            verbose=verbose)

        assert game_over, str('Game not over but new game' +
                              ' beginning during training')

        if winner in [0, 1]:
            scores[winner] += 1

        # Save test games of agent1 against a Random Agent
        if bool(n_games_test):
            assert len(test_results) != 0, \
            'Agent1 has been tested but there is no result of that.'
            learning_results.append(
                [epoch, test_results[2], test_results[0], test_results[1]])

        # Next round
        start_idx = 1 - start_idx

    # Save Q-function of agent1
    np.savetxt(str('Models/' + filename + '.csv'), agent1.Q, delimiter=',')
    # Save stats for learning rate of agent1
    np.savetxt(str('Models/data/count_' + filename + '.csv'),
               agent1.count_state_action,
               delimiter=',')

    return learning_results
Code example #11
File: train.py  Project: JRConti/TapnSwap-RL
    np.savetxt(str('Models/data/count_' + filename + '.csv'),
               agent1.count_state_action,
               delimiter=',')

    return learning_results


if __name__ == "__main__":

    train(n_epochs=5000,
          epsilon=0.6,
          gamma=1.0,
          load_model=None,
          filename='greedy0_6_vsSelf_test',
          random_opponent=False,
          n_games_test=0,
          freq_test=-1,
          n_skip_games=-1,
          verbose=False)

    agent1 = RLAgent()
    agent1.load_model('greedy0_2_vsRandomvsSelf')
    agent2 = RLAgent()
    agent2.load_model('greedy0_6_vsSelf_test')
    results = compare_agents(agent1,
                             agent2,
                             n_games=10,
                             time_limit=None,
                             verbose=False)
    print(results)
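
Elsewhere in this project (the delete_temp and tournament examples above), results[2] and results[3] are used as the scores of agent1 and agent2, so the bare print could be replaced by something more readable; the wording below is only an illustration:

# results[2] / results[3] are used elsewhere in this project as the scores of
# agent1 / agent2 over the games played.
print('greedy0_2_vsRandomvsSelf won {} games, greedy0_6_vsSelf_test won {} (out of 10)'
      .format(results[2], results[3]))
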
Code example #12
def game_mngr():
    """
  Game manager, used for navigation among different choices 
  offered to user.
  """

    # Options
    command = options('PLAY', 'RULES', 'Tap 1 to play or 2 to read the rules')

    # Rules page
    if int(command) == 2:
        print_rules()
        # Go back
        print('Tap 1 to come back to the main menu\n')
        comeback = tap_valid_digits([1])
        if int(comeback):
            game_mngr()

    # Game page
    if int(command) == 1:
        # Options
        players = options('PLAYER',
                          'PLAYERS',
                          'How many players ?',
                          comeback=True)

        # Go back
        if int(players) == 0:
            game_mngr()

        # 2 players
        if int(players) == 2:

            # Ask players' name
            player1, player2 = input_names(n_players=2)

            # Init scores
            scores = [0, 0]

            # Games
            tapnswap = TapnSwap()
            over = False
            while not over:
                game_over, winner = game_1vs1(tapnswap, player1, player2)
                scores[winner] += 1
                if game_over:
                    # Display scores
                    restart = display_endgame(scores, player1, player2)
                    # Go back
                    if not restart:
                        over = True
                        game_mngr()

        # 1 player
        if int(players) == 1:

            # Options
            level = options('EASY',
                            'DIFFICULT',
                            'Which level ?',
                            comeback=True)

            # Go back
            if int(level) == 0:
                game_mngr()

            # Define agent
            elif int(level) == 1:
                agent = RandomAgent()  # easy
            else:
                # Load agent
                agent = RLAgent()
                agent.load_model('greedy0_2_vsRandomvsSelf')  # difficult

            # Ask player's name
            player = input_names(n_players=1)

            # Init scores
            scores = [0, 0]

            # Games
            tapnswap = TapnSwap()
            over = False
            while not over:
                game_over, winner = game_1vsAgent(tapnswap,
                                                  player,
                                                  agent,
                                                  greedy=False)
                scores[winner] += 1
                if game_over:
                    # Display scores
                    restart = display_endgame(scores, player, 'Computer')
                    # Go back
                    if not restart:
                        over = True
                        game_mngr()
Code example #13
args = parser.parse_args()

# create world
world = World(args.config_file, thread_num=args.thread)

# create agents
agents = []
for i in world.intersections:
    action_space = gym.spaces.Discrete(len(i.phases))
    agents.append(
        RLAgent(
            action_space,
            LaneVehicleGenerator(world,
                                 i, ["lane_count"],
                                 in_only=True,
                                 average="road"),
            LaneVehicleGenerator(world,
                                 i, ["lane_waiting_count"],
                                 in_only=True,
                                 average="all",
                                 negative=True)))

# create metric
metric = TravelTimeMetric(world)

# create env
env = TSCEnv(world, agents, metric)

# simulate
obs = env.reset()
for i in range(args.steps):
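    # The listing stops here; a plausible loop body (an assumption, not this
    # project's actual code) would ask each agent for an action from its
    # observation and step the environment with the joint action list.
    # get_action is a hypothetical RLAgent method used for illustration.
    actions = [agent.get_action(ob) for agent, ob in zip(agents, obs)]
    obs, rewards, dones, info = env.step(actions)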