def __init__(self, n_particles, game, game_params, gamma):
    self.gamma = gamma
    self.envs = []
    self.states = []
    for _ in range(n_particles):  # build one environment copy per particle
        env = make_game(game, game_params)
        env.seed(np.random.randint(1e7))  # draw some Env seed
        s = env.reset()
        self.envs.append(env)
        self.states.append(s)
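
# Hedged usage sketch (not part of the source): assuming the __init__ above
# belongs to a particle-pool class, the environment copies in self.envs can be
# advanced in lockstep to produce next-state particles. The helper name
# step_particles and the reset-on-termination behaviour are illustrative
# assumptions, not taken from the repository.
def step_particles(pool, a):
    """Advance every particle environment with the same action a."""
    for i, env in enumerate(pool.envs):
        s1, _, done, _ = env.step(a)
        pool.states[i] = env.reset() if done else s1
    return pool.states
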
def train(game, n_ep, n_mcts, max_ep_len, lr, c, gamma, data_size, batch_size,
          temp, n_hidden_layers, n_hidden_units):
    """ Outer training loop """

    episode_returns = []  # storage
    timepoints = []
    # Environments
    if game == "teaching":
        env = TeachingEnv()
        bootstrap_last_state_value = False
    else:
        env = make_game(game)
        bootstrap_last_state_value = True
    is_atari = is_atari_game(env)
    mcts_env = make_game(game) if is_atari else None

    replay_buffer = ReplayBuffer(max_size=data_size, batch_size=batch_size)
    model = Model(env=env,
                  lr=lr,
                  n_hidden_layers=n_hidden_layers,
                  n_hidden_units=n_hidden_units)
    t_total = 0  # total steps
    R_best = -np.Inf

    start_training = time.time()

    for ep in range(n_ep):
        start = time.time()
        s = env.reset()
        R = 0.0  # Total return counter
        a_store = []
        seed = np.random.randint(1e7)  # draw some Env seed
        env.seed(seed)
        if is_atari:
            mcts_env.reset()
            mcts_env.seed(seed)

        # the object responsible for MCTS searches
        mcts = MCTS(root_index=s,
                    root=None,
                    model=model,
                    na=model.action_dim,
                    gamma=gamma,
                    bootstrap_last_state_value=bootstrap_last_state_value)

        if game == "teaching":
            iterator = tqdm(range(env.t_max))
        else:
            iterator = range(max_ep_len)
        for _ in iterator:
            # MCTS step
            mcts.search(n_mcts=n_mcts, c=c, env=env,
                        mcts_env=mcts_env)  # perform a forward search
            state, pi, v = mcts.return_results(temp)  # extract the root output
            replay_buffer.store((state, v, pi))

            # Make the true step
            a = np.random.choice(len(pi), p=pi)
            a_store.append(a)
            s1, r, terminal, _ = env.step(a)
            R += r
            t_total += n_mcts  # total number of environment steps (counts the mcts steps)

            if terminal:
                break
            else:
                mcts.forward(a, s1)

        # Finished episode
        episode_returns.append(R)  # store the total episode return
        timepoints.append(
            t_total)  # store the timestep count of the episode return
        store_safely({'R': episode_returns, 't': timepoints})

        if R > R_best:
            a_best = a_store
            seed_best = seed
            R_best = R
        print(f'Finished episode {ep}, total return: {np.round(R,2)}, '
              f'time episode: {time.time()-start:.1f} sec, '
              f'total time since: {time.time()-start_training:.1f} sec')
        # Train
        replay_buffer.shuffle()
        for sb, vb, pib in replay_buffer:
            model.train_on_example(sb=sb, vb=vb, pib=pib)
    # Return results
    return episode_returns, timepoints, a_best, seed_best, R_best
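
# Hedged usage sketch (illustrative values, not from the source): a minimal
# call to the train() loop defined above.
returns, steps, best_actions, best_seed, best_return = train(
    game='CartPole-v0', n_ep=100, n_mcts=25, max_ep_len=200, lr=1e-3,
    c=1.5, gamma=0.99, data_size=1000, batch_size=32, temp=1.0,
    n_hidden_layers=2, n_hidden_units=128)
print('Best training return:', best_return)
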
# --- Example #3 ---
def agent(game,
          n_ep,
          n_mcts,
          max_ep_len,
          lr,
          c,
          gamma,
          data_size,
          batch_size,
          temp,
          n_hidden_layers,
          n_hidden_units,
          stochastic=False,
          eval_freq=-1,
          eval_episodes=100,
          alpha=0.6,
          out_dir='../',
          pre_process=None,
          visualize=False):
    ''' Outer training loop '''
    if pre_process is not None:
        pre_process()

    # tf.reset_default_graph()

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    episode_returns = []  # storage
    timepoints = []
    # Environments
    Env = make_game(game)
    is_atari = is_atari_game(Env)
    mcts_env = make_game(game) if is_atari else None
    online_scores = []
    offline_scores = []
    mcts_params = dict(gamma=gamma)
    if stochastic:
        mcts_params['alpha'] = alpha
        mcts_maker = MCTSStochastic
    else:
        mcts_maker = MCTS

    D = Database(max_size=data_size, batch_size=batch_size)
    model = Model(Env=Env,
                  lr=lr,
                  n_hidden_layers=n_hidden_layers,
                  n_hidden_units=n_hidden_units)
    t_total = 0  # total steps
    R_best = -np.Inf

    with tf.Session() as sess:
        model.sess = sess
        sess.run(tf.global_variables_initializer())

        for ep in range(n_ep):
            if eval_freq > 0 and ep % eval_freq == 0:  #and ep > 0
                print(
                    'Evaluating policy for {} episodes!'.format(eval_episodes))
                seed = np.random.randint(1e7)  # draw some Env seed
                Env.seed(seed)
                s = Env.reset()
                mcts = mcts_maker(root_index=s,
                                  root=None,
                                  model=model,
                                  na=model.action_dim,
                                  **mcts_params)
                env_wrapper = EnvEvalWrapper()
                env_wrapper.mcts = mcts
                starting_states = []

                def reset_env():
                    s = Env.reset()
                    env_wrapper.mcts = mcts_maker(root_index=s,
                                                  root=None,
                                                  model=model,
                                                  na=model.action_dim,
                                                  **mcts_params)
                    starting_states.append(s)
                    if env_wrapper.curr_probs is not None:
                        env_wrapper.episode_probabilities.append(
                            env_wrapper.curr_probs)
                    env_wrapper.curr_probs = []
                    return s

                def forward(a, s, r):
                    env_wrapper.mcts.forward(a, s, r)
                    #pass

                env_wrapper.reset = reset_env
                env_wrapper.step = lambda x: Env.step(x)
                env_wrapper.forward = forward
                env_wrapper.episode_probabilities = []
                env_wrapper.curr_probs = None

                def pi_wrapper(ob):
                    if not is_atari:
                        mcts_env = None
                    env_wrapper.mcts.search(n_mcts=n_mcts,
                                            c=c,
                                            Env=Env,
                                            mcts_env=mcts_env)
                    state, pi, V = env_wrapper.mcts.return_results(temp=0)
                    #pi = model.predict_pi(s).flatten()
                    env_wrapper.curr_probs.append(pi)
                    a = np.argmax(pi)
                    return a

                rews, lens = eval_policy(pi_wrapper,
                                         env_wrapper,
                                         n_episodes=eval_episodes,
                                         verbose=True)
                offline_scores.append([
                    np.min(rews),
                    np.max(rews),
                    np.mean(rews),
                    np.std(rews),
                    len(rews),
                    np.mean(lens)
                ])
                # if len(rews) < eval_episodes or len(rews) == 0:
                #     print("WTF")
                # if np.std(rews) == 0.:
                #     print("WTF 2")
                np.save(out_dir + '/offline_scores.npy', offline_scores)
            start = time.time()
            s = Env.reset()
            R = 0.0  # Total return counter
            a_store = []
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            if is_atari:
                mcts_env.reset()
                mcts_env.seed(seed)
            if eval_freq > 0 and ep % eval_freq == 0:
                print("Collecting %d episodes" % eval_freq)
            mcts = mcts_maker(
                root_index=s,
                root=None,
                model=model,
                na=model.action_dim,
                **mcts_params)  # the object responsible for MCTS searches
            for t in range(max_ep_len):
                # MCTS step
                if not is_atari:
                    mcts_env = None
                mcts.search(n_mcts=n_mcts, c=c, Env=Env,
                            mcts_env=mcts_env)  # perform a forward search
                if visualize:
                    mcts.visualize()
                state, pi, V = mcts.return_results(
                    temp)  # extract the root output
                D.store((state, V, pi))

                # Make the true step
                a = np.random.choice(len(pi), p=pi)
                a_store.append(a)
                s1, r, terminal, _ = Env.step(a)
                R += r
                t_total += n_mcts  # total number of environment steps (counts the mcts steps)

                if terminal:
                    break
                else:
                    mcts.forward(a, s1, r)

            # Finished episode
            episode_returns.append(R)  # store the total episode return
            online_scores.append(R)
            timepoints.append(
                t_total)  # store the timestep count of the episode return
            store_safely(out_dir, 'result', {
                'R': episode_returns,
                't': timepoints
            })
            np.save(out_dir + '/online_scores.npy', online_scores)
            # print('Finished episode {}, total return: {}, total time: {} sec'.format(ep, np.round(R, 2),
            #                                                                          np.round((time.time() - start),
            #                                                                                   1)))

            if R > R_best:
                a_best = a_store
                seed_best = seed
                R_best = R

            # Train
            D.reshuffle()
            try:
                for epoch in range(1):
                    for sb, Vb, pib in D:
                        model.train(sb, Vb, pib)
            except Exception as e:
                print("ASD")
            model.save(out_dir + 'model')
    # Return results
    return episode_returns, timepoints, a_best, seed_best, R_best, offline_scores
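
# Hedged usage sketch (illustrative values, not from the source): running the
# agent above with the stochastic (DPW) search variant and offline policy
# evaluation every 20 training episodes.
results = agent(game='CartPole-v0', n_ep=200, n_mcts=50, max_ep_len=200,
                lr=1e-3, c=1.5, gamma=0.99, data_size=2000, batch_size=64,
                temp=1.0, n_hidden_layers=2, n_hidden_units=64,
                stochastic=True, alpha=0.6, eval_freq=20, eval_episodes=10,
                out_dir='./results/')
episode_returns, timepoints, a_best, seed_best, R_best, offline_scores = results
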
if not args.gpu:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Accept custom grid if the environment requires it
if args.game == 'Taxi' or args.game == 'TaxiEasy':
    game_params['grid'] = args.grid
    game_params['box'] = True
if args.game == 'RaceStrategy-v0':
    game_params['horizon'] = args.horizon

basedir = args.logdir
ts = str(time.time())
logdir = basedir + args.game + '/' + ts + '/'
if not os.path.exists(logdir):
    os.makedirs(logdir)
env = make_game(args.game, game_params)
state_dim = env.observation_space.shape[0]

discrete = True
try:
    action_dim = env.action_space.n
except AttributeError:  # continuous action spaces have no .n attribute
    action_dim = env.action_space.high.shape[0]
    discrete = False
pi = load_policy(model_path='',
                 input_dim=state_dim,
                 output_dim=action_dim,
                 num_hidden=args.n_hidden_units,
                 num_layers=args.n_hidden_layers,
                 discrete=discrete,
                 beta=1.0)
# --- Example #5 ---
def agent(game, n_ep, n_mcts, max_ep_len, lr, c, gamma, data_size, batch_size,
          temp, n_hidden_layers, n_hidden_units):
    ''' Outer training loop '''
    # tf.reset_default_graph()
    episode_returns = []  # storage
    timepoints = []
    # Environments
    Env = make_game(game)
    is_atari = is_atari_game(Env)
    mcts_env = make_game(game) if is_atari else None

    D = Database(max_size=data_size, batch_size=batch_size)
    model = Model(Env=Env,
                  lr=lr,
                  n_hidden_layers=n_hidden_layers,
                  n_hidden_units=n_hidden_units)
    t_total = 0  # total steps
    R_best = -np.Inf

    with tf.Session() as sess:
        model.sess = sess
        sess.run(tf.global_variables_initializer())
        for ep in range(n_ep):
            start = time.time()
            s = Env.reset()
            R = 0.0  # Total return counter
            a_store = []
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            if is_atari:
                mcts_env.reset()
                mcts_env.seed(seed)

            mcts = MCTS(
                root_index=s,
                root=None,
                model=model,
                na=model.action_dim,
                gamma=gamma)  # the object responsible for MCTS searches
            for t in range(max_ep_len):
                # MCTS step
                mcts.search(n_mcts=n_mcts, c=c, Env=Env,
                            mcts_env=mcts_env)  # perform a forward search
                state, pi, V = mcts.return_results(
                    temp)  # extract the root output
                D.store((state, V, pi))

                # Make the true step
                a = np.random.choice(len(pi), p=pi)
                a_store.append(a)
                s1, r, terminal, _ = Env.step(a)
                R += r
                t_total += n_mcts  # total number of environment steps (counts the mcts steps)

                if terminal:
                    break
                else:
                    mcts.forward(a, s1)

            # Finished episode
            episode_returns.append(R)  # store the total episode return
            timepoints.append(
                t_total)  # store the timestep count of the episode return
            store_safely(os.getcwd(), 'result', {
                'R': episode_returns,
                't': timepoints
            })

            if R > R_best:
                a_best = a_store
                seed_best = seed
                R_best = R
            print('Finished episode {}, total return: {}, total time: {} sec'.
                  format(ep, np.round(R, 2), np.round((time.time() - start),
                                                      1)))
            # Train
            D.reshuffle()
            for epoch in range(1):
                for sb, Vb, pib in D:
                    model.train(sb, Vb, pib)
    # Return results
    return episode_returns, timepoints, a_best, seed_best, R_best
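
# Hedged sketch (not part of the source): because each training episode
# re-seeds the environment, the returned (seed_best, a_best) pair is enough to
# replay the best episode deterministically. The helper below is illustrative.
def replay_best_episode(game, seed_best, a_best):
    env = make_game(game)
    env.seed(seed_best)
    env.reset()
    R = 0.0
    for a in a_best:
        _, r, terminal, _ = env.step(a)
        R += r
        if terminal:
            break
    return R
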
# --- Example #6 ---
def agent(game,
          n_ep,
          n_mcts,
          max_ep_len,
          lr,
          c,
          gamma,
          data_size,
          batch_size,
          temp,
          n_hidden_layers,
          n_hidden_units,
          stochastic=False,
          eval_freq=-1,
          eval_episodes=100,
          alpha=0.6,
          n_epochs=100,
          c_dpw=1,
          numpy_dump_dir='../',
          pre_process=None,
          visualize=False,
          game_params=None,
          parallelize_evaluation=False,
          mcts_only=False,
          particles=0,
          show_plots=False,
          n_workers=1,
          use_sampler=False,
          budget=np.inf,
          unbiased=False,
          biased=False,
          max_workers=100,
          variance=False,
          depth_based_bias=False,
          scheduler_params=None,
          out_dir=None,
          render=False,
          second_version=False,
          third_version=False):
    # game_params is mutated below, so avoid a shared mutable default argument
    if game_params is None:
        game_params = {}
    visualizer = None

    # if particles:
    #     parallelize_evaluation = False  # Cannot run parallelized evaluation with particle filtering

    if not mcts_only:
        from mcts import MCTS
        from mcts_dpw import MCTSStochastic
    elif particles:
        if unbiased:
            from particle_filtering.ol_uct import OL_MCTS
        elif biased:
            if second_version:
                from particle_filtering.pf_uct_2 import PFMCTS2 as PFMCTS
            elif third_version:
                from particle_filtering.pf_uct_3 import PFMCTS3 as PFMCTS
            else:
                from particle_filtering.pf_uct import PFMCTS
        else:
            from particle_filtering.pf_mcts_edo import PFMCTS
    else:
        from pure_mcts.mcts import MCTS
        from pure_mcts.mcts_dpw import MCTSStochastic

    if parallelize_evaluation:
        print("The evaluation will be parallel")

    parameter_list = {
        "game": game,
        "n_ep": n_ep,
        "n_mcts": n_mcts,
        "max_ep_len": max_ep_len,
        "lr": lr,
        "c": c,
        "gamma": gamma,
        "data_size": data_size,
        "batch_size": batch_size,
        "temp": temp,
        "n_hidden_layers": n_hidden_layers,
        "n_hidden_units": n_hidden_units,
        "stochastic": stochastic,
        "eval_freq": eval_freq,
        "eval_episodes": eval_episodes,
        "alpha": alpha,
        "n_epochs": n_epochs,
        "out_dir": numpy_dump_dir,
        "pre_process": pre_process,
        "visualize": visualize,
        "game_params": game_params,
        "n_workers": n_workers,
        "use_sampler": use_sampler,
        "variance": variance,
        "depth_based_bias": depth_based_bias,
        "unbiased": unbiased,
        "second_version": second_version,
        'third_version': third_version
    }
    if out_dir is not None:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        with open(os.path.join(out_dir, "parameters.txt"), 'w') as d:
            d.write(json.dumps(parameter_list))
    #logger = Logger(parameter_list, game, show=show_plots)

    if DEBUG_TAXI:
        from utils.visualization.taxi import TaxiVisualizer
        with open(game_params["grid"]) as f:
            m = f.readlines()
            matrix = []
            for r in m:
                row = []
                for ch in r.strip('\n'):
                    row.append(ch)
                matrix.append(row)
            visualizer = TaxiVisualizer(matrix)
            f.close()
            exit()
    ''' Outer training loop '''
    if pre_process is not None:
        pre_process()

    # numpy_dump_dir = logger.numpy_dumps_dir
    #
    # if not os.path.exists(numpy_dump_dir):
    #     os.makedirs(numpy_dump_dir)

    episode_returns = []  # storage
    timepoints = []

    # Environments
    if game == 'Trading-v0':
        game_params['save_dir'] = out_dir  #logger.save_dir
    Env = make_game(game, game_params)
    num_actions = Env.action_space.n
    sampler = None
    if use_sampler and not (unbiased or biased):

        def make_pi(action_space):
            def pi(s):
                return np.random.randint(low=0, high=action_space.n)

            return pi

        def make_env():
            return make_game(game, game_params)

        sampler = ParallelSampler(make_pi=make_pi,
                                  make_env=make_env,
                                  n_particles=particles,
                                  n_workers=n_workers,
                                  seed=10)

    is_atari = is_atari_game(Env)
    mcts_env = make_game(game, game_params) if is_atari else None
    online_scores = []
    offline_scores = []

    # Setup the parameters for generating the search environments

    if game == "RaceStrategy-v1":
        mcts_maker, mcts_params, c_dpw = load_race_agents_config(
            'envs/configs/race_strategy_full.json', gamma)

    else:
        mcts_params = dict(gamma=gamma)
        if particles:
            if not (biased or unbiased):
                mcts_params['particles'] = particles
                mcts_params['sampler'] = sampler
            elif biased:
                mcts_params['alpha'] = alpha
                mcts_maker = PFMCTS

            mcts_params['depth_based_bias'] = depth_based_bias
            if unbiased:
                mcts_params['variance'] = variance
                mcts_maker = OL_MCTS

        elif stochastic:
            mcts_params['alpha'] = alpha
            mcts_params['depth_based_bias'] = depth_based_bias
            mcts_maker = MCTSStochastic
        else:
            mcts_maker = MCTS

    # Prepare the database for storing training data to be sampled
    db = Database(max_size=data_size, batch_size=batch_size)

    # TODO extract dimensions to avoid allocating model
    # Setup the model
    model_params = {
        "Env": Env,
        "lr": lr,
        "n_hidden_layers": n_hidden_layers,
        "n_hidden_units": n_hidden_units,
        "joint_networks": True
    }

    model_wrapper = ModelWrapper(**model_params)

    t_total = 0  # total steps
    R_best = -np.Inf
    a_best = None
    seed_best = None

    # Variables for storing values to be plotted
    avgs = []
    stds = []

    # Run the episodes
    for ep in range(n_ep):

        if DEBUG_TAXI:
            visualizer.reset()

        ##### Policy evaluation step #####
        if eval_freq > 0 and ep % eval_freq == 0:  # and ep > 0
            print(
                '--------------------------------\nEvaluating policy for {} episodes!'
                .format(eval_episodes))
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            s = Env.reset()

            if parallelize_evaluation:
                penv = None
                pgame = {
                    "game_maker": make_game,
                    "game": game,
                    "game_params": game_params
                }
            else:
                penv = Env
                pgame = None

            model_file = os.path.join(out_dir, "model.h5")

            # model_wrapper.save(model_file)

            if game == "RaceStrategy-v1":
                env_wrapper = RaceWrapper(s,
                                          mcts_maker,
                                          model_file,
                                          model_params,
                                          mcts_params,
                                          is_atari,
                                          n_mcts,
                                          budget,
                                          mcts_env,
                                          c_dpw,
                                          temp,
                                          env=penv,
                                          game_maker=pgame,
                                          mcts_only=mcts_only,
                                          scheduler_params=scheduler_params)
            else:
                env_wrapper = Wrapper(s,
                                      mcts_maker,
                                      model_file,
                                      model_params,
                                      mcts_params,
                                      is_atari,
                                      n_mcts,
                                      budget,
                                      mcts_env,
                                      c_dpw,
                                      temp,
                                      env=penv,
                                      game_maker=pgame,
                                      mcts_only=mcts_only,
                                      scheduler_params=scheduler_params)

            # Run the evaluation
            if parallelize_evaluation:
                total_reward, reward_per_timestep, lens, action_counts = \
                    parallelize_eval_policy(env_wrapper, n_episodes=eval_episodes, verbose=False, max_len=max_ep_len,
                                            max_workers=max_workers, out_dir=out_dir)
            else:
                total_reward, reward_per_timestep, lens, action_counts = \
                    eval_policy(env_wrapper, n_episodes=eval_episodes, verbose=False, max_len=max_ep_len,
                                visualize=visualize, out_dir=out_dir, render=render)

            # offline_scores.append([np.min(rews), np.max(rews), np.mean(rews), np.std(rews),
            #                        len(rews), np.mean(lens)])

            offline_scores.append(
                [total_reward, reward_per_timestep, lens, action_counts])

            #np.save(numpy_dump_dir + '/offline_scores.npy', offline_scores)

            # Store and plot data
            avgs.append(np.mean(total_reward))
            stds.append(np.std(total_reward))

            #logger.plot_evaluation_mean_and_variance(avgs, stds)

        ##### Policy improvement step #####

        if not mcts_only:

            start = time.time()
            s = start_s = Env.reset()
            R = 0.0  # Total return counter
            a_store = []
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            if is_atari:
                mcts_env.reset()
                mcts_env.seed(seed)

            if eval_freq > 0 and ep % eval_freq == 0:
                print("\nCollecting %d episodes" % eval_freq)
            mcts = mcts_maker(
                root_index=s,
                root=None,
                model=model_wrapper,
                na=model_wrapper.action_dim,
                **mcts_params)  # the object responsible for MCTS searches

            print("\nPerforming MCTS steps\n")

            ep_steps = 0
            start_targets = []

            for st in range(max_ep_len):

                print_step = max(max_ep_len // 10, 1)  # avoid modulo-by-zero for short episodes
                if st % print_step == 0:
                    print('Step ' + str(st + 1) + ' of ' + str(max_ep_len))

                # MCTS step
                if not is_atari:
                    mcts_env = None
                mcts.search(n_mcts=n_mcts, c=c, Env=Env,
                            mcts_env=mcts_env)  # perform a forward search

                if visualize:
                    mcts.visualize()

                state, pi, V = mcts.return_results(
                    temp)  # extract the root output

                # Save targets for starting state to debug
                if np.array_equal(start_s, state):
                    if DEBUG:
                        print("Pi target for starting state:", pi)
                    start_targets.append((V, pi))
                db.store((state, V, pi))

                # Make the true step
                a = np.random.choice(len(pi), p=pi)
                a_store.append(a)

                s1, r, terminal, _ = Env.step(a)

                # Perform command line visualization if necessary
                if DEBUG_TAXI:
                    olds, olda = copy.deepcopy(s1), copy.deepcopy(a)
                    visualizer.visualize_taxi(olds, olda)
                    print("Reward:", r)

                R += r
                t_total += n_mcts  # total number of environment steps (counts the mcts steps)
                ep_steps = st + 1

                if terminal:
                    break  # Stop the episode if we encounter a terminal state
                else:
                    mcts.forward(a, s1, r)  # Otherwise proceed

            # Finished episode
            if DEBUG:
                print("Train episode return:", R)
                print("Train episode actions:", a_store)
            episode_returns.append(R)  # store the total episode return
            online_scores.append(R)
            timepoints.append(
                t_total)  # store the timestep count of the episode return
            #store_safely(numpy_dump_dir, '/result', {'R': episode_returns, 't': timepoints})
            #np.save(numpy_dump_dir + '/online_scores.npy', online_scores)

            if DEBUG or True:
                print(
                    'Finished episode {} in {} steps, total return: {}, total time: {} sec'
                    .format(ep, ep_steps, np.round(R, 2),
                            np.round((time.time() - start), 1)))
            # Plot the online return over training episodes

            #logger.plot_online_return(online_scores)

            if R > R_best:
                a_best = a_store
                seed_best = seed
                R_best = R

            print()

            # Train only if the model has to be used
            if not mcts_only:
                # Train
                try:
                    print("\nTraining network")
                    ep_V_loss = []
                    ep_pi_loss = []

                    for _ in range(n_epochs):
                        # Reshuffle the dataset at each epoch
                        db.reshuffle()

                        batch_V_loss = []
                        batch_pi_loss = []

                        # Batch training
                        for sb, Vb, pib in db:

                            if DEBUG:
                                print("sb:", sb)
                                print("Vb:", Vb)
                                print("pib:", pib)

                            loss = model_wrapper.train(sb, Vb, pib)

                            batch_V_loss.append(loss[1])
                            batch_pi_loss.append(loss[2])

                        ep_V_loss.append(mean(batch_V_loss))
                        ep_pi_loss.append(mean(batch_pi_loss))

                    # Plot the loss over training epochs

                    #logger.plot_loss(ep, ep_V_loss, ep_pi_loss)

                except Exception as e:
                    print("Something wrong while training:", e)

                # model.save(out_dir + 'model')

                # Plot the loss over different episodes
                #logger.plot_training_loss_over_time()

                pi_start = model_wrapper.predict_pi(start_s)
                V_start = model_wrapper.predict_V(start_s)

                print("\nStart policy: ", pi_start)
                print("Start value:", V_start)

                #logger.log_start(ep, pi_start, V_start, start_targets)

    # Return results
    if use_sampler:
        sampler.close()
    return episode_returns, timepoints, a_best, seed_best, R_best, offline_scores
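
# Hedged usage sketch (illustrative values, not from the source): running the
# particle-filtering variant in planning-only mode with OL-UCT (unbiased=True)
# and evaluation every episode. The Taxi grid path is a placeholder; the grid
# parameters follow the pattern used by the surrounding scripts.
results = agent(game='Taxi', n_ep=5, n_mcts=50, max_ep_len=100, lr=1e-3,
                c=1.5, gamma=0.99, data_size=1000, batch_size=32, temp=1.0,
                n_hidden_layers=2, n_hidden_units=16, mcts_only=True,
                particles=100, unbiased=True, budget=1000, eval_freq=1,
                eval_episodes=5, out_dir='./out/',
                game_params={'grid': 'grids/grid.txt', 'box': True})
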
# --- Example #7 ---
    def __init__(self, hps, cluster_spec=None):
        self.hps = hps

        if hps.distributed:
            ps_servers = [
                "/job:ps/task:{}".format(ps_num) for ps_num in range(hps.n_ps)
            ]
            config = tf.ConfigProto(
                device_filters=ps_servers +
                ['/job:ps', '/job:worker/task:{}'.format(hps.job_index)],
                #device_count={"CPU": hps.num_agents, "GPU" : 0},
                #allow_soft_placement=True,
                inter_op_parallelism_threads=2,
                intra_op_parallelism_threads=1,
                log_device_placement=False)
            server = tf.train.Server(cluster_spec,
                                     config=config,
                                     job_name="worker",
                                     task_index=hps.job_index)
        else:
            server, config = None, None

        # Environment
        Env = make_game(hps.game)
        try:
            hps.max_ep_len = Env._max_episode_steps - 1
            logger.info('Set max steps per episode to {}'.format(
                hps.max_ep_len))
        except AttributeError:  # environment has no time limit wrapper
            logger.info(
                'Environment does not have a time limit wrapper, using {} max steps per episode'
                .format(hps.max_ep_len))

        hps.action_dim, hps.action_discrete = check_space(Env.action_space)
        hps.state_dim, hps.state_discrete = check_space(Env.observation_space)
        if not hps.action_discrete:
            raise ValueError('Continuous action space not implemented')

        # Seed
        seed = np.random.randint(1e8) + 7 * hps.job_index
        np.random.seed(seed)
        random.seed(seed)

        # Network
        model, target_model, copy_op, global_model, global_target_model, global_copy_op, sync_op = make_network(
            hps, cluster_spec)
        logger.info('Total number of trainable parameters {} million'.format(
            model_description(model.var_list) / (1e6)))
        if not hps.distributed:
            with tf.Session() as sess:
                logger.info('Initializing ..')
                sess.run(tf.global_variables_initializer())
                run(Env, hps, sess, model, target_model, copy_op, global_model,
                    global_target_model, global_copy_op, sync_op)
        else:
            # make init op
            global_init_op, local_init_op, global_variables = make_init_ops()
            sv = tf.train.Supervisor(
                is_chief=(hps.job_index == 0),
                init_op=global_init_op,
                local_init_op=local_init_op,
                ready_op=tf.report_uninitialized_variables(
                    tf.global_variables()),
                #logdir = hps.result_dir,
            )
            #print('Im worker {} before the supervisor'.format(hps.job_index))

            with sv.managed_session(server.target,
                                    config=config) as sess, sess.as_default():
                #print('Im worker {} after the supervisor'.format(hps.job_index))
                sess.run(sync_op)

                run(Env, hps, sess, model, target_model, copy_op, global_model,
                    global_target_model, global_copy_op, sync_op, sv)

            sv.stop()
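
# Hedged sketch (assumption, not the repository's actual helper): check_space
# is used above to turn a gym space into (dimension, is_discrete). One
# plausible implementation consistent with that call is shown below.
import gym.spaces

def check_space(space):
    """Return (dimension, is_discrete) for a gym action/observation space."""
    if isinstance(space, gym.spaces.Discrete):
        return space.n, True
    if isinstance(space, gym.spaces.Box):
        return space.shape, False
    raise NotImplementedError('Unsupported space type: {}'.format(type(space)))
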
def run(Env,
        hps,
        sess,
        model,
        target_model=None,
        copy_op=None,
        global_model=None,
        global_target_model=None,
        global_copy_op=None,
        sync_op=None,
        sv=None):
    begin = overall_begin = time.time()
    print('Im worker {} and starting'.format(hps.job_index))

    # Database
    D = Database(data_size=hps.n_ep_collect * hps.max_ep_len *
                 hps.n_rep_target * 2,
                 batch_size=hps.batch_size,
                 entry_type='sequential')
    if hps.replay_size > 0:
        D_sars = Replay(max_size=hps.replay_size,
                        prioritized_frac=hps.prioritized_frac)
        #D_sars = Database(data_size=hps.replay_size, batch_size=hps.batch_size,entry_type='sequential')

    #saver = tf.train.Saver(max_to_keep=10)

    # Counters
    Copy_count = Interval_checker(hps.max_ep, hps.t_max, hps.store_copy_freq)
    Eval_count = Interval_checker(hps.max_ep, hps.t_max, hps.evaluate_freq)
    if hps.game == 'Toy' and hps.visualize:
        from rl.envs.toy import ToyPlotter, ToyDomainPlotter
        toy_plotter = ToyPlotter()
        toy_domain_plotter = ToyDomainPlotter(hps)
    elif 'Chain' in hps.game and hps.visualize:
        from rl.envs.chain import ChainPlotter, ChainDomainPlotter
        chain_plotter = ChainPlotter(Env.correct, n_plot=Env.n)
        chain_domain_plotter = ChainDomainPlotter(Env)
    # Discretizer = TransformDiscrete(hps.disc_n,hps.disc_min,hps.disc_max)
    # ema = EMA()
    if 'Chain' in hps.game:
        # necessary for plotting visitation counts correctly
        test_Env = make_game(hps.game)
        test_Env.correct = Env.correct
    else:
        test_Env = Env

    t = 0
    ep = 0
    log_count = 0
    time_result = TimedResults(
        n=6)  # Average Reward, Empirical Loss, Qsa_norm, grad_norm, loss
    #epsilon = AnnealLinear(hps.e_init,hps.e_final,int(hps.anneal_frac*hps.t_max))

    time_check = 't < hps.t_max' if not hps.distributed else '(t < hps.t_max) and not sv.should_stop()'
    running_mean = 0.0
    frac = 0.97

    while eval(time_check):  # train loop
        now = time.time()
        if hps.distributed:
            sess.run(sync_op)
        e = sess.run(model.epsilon)

        # Collect data
        t_new, t_, av_R, ep_R, data = collect_data(hps, model, sess, Env, e)

        # Process new (on-policy) data
        D.clear()  # clear training database
        D, D_sars, Qsa_norm, Qsa_sds = calculate_targets(
            data,
            D,
            D_sars,
            model,
            sess,
            hps,
            hps.lambda_,
            off_policy=hps.off_policy,
            target_model=target_model)

        # Fill up database from replay
        if D_sars.size > hps.min_replay_size:
            extra_needed = hps.replay_frac * D.size
            more = hps.batch_size - (D.size + extra_needed) % hps.batch_size
            if more > 0:
                extra_needed += more  # to make full minibatches from rollout
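                # Worked example (illustrative numbers, not from the source):
                # with hps.batch_size = 64, hps.replay_frac = 0.5 and
                # D.size = 100: extra_needed = 50, (100 + 50) % 64 = 22,
                # so more = 42 and extra_needed becomes 92, giving
                # 100 + 92 = 192 targets = exactly 3 full minibatches.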
            if (extra_needed > 0) and (extra_needed <= D_sars.size):
                # draw the samples
                sb, ab, rb, s1b, tb = D_sars.sample_random_batch(
                    extra_needed, True)  # sample from replay
                D, D_sars, Qsa_norm2 = calculate_targets_off_policy(
                    sb, ab, rb, s1b, tb, D, D_sars, model, sess, hps,
                    target_model)  # Process the extra data (off-policy)

        # Train
        gradient_norm, clipped_norm, loss = train(hps, model, sess, D)

        #        # Put new data in replay database
        #        if hps.replay_size > 0:
        #            for rollout in data:
        #                si,ai,ri,st = rollout.extract()
        #                tt = np.zeros(si.shape[0])
        #                tt[-1] = rollout.terminal
        #                si1 = np.concatenate([si[1:,],st[None,:]],axis=0)
        #                D_sars.store_from_array(si,ai,ri,si1,tt)

        #        # Send data to replay database
        #        if hps.replay_frac > 0:
        #            if hps.replay_size > 0:
        #                for j,rollout_data in enumerate(data):
        #                    rollout_data.seed = None # seed becomes irrelevant
        #                    D_replay.put(rollout_data,priority=prios)#-1.0*prios[j])
        #
        #            if hps.replay_size > 0 and len(D_replay.q)>(hps.replay_frac*len(data)*2):
        #                replay_data = D_replay.get(hps.replay_frac*len(data))
        #                # replay always off-policy (lambda_=0.0) for sample-based loss
        #                D,_ = calculate_targets(replay_data,D,model,sess,hps,lambda_=0.0,off_policy=hps.off_policy,target_model=target_model)
        #                new_prios = 0.0
        #                #D,_,new_prios = process(replay_data,D,model,sess,hps,target_model=target_model,ema=ema)
        #                train(hps,model,sess,D)
        #                D.clear()
        #                if hps.prioritized_frac>0.0:
        #                    for i in range(len(replay_data)):
        #                        D_replay.put(replay_data[i],priority=-1.0*new_prios[i])

        # Counters
        _, _, t, ep = sess.run(
            [model.inc_t, model.inc_ep, model.global_t, model.global_ep],
            feed_dict={
                model.t: t_new,
                model.ep: len(ep_R)
            })

        if hps.level == 'debug':
            memory_use = resource.getrusage(
                resource.RUSAGE_SELF).ru_maxrss / 1024
            print(
                'Process {} = worker {} loops in {} seconds, memory use {} Mb for replay size {}'
                .format(hps.agent_index, hps.job_index,
                        time.time() - now, memory_use, D_sars.size))
            #else:
            #    print('Memory use {} Mb'.format(memory_use))

        if (not hps.distributed) or (hps.distributed and hps.job_index == 0):
            # Store
            episode_reward, average_reward = np.mean(ep_R), np.mean(av_R)
            time_result.add([ep - (len(ep_R) / 2)], [average_reward],
                            [episode_reward], [np.mean(Qsa_norm)],
                            [gradient_norm], [loss], [int(t - t_new / 2)])
            #logger.info('Evaluated: Ep {:5d}, t {:5d}: Ep return {:4.2f}, Running mean {:4.2f}, Qsa_norm {:4.2f}, grad_norm {:4.2f}, clipped_norm {:4.2f}, loss {:4.2f},  episode_length {:3.1f}'.format(ep,t,episode_reward,running_mean,np.mean(Qsa_norm),gradient_norm,clipped_norm,loss,np.mean(t_)))
            ep_curve, av_R_curve, ep_R_curve, Qsa_norm_curve, grad_norm_curve, loss_curve, t_curve = time_result.extract(
            )
            downsample_store(hps,
                             ep_curve,
                             ep_R_curve,
                             av_R_curve,
                             Qsa_norm_curve,
                             grad_norm_curve,
                             loss_curve,
                             t_curve,
                             out_len=1000)

            # Logging
            if (t // hps.log_interval > log_count) and (not hps.slurm):
                running_mean = frac * running_mean + (1 - frac) * np.mean(ep_R)
                logger.info(
                    'Ep {:4d}, t {:5d}: Ep Return {:4.2f}, Run_mean {:4.2f}, Qsa_mean {:4.2f}, Qsa_sd {:4.2f}, grad_norm {:4.2f}, clip_norm {:4.2f}, loss {:4.2f}, ep_len {:3.1f}'
                    .format(ep, t, np.mean(ep_R), running_mean,
                            np.mean(Qsa_norm), Qsa_sds, gradient_norm,
                            clipped_norm, loss, np.mean(t_)))
                log_count += 1

            # Copy target network
            if Copy_count.should_update(t, ep):
                # Target net
                if hps.target_net:
                    if hps.distributed:
                        sess.run(global_copy_op)
                    else:
                        sess.run(copy_op)

            # for uncer == 'log_bay' --> set new uncertainty estimates
            # sample larger X and Y batch:
            if hps.uncer == 'log_bay':
                D.clear()
                log_bay_sample = np.min([D_sars.size, 3000])
                sb, ab, rb, s1b, tb = D_sars.sample_random_batch(
                    log_bay_sample, True)  # sample from replay
                D, D_sars, Qsa_norm2 = calculate_targets_off_policy(
                    sb, ab, rb, s1b, tb, D, D_sars, model, sess, hps,
                    target_model)  # Process the extra data (off-policy)
                s_batch, a_batch, y_batch = D.sample_random_batch(
                    log_bay_sample)
                seed = [np.random.randint(1e15),
                        np.random.randint(1e15)
                        ]  # just need to feed something new for each batch
                feed_dict = {
                    model.x: s_batch,
                    model.a: a_batch,
                    model.y: y_batch,
                    model.seed: seed
                }
                sess.run(model.lin_bay_update, feed_dict=feed_dict)

            # Visualize for Toy/Chain
            if hps.game == 'Toy' and hps.visualize:
                if Eval_count.should_update(t, ep):
                    toy_plotter.update(sess, model, hps, ep)
                    toy_domain_plotter.update(Env.counts)
            elif 'Chain' in hps.game and hps.visualize:
                if Eval_count.should_update(t, ep):
                    chain_plotter.update(sess, model, hps, ep)
                    chain_domain_plotter.update(Env.counts)
            elif hps.visualize:
                if Eval_count.should_update(t, ep):
                    episode_reward, average_reward = evaluate(
                        test_Env, hps, model, sess)

            # Store model
            #if rep == 0:
            # saver.save(sess,hps.result_dir+make_name('',hps.item1,item1,hps.item2,item2,hps.item3,item3,hps.item4,item4)+'model.ckpt')


#            if Eval_count.should_update(t,ep):
#                if hps.distributed:
#                    sess.run(sync_op)
#                episode_reward,average_reward = evaluate(test_Env,hps,model,sess)
#                time_result.add([ep - (len(ep_R)/2)],[average_reward],[episode_reward],[np.mean(ep_R)],[np.mean(Qsa_norm)],[gradient_norm],[loss])
#                logger.info('Evaluated: Ep {:5d}, t {:5d}: Ep return {:4.2f}, Running mean {:4.2f}, Qsa_norm {:4.2f}, grad_norm {:4.2f}, clipped_norm {:4.2f}, loss {:4.2f},  episode_length {:3.1f}'.format(ep,t,episode_reward,running_mean,np.mean(Qsa_norm),gradient_norm,clipped_norm,loss,np.mean(t_)))
#                ep_curve,av_R_curve,ep_R_curve,Qsa_norm_curve,grad_norm_curve,loss_curve = time_result.extract()
#                downsample_store(hps,ep_curve,ep_R_curve,av_R_curve,Qsa_norm_curve,grad_norm_curve,loss_curve,out_len=1000)

        if (ep > hps.max_ep) or (t > hps.t_max):
            if hps.distributed:
                sv.request_stop()
            break  # max number of episodes overrules max number of steps
        elapsed = (time.time() - overall_begin) / 60
        logger.info(
            'Reached {} episodes in {} timesteps, took {} hours = {} minutes'.
            format(ep, t, elapsed / 60, elapsed))

    if (not hps.distributed) or (hps.distributed and hps.job_index == 0):
        ep_curve, av_R_curve, ep_R_curve, Qsa_norm_curve, grad_norm_curve, loss_curve, t_curve = time_result.extract(
        )
        downsample_store(hps,
                         ep_curve,
                         ep_R_curve,
                         av_R_curve,
                         Qsa_norm_curve,
                         grad_norm_curve,
                         loss_curve,
                         t_curve,
                         out_len=1000)

        # Compute best result (last 10%)
        fraction = 0.1
        save_from_index = int(np.ceil(len(ep_curve) * fraction))
        ep_R_best = np.mean(ep_R_curve[save_from_index:])
        av_R_best = np.mean(av_R_curve[save_from_index:])
        np.savetxt(hps.result_dir + 'best_results.txt',
                   np.append(ep_R_best, av_R_best),
                   fmt='%.3g')
# --- Example #10 ---
def agent(game, n_ep, n_mcts, max_ep_len, lr, c, gamma, data_size, batch_size,
          temp, n_hidden_layers, n_hidden_units):
    """ Outer training loop """
    seed_best = None
    a_best = None
    episode_returns = []  # storage
    timepoints = []
    # environments
    env = make_game(game)
    is_atari = is_atari_game(env)
    mcts_env = make_game(game) if is_atari else None

    database = Database(max_size=data_size, batch_size=batch_size)
    model = Model(env=env,
                  lr=lr,
                  n_hidden_layers=n_hidden_layers,
                  n_hidden_units=n_hidden_units)
    t_total = 0  # total steps
    r_best = -np.Inf

    for ep in range(n_ep):
        start = time.time()
        s = env.reset()
        r2 = 0.0  # Total return counter
        a_store = []
        seed = np.random.randint(1e7)  # draw some env seed
        env.seed(seed)
        if is_atari:
            mcts_env.reset()
            mcts_env.seed(seed)

        mcts = MCTS(root_index=s,
                    model=model,
                    na=model.action_dim,
                    gamma=gamma)  # the object responsible for MCTS searches
        for t in range(max_ep_len):
            # MCTS step
            mcts.search(n_mcts=n_mcts, c=c, env=env,
                        mcts_env=mcts_env)  # perform a forward search
            state, pi, v = mcts.return_results(temp)  # extract the root output
            database.store((state, v, pi))

            # Make the true step
            a = np.random.choice(len(pi), p=pi)
            a_store.append(a)
            s1, r, terminal, _ = env.step(a)
            r2 += r
            # total number of environment steps (counts the mcts steps)
            t_total += n_mcts

            if terminal:
                break
            else:
                mcts.forward(a, s1)

        # Finished episode
        episode_returns.append(r2)  # store the total episode return
        timepoints.append(
            t_total)  # store the timestep count of the episode return
        store_safely(os.getcwd(), 'result', {
            'r': episode_returns,
            't': timepoints
        })

        if r2 > r_best:
            a_best = a_store
            seed_best = seed
            r_best = r2
        print(
            'Finished episode {}, total return: {}, total time: {} sec'.format(
                ep, np.round(r2, 2), np.round((time.time() - start), 1)))
        # Train
        database.reshuffle()
        for epoch in range(1):
            for sb, v_batch, pi_batch in database:
                model.train(sb, v_batch, pi_batch)
    # return results
    return episode_returns, timepoints, a_best, seed_best, r_best
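
# Hedged usage sketch (not part of the source): plotting the learning curve
# returned by agent(). timepoints counts simulated steps (n_mcts per true
# step), so it is a search-budget x-axis rather than wall-clock time.
import matplotlib.pyplot as plt

def plot_learning_curve(episode_returns, timepoints):
    plt.plot(timepoints, episode_returns)
    plt.xlabel('Environment steps (including MCTS simulations)')
    plt.ylabel('Episode return')
    plt.show()
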
def train_trpo(game, num_timesteps, eval_episodes, seed, horizon, out_dir='.', load_path=None, checkpoint_path_in=None,
               gamma=0.99, timesteps_per_batch=500, num_layers=0, num_hidden=32, checkpoint_freq=20, max_kl=0.01):
    start_time = time.time()
    clip = None
    dir = 'game'
    game_params = {}

    # Accept custom grid if the environment requires it
    if game == 'Taxi' or game == 'TaxiEasy':
        game_params['grid'] = args.grid
        game_params['box'] = True
    if game in ['RaceStrategy-v0', 'Cliff-v0']:
        game_params['horizon'] = horizon

    # env = Race(gamma=gamma, horizon=horizon, )
    # env_eval = Race(gamma=gamma, horizon=horizon)
    env = make_game(game, game_params)
    env_eval = make_game(game, game_params)
    directory_output = dir + '/trpo_' + str(num_layers) + '_' + str(num_hidden) + '_' + str(max_kl) + '/'

    def eval_policy_closure(**args):
        return eval_policy(env=env_eval, gamma=gamma, **args)

    tf.set_random_seed(seed)
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    time_str = str(start_time)
    if rank == 0:
        logger.configure(dir=out_dir + '/' + directory_output + '/logs',
                         format_strs=['stdout', 'csv'], suffix=time_str)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    network = mlp(num_hidden=num_hidden, num_layers=num_layers)


    optimized_policy = trpo_mpi.learn(network=network, env=env, eval_policy=eval_policy_closure, timesteps_per_batch=timesteps_per_batch,
                   max_kl=max_kl, cg_iters=10, cg_damping=1e-3,
                   total_timesteps=num_timesteps, gamma=gamma, lam=1.0, vf_iters=3, vf_stepsize=1e-4,
                   checkpoint_freq=checkpoint_freq,
                   checkpoint_dir_out=out_dir + '/' + directory_output + '/models/' + time_str + '/',
                   load_path=load_path, checkpoint_path_in=checkpoint_path_in,
                   eval_episodes=eval_episodes,
                   init_std=1,
                   trainable_variance=True,
                   trainable_bias=True,
                   clip=clip)

    s = env.reset()
    done = False

    states = []
    actions = []
    s = 0
    delta_state = 0.2
    while s < env.dim[0]:
        a, _, _, _ = optimized_policy.step([s])
        states.append(s)
        actions.append(a[0])
        s += delta_state
    s = env.reset()
    plt.plot(states, actions)
    plt.show()
    print('TOTAL TIME:', time.time() - start_time)
    print("Time taken: %f seg" % ((time.time() - start_time)))
    print("Time taken: %f hours" % ((time.time() - start_time) / 3600))

    env.close()
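
# Hedged usage sketch (illustrative values, not from the source): a minimal
# call to train_trpo(). Note that the Taxi-style games additionally read
# args.grid from the enclosing script, so a matching argparse namespace is
# assumed when those environments are used.
train_trpo(game='RaceStrategy-v0', num_timesteps=100000, eval_episodes=10,
           seed=0, horizon=20, out_dir='./trpo_out', gamma=0.99,
           timesteps_per_batch=500, num_layers=2, num_hidden=32,
           checkpoint_freq=20, max_kl=0.01)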