def __init__(self, n_particles, game, game_params, gamma):
    self.gamma = gamma
    self.envs = []
    self.states = []
    # Build one environment per particle, each with its own random seed
    for _ in range(n_particles):
        env = make_game(game, game_params)
        env.seed(np.random.randint(1e7))  # draw some Env seed
        s = env.reset()
        self.envs.append(env)
        self.states.append(s)
def train(game, n_ep, n_mcts, max_ep_len, lr, c, gamma, data_size, batch_size,
          temp, n_hidden_layers, n_hidden_units):
    """ Outer training loop """
    episode_returns = []  # storage
    timepoints = []

    # Environments
    if game == "teaching":
        env = TeachingEnv()
        bootstrap_last_state_value = False
    else:
        env = make_game(game)
        bootstrap_last_state_value = True
    is_atari = is_atari_game(env)
    mcts_env = make_game(game) if is_atari else None

    replay_buffer = ReplayBuffer(max_size=data_size, batch_size=batch_size)
    model = Model(env=env, lr=lr, n_hidden_layers=n_hidden_layers, n_hidden_units=n_hidden_units)
    t_total = 0  # total steps
    R_best = -np.Inf
    start_training = time.time()

    for ep in range(n_ep):
        start = time.time()
        s = env.reset()
        R = 0.0  # Total return counter
        a_store = []
        seed = np.random.randint(1e7)  # draw some Env seed
        env.seed(seed)
        if is_atari:
            mcts_env.reset()
            mcts_env.seed(seed)

        # the object responsible for MCTS searches
        mcts = MCTS(root_index=s, root=None, model=model, na=model.action_dim, gamma=gamma,
                    bootstrap_last_state_value=bootstrap_last_state_value)

        if game == "teaching":
            iterator = tqdm(range(env.t_max))
        else:
            iterator = range(max_ep_len)

        for _ in iterator:
            # MCTS step
            mcts.search(n_mcts=n_mcts, c=c, env=env, mcts_env=mcts_env)  # perform a forward search
            state, pi, v = mcts.return_results(temp)  # extract the root output
            replay_buffer.store((state, v, pi))

            # Make the true step
            a = np.random.choice(len(pi), p=pi)
            a_store.append(a)
            s1, r, terminal, _ = env.step(a)
            R += r
            t_total += n_mcts  # total number of environment steps (counts the mcts steps)

            if terminal:
                break
            else:
                mcts.forward(a, s1)

        # Finished episode
        episode_returns.append(R)  # store the total episode return
        timepoints.append(t_total)  # store the timestep count of the episode return
        store_safely({'R': episode_returns, 't': timepoints})

        if R > R_best:
            a_best = a_store
            seed_best = seed
            R_best = R

        print(f'Finished episode {ep}, total return: {np.round(R, 2)}, '
              f'time episode: {time.time() - start:.1f} sec, '
              f'total time since: {time.time() - start_training:.1f} sec')

        # Train
        replay_buffer.shuffle()
        for sb, vb, pib in replay_buffer:
            model.train_on_example(sb=sb, vb=vb, pib=pib)

    # Return results
    return episode_returns, timepoints, a_best, seed_best, R_best
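# NOTE (illustrative sketch, not the repo's actual ReplayBuffer): train() above only
# relies on three behaviours of the buffer -- store((state, v, pi)), shuffle(), and
# iteration over mini-batches of (states, values, policies). A minimal implementation
# with that interface could look like the following; the class name and the FIFO
# eviction policy are assumptions, not taken from this repository.
import random


class SimpleReplayBuffer:
    def __init__(self, max_size, batch_size):
        self.max_size = max_size
        self.batch_size = batch_size
        self.data = []

    def store(self, experience):
        # Drop the oldest entry once the buffer is full (FIFO eviction).
        if len(self.data) >= self.max_size:
            self.data.pop(0)
        self.data.append(experience)

    def shuffle(self):
        random.shuffle(self.data)

    def __iter__(self):
        # Yield (states, values, policies) one mini-batch at a time,
        # matching the `for sb, vb, pib in replay_buffer` loop in train().
        for i in range(0, len(self.data), self.batch_size):
            batch = self.data[i:i + self.batch_size]
            states, vs, pis = zip(*batch)
            yield states, vs, pis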
def agent(game,
          n_ep,
          n_mcts,
          max_ep_len,
          lr,
          c,
          gamma,
          data_size,
          batch_size,
          temp,
          n_hidden_layers,
          n_hidden_units,
          stochastic=False,
          eval_freq=-1,
          eval_episodes=100,
          alpha=0.6,
          out_dir='../',
          pre_process=None,
          visualize=False):
    ''' Outer training loop '''
    if pre_process is not None:
        pre_process()

    # tf.reset_default_graph()
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    episode_returns = []  # storage
    timepoints = []

    # Environments
    Env = make_game(game)
    is_atari = is_atari_game(Env)
    mcts_env = make_game(game) if is_atari else None
    online_scores = []
    offline_scores = []

    mcts_params = dict(gamma=gamma)
    if stochastic:
        mcts_params['alpha'] = alpha
        mcts_maker = MCTSStochastic
    else:
        mcts_maker = MCTS

    D = Database(max_size=data_size, batch_size=batch_size)
    model = Model(Env=Env, lr=lr, n_hidden_layers=n_hidden_layers, n_hidden_units=n_hidden_units)
    t_total = 0  # total steps
    R_best = -np.Inf

    with tf.Session() as sess:
        model.sess = sess
        sess.run(tf.global_variables_initializer())

        for ep in range(n_ep):

            ##### Policy evaluation step #####
            if eval_freq > 0 and ep % eval_freq == 0:  # and ep > 0
                print('Evaluating policy for {} episodes!'.format(eval_episodes))
                seed = np.random.randint(1e7)  # draw some Env seed
                Env.seed(seed)
                s = Env.reset()
                mcts = mcts_maker(root_index=s, root=None, model=model,
                                  na=model.action_dim, **mcts_params)
                env_wrapper = EnvEvalWrapper()
                env_wrapper.mcts = mcts
                starting_states = []

                def reset_env():
                    s = Env.reset()
                    env_wrapper.mcts = mcts_maker(root_index=s, root=None, model=model,
                                                  na=model.action_dim, **mcts_params)
                    starting_states.append(s)
                    if env_wrapper.curr_probs is not None:
                        env_wrapper.episode_probabilities.append(env_wrapper.curr_probs)
                    env_wrapper.curr_probs = []
                    return s

                def forward(a, s, r):
                    env_wrapper.mcts.forward(a, s, r)

                env_wrapper.reset = reset_env
                env_wrapper.step = lambda x: Env.step(x)
                env_wrapper.forward = forward
                env_wrapper.episode_probabilities = []
                env_wrapper.curr_probs = None

                def pi_wrapper(ob):
                    # avoid shadowing mcts_env inside the closure; pass None for non-Atari games
                    search_env = mcts_env if is_atari else None
                    env_wrapper.mcts.search(n_mcts=n_mcts, c=c, Env=Env, mcts_env=search_env)
                    state, pi, V = env_wrapper.mcts.return_results(temp=0)
                    # pi = model.predict_pi(s).flatten()
                    env_wrapper.curr_probs.append(pi)
                    a = np.argmax(pi)
                    return a

                rews, lens = eval_policy(pi_wrapper, env_wrapper,
                                         n_episodes=eval_episodes, verbose=True)
                offline_scores.append([np.min(rews), np.max(rews), np.mean(rews),
                                       np.std(rews), len(rews), np.mean(lens)])
                np.save(out_dir + '/offline_scores.npy', offline_scores)

            ##### Data collection and training #####
            start = time.time()
            s = Env.reset()
            R = 0.0  # Total return counter
            a_store = []
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            if is_atari:
                mcts_env.reset()
                mcts_env.seed(seed)
            if eval_freq > 0 and ep % eval_freq == 0:
                print("Collecting %d episodes" % eval_freq)
            # the object responsible for MCTS searches
            mcts = mcts_maker(root_index=s, root=None, model=model,
                              na=model.action_dim, **mcts_params)

            for t in range(max_ep_len):
                # MCTS step
                if not is_atari:
                    mcts_env = None
                mcts.search(n_mcts=n_mcts, c=c, Env=Env, mcts_env=mcts_env)  # perform a forward search
                if visualize:
                    mcts.visualize()
                state, pi, V = mcts.return_results(temp)  # extract the root output
                D.store((state, V, pi))

                # Make the true step
                a = np.random.choice(len(pi), p=pi)
                a_store.append(a)
                s1, r, terminal, _ = Env.step(a)
                R += r
                t_total += n_mcts  # total number of environment steps (counts the mcts steps)

                if terminal:
                    break
                else:
                    mcts.forward(a, s1, r)

            # Finished episode
            episode_returns.append(R)  # store the total episode return
            online_scores.append(R)
            timepoints.append(t_total)  # store the timestep count of the episode return
            store_safely(out_dir, 'result', {'R': episode_returns, 't': timepoints})
            np.save(out_dir + '/online_scores.npy', online_scores)

            # print('Finished episode {}, total return: {}, total time: {} sec'.format(
            #     ep, np.round(R, 2), np.round((time.time() - start), 1)))

            if R > R_best:
                a_best = a_store
                seed_best = seed
                R_best = R

            # Train
            D.reshuffle()
            try:
                for epoch in range(1):
                    for sb, Vb, pib in D:
                        model.train(sb, Vb, pib)
            except Exception as e:
                print("Something went wrong while training:", e)

            model.save(out_dir + 'model')

    # Return results
    return episode_returns, timepoints, a_best, seed_best, R_best, offline_scores
if not args.gpu:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Accept custom grid if the environment requires it
if args.game == 'Taxi' or args.game == 'TaxiEasy':
    game_params['grid'] = args.grid
    game_params['box'] = True
if args.game == 'RaceStrategy-v0':
    game_params['horizon'] = args.horizon

basedir = args.logdir
ts = str(time.time())
logdir = basedir + args.game + '/' + ts + '/'
if not os.path.exists(logdir):
    os.makedirs(logdir)

env = make_game(args.game, game_params)
state_dim = env.observation_space.shape[0]
discrete = True
try:
    action_dim = env.action_space.n
except AttributeError:
    # Continuous action spaces have no `n`; fall back to the Box dimensionality
    action_dim = env.action_space.high.shape[0]
    discrete = False

pi = load_policy(model_path='',
                 input_dim=state_dim,
                 output_dim=action_dim,
                 num_hidden=args.n_hidden_units,
                 num_layers=args.n_hidden_layers,
                 discrete=discrete,
                 beta=1.0)
def agent(game, n_ep, n_mcts, max_ep_len, lr, c, gamma, data_size, batch_size,
          temp, n_hidden_layers, n_hidden_units):
    ''' Outer training loop '''
    # tf.reset_default_graph()
    episode_returns = []  # storage
    timepoints = []

    # Environments
    Env = make_game(game)
    is_atari = is_atari_game(Env)
    mcts_env = make_game(game) if is_atari else None

    D = Database(max_size=data_size, batch_size=batch_size)
    model = Model(Env=Env, lr=lr, n_hidden_layers=n_hidden_layers, n_hidden_units=n_hidden_units)
    t_total = 0  # total steps
    R_best = -np.Inf

    with tf.Session() as sess:
        model.sess = sess
        sess.run(tf.global_variables_initializer())

        for ep in range(n_ep):
            start = time.time()
            s = Env.reset()
            R = 0.0  # Total return counter
            a_store = []
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            if is_atari:
                mcts_env.reset()
                mcts_env.seed(seed)
            # the object responsible for MCTS searches
            mcts = MCTS(root_index=s, root=None, model=model, na=model.action_dim, gamma=gamma)

            for t in range(max_ep_len):
                # MCTS step
                mcts.search(n_mcts=n_mcts, c=c, Env=Env, mcts_env=mcts_env)  # perform a forward search
                state, pi, V = mcts.return_results(temp)  # extract the root output
                D.store((state, V, pi))

                # Make the true step
                a = np.random.choice(len(pi), p=pi)
                a_store.append(a)
                s1, r, terminal, _ = Env.step(a)
                R += r
                t_total += n_mcts  # total number of environment steps (counts the mcts steps)

                if terminal:
                    break
                else:
                    mcts.forward(a, s1)

            # Finished episode
            episode_returns.append(R)  # store the total episode return
            timepoints.append(t_total)  # store the timestep count of the episode return
            store_safely(os.getcwd(), 'result', {'R': episode_returns, 't': timepoints})

            if R > R_best:
                a_best = a_store
                seed_best = seed
                R_best = R

            print('Finished episode {}, total return: {}, total time: {} sec'.format(
                ep, np.round(R, 2), np.round((time.time() - start), 1)))

            # Train
            D.reshuffle()
            for epoch in range(1):
                for sb, Vb, pib in D:
                    model.train(sb, Vb, pib)

    # Return results
    return episode_returns, timepoints, a_best, seed_best, R_best
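# NOTE (illustrative, the MCTS class itself is not shown here): the `temp` argument
# passed to mcts.return_results(temp) is the usual AlphaZero-style temperature that
# turns root visit counts into the policy target pi, which the loop above then samples
# with np.random.choice(len(pi), p=pi). A standalone sketch of that mapping follows;
# it mirrors the standard formulation, not necessarily this repo's exact implementation.
import numpy as np


def visit_counts_to_pi(counts, temp):
    """Convert root visit counts into a sampling distribution.

    temp -> 0 approaches greedy selection (argmax of counts); temp = 1 is
    proportional to the counts.
    """
    counts = np.asarray(counts, dtype=np.float64)
    if temp <= 0.0:
        pi = np.zeros_like(counts)
        pi[np.argmax(counts)] = 1.0
        return pi
    scaled = counts ** (1.0 / temp)
    return scaled / scaled.sum()


# Example: counts = [10, 50, 40] -> temp=1 gives [0.1, 0.5, 0.4]; temp -> 0 gives [0, 1, 0].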
def agent(game,
          n_ep,
          n_mcts,
          max_ep_len,
          lr,
          c,
          gamma,
          data_size,
          batch_size,
          temp,
          n_hidden_layers,
          n_hidden_units,
          stochastic=False,
          eval_freq=-1,
          eval_episodes=100,
          alpha=0.6,
          n_epochs=100,
          c_dpw=1,
          numpy_dump_dir='../',
          pre_process=None,
          visualize=False,
          game_params={},
          parallelize_evaluation=False,
          mcts_only=False,
          particles=0,
          show_plots=False,
          n_workers=1,
          use_sampler=False,
          budget=np.inf,
          unbiased=False,
          biased=False,
          max_workers=100,
          variance=False,
          depth_based_bias=False,
          scheduler_params=None,
          out_dir=None,
          render=False,
          second_version=False,
          third_version=False):

    visualizer = None

    # if particles:
    #     parallelize_evaluation = False  # Cannot run parallelized evaluation with particle filtering

    # Select the MCTS implementation to use
    if not mcts_only:
        from mcts import MCTS
        from mcts_dpw import MCTSStochastic
    elif particles:
        if unbiased:
            from particle_filtering.ol_uct import OL_MCTS
        elif biased:
            if second_version:
                from particle_filtering.pf_uct_2 import PFMCTS2 as PFMCTS
            elif third_version:
                from particle_filtering.pf_uct_3 import PFMCTS3 as PFMCTS
            else:
                from particle_filtering.pf_uct import PFMCTS
        else:
            from particle_filtering.pf_mcts_edo import PFMCTS
    else:
        from pure_mcts.mcts import MCTS
        from pure_mcts.mcts_dpw import MCTSStochastic

    if parallelize_evaluation:
        print("The evaluation will be parallel")

    parameter_list = {
        "game": game,
        "n_ep": n_ep,
        "n_mcts": n_mcts,
        "max_ep_len": max_ep_len,
        "lr": lr,
        "c": c,
        "gamma": gamma,
        "data_size": data_size,
        "batch_size": batch_size,
        "temp": temp,
        "n_hidden_layers": n_hidden_layers,
        "n_hidden_units": n_hidden_units,
        "stochastic": stochastic,
        "eval_freq": eval_freq,
        "eval_episodes": eval_episodes,
        "alpha": alpha,
        "n_epochs": n_epochs,
        "out_dir": numpy_dump_dir,
        "pre_process": pre_process,
        "visualize": visualize,
        "game_params": game_params,
        "n_workers": n_workers,
        "use_sampler": use_sampler,
        "variance": variance,
        "depth_based_bias": depth_based_bias,
        "unbiased": unbiased,
        "second_version": second_version,
        "third_version": third_version
    }
    if out_dir is not None:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        with open(os.path.join(out_dir, "parameters.txt"), 'w') as d:
            d.write(json.dumps(parameter_list))

    # logger = Logger(parameter_list, game, show=show_plots)

    if DEBUG_TAXI:
        from utils.visualization.taxi import TaxiVisualizer
        with open(game_params["grid"]) as f:
            m = f.readlines()
            matrix = []
            for r in m:
                row = []
                for ch in r.strip('\n'):
                    row.append(ch)
                matrix.append(row)
            visualizer = TaxiVisualizer(matrix)
            f.close()
            exit()

    ''' Outer training loop '''
    if pre_process is not None:
        pre_process()

    # numpy_dump_dir = logger.numpy_dumps_dir
    #
    # if not os.path.exists(numpy_dump_dir):
    #     os.makedirs(numpy_dump_dir)

    episode_returns = []  # storage
    timepoints = []

    # Environments
    if game == 'Trading-v0':
        game_params['save_dir'] = out_dir  # logger.save_dir
    Env = make_game(game, game_params)
    num_actions = Env.action_space.n

    sampler = None
    if use_sampler and not (unbiased or biased):

        def make_pi(action_space):
            def pi(s):
                return np.random.randint(low=0, high=action_space.n)
            return pi

        def make_env():
            return make_game(game, game_params)

        sampler = ParallelSampler(make_pi=make_pi,
                                  make_env=make_env,
                                  n_particles=particles,
                                  n_workers=n_workers,
                                  seed=10)

    is_atari = is_atari_game(Env)
    mcts_env = make_game(game, game_params) if is_atari else None
    online_scores = []
    offline_scores = []

    # Setup the parameters for generating the search environments
    if game == "RaceStrategy-v1":
        mcts_maker, mcts_params, c_dpw = load_race_agents_config(
            'envs/configs/race_strategy_full.json', gamma)
    else:
        mcts_params = dict(gamma=gamma)
        if particles:
            if not (biased or unbiased):
                mcts_params['particles'] = particles
                mcts_params['sampler'] = sampler
            elif biased:
                mcts_params['alpha'] = alpha
                mcts_maker = PFMCTS
            mcts_params['depth_based_bias'] = depth_based_bias
            if unbiased:
                mcts_params['variance'] = variance
                mcts_maker = OL_MCTS
        elif stochastic:
            mcts_params['alpha'] = alpha
            mcts_params['depth_based_bias'] = depth_based_bias
            mcts_maker = MCTSStochastic
        else:
            mcts_maker = MCTS

    # Prepare the database for storing training data to be sampled
    db = Database(max_size=data_size, batch_size=batch_size)

    # TODO extract dimensions to avoid allocating model
    # Setup the model
    model_params = {
        "Env": Env,
        "lr": lr,
        "n_hidden_layers": n_hidden_layers,
        "n_hidden_units": n_hidden_units,
        "joint_networks": True
    }
    model_wrapper = ModelWrapper(**model_params)

    t_total = 0  # total steps
    R_best = -np.Inf
    a_best = None
    seed_best = None

    # Variables for storing values to be plotted
    avgs = []
    stds = []

    # Run the episodes
    for ep in range(n_ep):

        if DEBUG_TAXI:
            visualizer.reset()

        ##### Policy evaluation step #####
        if eval_freq > 0 and ep % eval_freq == 0:  # and ep > 0
            print('--------------------------------\nEvaluating policy for {} episodes!'.format(eval_episodes))
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            s = Env.reset()

            if parallelize_evaluation:
                penv = None
                pgame = {
                    "game_maker": make_game,
                    "game": game,
                    "game_params": game_params
                }
            else:
                penv = Env
                pgame = None

            model_file = os.path.join(out_dir, "model.h5")
            # model_wrapper.save(model_file)

            if game == "RaceStrategy-v1":
                env_wrapper = RaceWrapper(s, mcts_maker, model_file, model_params, mcts_params,
                                          is_atari, n_mcts, budget, mcts_env, c_dpw, temp,
                                          env=penv, game_maker=pgame, mcts_only=mcts_only,
                                          scheduler_params=scheduler_params)
            else:
                env_wrapper = Wrapper(s, mcts_maker, model_file, model_params, mcts_params,
                                      is_atari, n_mcts, budget, mcts_env, c_dpw, temp,
                                      env=penv, game_maker=pgame, mcts_only=mcts_only,
                                      scheduler_params=scheduler_params)

            # Run the evaluation
            if parallelize_evaluation:
                total_reward, reward_per_timestep, lens, action_counts = \
                    parallelize_eval_policy(env_wrapper, n_episodes=eval_episodes, verbose=False,
                                            max_len=max_ep_len, max_workers=max_workers, out_dir=out_dir)
            else:
                total_reward, reward_per_timestep, lens, action_counts = \
                    eval_policy(env_wrapper, n_episodes=eval_episodes, verbose=False,
                                max_len=max_ep_len, visualize=visualize, out_dir=out_dir, render=render)

            # offline_scores.append([np.min(rews), np.max(rews), np.mean(rews), np.std(rews),
            #                        len(rews), np.mean(lens)])
            offline_scores.append([total_reward, reward_per_timestep, lens, action_counts])

            # np.save(numpy_dump_dir + '/offline_scores.npy', offline_scores)

            # Store and plot data
            avgs.append(np.mean(total_reward))
            stds.append(np.std(total_reward))

            # logger.plot_evaluation_mean_and_variance(avgs, stds)

        ##### Policy improvement step #####
        if not mcts_only:

            start = time.time()
            s = start_s = Env.reset()
            R = 0.0  # Total return counter
            a_store = []
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            if is_atari:
                mcts_env.reset()
                mcts_env.seed(seed)
            if eval_freq > 0 and ep % eval_freq == 0:
                print("\nCollecting %d episodes" % eval_freq)
            # the object responsible for MCTS searches
            mcts = mcts_maker(root_index=s, root=None, model=model_wrapper,
                              na=model_wrapper.action_dim, **mcts_params)

            print("\nPerforming MCTS steps\n")

            ep_steps = 0
            start_targets = []

            for st in range(max_ep_len):

                print_step = max(max_ep_len // 10, 1)  # guard against max_ep_len < 10
                if st % print_step == 0:
                    print('Step ' + str(st + 1) + ' of ' + str(max_ep_len))

                # MCTS step
                if not is_atari:
                    mcts_env = None
                mcts.search(n_mcts=n_mcts, c=c, Env=Env, mcts_env=mcts_env)  # perform a forward search
                if visualize:
                    mcts.visualize()
                state, pi, V = mcts.return_results(temp)  # extract the root output

                # Save targets for starting state to debug
                if np.array_equal(start_s, state):
                    if DEBUG:
                        print("Pi target for starting state:", pi)
                    start_targets.append((V, pi))

                db.store((state, V, pi))

                # Make the true step
                a = np.random.choice(len(pi), p=pi)
                a_store.append(a)
                s1, r, terminal, _ = Env.step(a)

                # Perform command line visualization if necessary
                if DEBUG_TAXI:
                    olds, olda = copy.deepcopy(s1), copy.deepcopy(a)
                    visualizer.visualize_taxi(olds, olda)
                    print("Reward:", r)

                R += r
                t_total += n_mcts  # total number of environment steps (counts the mcts steps)
                ep_steps = st + 1

                if terminal:
                    break  # Stop the episode if we encounter a terminal state
                else:
                    mcts.forward(a, s1, r)  # Otherwise proceed

            # Finished episode
            if DEBUG:
                print("Train episode return:", R)
                print("Train episode actions:", a_store)
            episode_returns.append(R)  # store the total episode return
            online_scores.append(R)
            timepoints.append(t_total)  # store the timestep count of the episode return
            # store_safely(numpy_dump_dir, '/result', {'R': episode_returns, 't': timepoints})
            # np.save(numpy_dump_dir + '/online_scores.npy', online_scores)

            print('Finished episode {} in {} steps, total return: {}, total time: {} sec'.format(
                ep, ep_steps, np.round(R, 2), np.round((time.time() - start), 1)))

            # Plot the online return over training episodes
            # logger.plot_online_return(online_scores)

            if R > R_best:
                a_best = a_store
                seed_best = seed
                R_best = R

            print()

        # Train only if the model has to be used
        if not mcts_only:

            # Train
            try:
                print("\nTraining network")
                ep_V_loss = []
                ep_pi_loss = []

                for _ in range(n_epochs):
                    # Reshuffle the dataset at each epoch
                    db.reshuffle()

                    batch_V_loss = []
                    batch_pi_loss = []

                    # Batch training
                    for sb, Vb, pib in db:
                        if DEBUG:
                            print("sb:", sb)
                            print("Vb:", Vb)
                            print("pib:", pib)
                        loss = model_wrapper.train(sb, Vb, pib)
                        batch_V_loss.append(loss[1])
                        batch_pi_loss.append(loss[2])

                    ep_V_loss.append(mean(batch_V_loss))
                    ep_pi_loss.append(mean(batch_pi_loss))

                # Plot the loss over training epochs
                # logger.plot_loss(ep, ep_V_loss, ep_pi_loss)

            except Exception as e:
                print("Something went wrong while training:", e)

            # model.save(out_dir + 'model')

            # Plot the loss over different episodes
            # logger.plot_training_loss_over_time()

            pi_start = model_wrapper.predict_pi(start_s)
            V_start = model_wrapper.predict_V(start_s)
            print("\nStart policy: ", pi_start)
            print("Start value:", V_start)

            # logger.log_start(ep, pi_start, V_start, start_targets)

    # Return results
    if use_sampler:
        sampler.close()
    return episode_returns, timepoints, a_best, seed_best, R_best, offline_scores
def make_env():
    return make_game(game, game_params)
def __init__(self, hps, cluster_spec=None):
    self.hps = hps

    if hps.distributed:
        ps_servers = ["/job:ps/task:{}".format(ps_num) for ps_num in range(hps.n_ps)]
        config = tf.ConfigProto(
            device_filters=ps_servers + ['/job:ps', '/job:worker/task:{}'.format(hps.job_index)],
            # device_count={"CPU": hps.num_agents, "GPU": 0},
            # allow_soft_placement=True,
            inter_op_parallelism_threads=2,
            intra_op_parallelism_threads=1,
            log_device_placement=False)
        server = tf.train.Server(cluster_spec,
                                 config=config,
                                 job_name="worker",
                                 task_index=hps.job_index)
    else:
        server, config = None, None

    # Environment
    Env = make_game(hps.game)
    try:
        hps.max_ep_len = Env._max_episode_steps - 1
        logger.info('Set max steps per episode to {}'.format(hps.max_ep_len))
    except AttributeError:
        logger.info('Environment does not have a time limit wrapper, '
                    'using {} max steps per episode'.format(hps.max_ep_len))

    hps.action_dim, hps.action_discrete = check_space(Env.action_space)
    hps.state_dim, hps.state_discrete = check_space(Env.observation_space)
    if not hps.action_discrete:
        raise ValueError('Continuous action space not implemented')

    # Seed
    seed = np.random.randint(1e8) + 7 * hps.job_index
    np.random.seed(seed)
    random.seed(seed)

    # Network
    model, target_model, copy_op, global_model, global_target_model, global_copy_op, sync_op = \
        make_network(hps, cluster_spec)
    logger.info('Total number of trainable parameters {} million'.format(
        model_description(model.var_list) / 1e6))

    if not hps.distributed:
        with tf.Session() as sess:
            logger.info('Initializing ..')
            sess.run(tf.global_variables_initializer())
            run(Env, hps, sess, model, target_model, copy_op, global_model,
                global_target_model, global_copy_op, sync_op)
    else:
        # make init op
        global_init_op, local_init_op, global_variables = make_init_ops()
        sv = tf.train.Supervisor(
            is_chief=(hps.job_index == 0),
            init_op=global_init_op,
            local_init_op=local_init_op,
            ready_op=tf.report_uninitialized_variables(tf.global_variables()),
            # logdir=hps.result_dir,
        )
        # print('Im worker {} before the supervisor'.format(hps.job_index))
        with sv.managed_session(server.target, config=config) as sess, sess.as_default():
            # print('Im worker {} after the supervisor'.format(hps.job_index))
            sess.run(sync_op)
            run(Env, hps, sess, model, target_model, copy_op, global_model,
                global_target_model, global_copy_op, sync_op, sv)
        sv.stop()
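# NOTE (illustrative sketch): the constructor above expects a `cluster_spec` compatible
# with tf.train.Server(cluster_spec, job_name="worker", task_index=...). In TF1 such a
# spec can be built from host lists; the localhost addresses, ports, and helper name
# below are placeholder assumptions, not part of this repository.
import tensorflow as tf


def make_cluster_spec(n_ps, n_workers, base_port=2222):
    """Build a ClusterSpec with n_ps parameter servers and n_workers workers on localhost."""
    ps_hosts = ['localhost:{}'.format(base_port + i) for i in range(n_ps)]
    worker_hosts = ['localhost:{}'.format(base_port + n_ps + i) for i in range(n_workers)]
    return tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})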
def run(Env,
        hps,
        sess,
        model,
        target_model=None,
        copy_op=None,
        global_model=None,
        global_target_model=None,
        global_copy_op=None,
        sync_op=None,
        sv=None):
    begin = overall_begin = time.time()
    print("I'm worker {} and starting".format(hps.job_index))

    # Database
    D = Database(data_size=hps.n_ep_collect * hps.max_ep_len * hps.n_rep_target * 2,
                 batch_size=hps.batch_size,
                 entry_type='sequential')
    if hps.replay_size > 0:
        D_sars = Replay(max_size=hps.replay_size, prioritized_frac=hps.prioritized_frac)
        # D_sars = Database(data_size=hps.replay_size, batch_size=hps.batch_size, entry_type='sequential')

    # saver = tf.train.Saver(max_to_keep=10)

    # Counters
    Copy_count = Interval_checker(hps.max_ep, hps.t_max, hps.store_copy_freq)
    Eval_count = Interval_checker(hps.max_ep, hps.t_max, hps.evaluate_freq)

    if hps.game == 'Toy' and hps.visualize:
        from rl.envs.toy import ToyPlotter, ToyDomainPlotter
        toy_plotter = ToyPlotter()
        toy_domain_plotter = ToyDomainPlotter(hps)
    elif 'Chain' in hps.game and hps.visualize:
        from rl.envs.chain import ChainPlotter, ChainDomainPlotter
        chain_plotter = ChainPlotter(Env.correct, n_plot=Env.n)
        chain_domain_plotter = ChainDomainPlotter(Env)

    # Discretizer = TransformDiscrete(hps.disc_n, hps.disc_min, hps.disc_max)
    # ema = EMA()

    if 'Chain' in hps.game:
        # necessary for plotting visitation counts correctly
        test_Env = make_game(hps.game)
        test_Env.correct = Env.correct
    else:
        test_Env = Env

    t = 0
    ep = 0
    log_count = 0
    time_result = TimedResults(n=6)  # Average Reward, Empirical Loss, Qsa_norm, grad_norm, loss
    # epsilon = AnnealLinear(hps.e_init, hps.e_final, int(hps.anneal_frac * hps.t_max))
    time_check = 't < hps.t_max' if not hps.distributed else '(t < hps.t_max) and not sv.should_stop()'
    running_mean = 0.0
    frac = 0.97

    while eval(time_check):  # train loop
        now = time.time()
        if hps.distributed:
            sess.run(sync_op)
        e = sess.run(model.epsilon)

        # Collect data
        t_new, t_, av_R, ep_R, data = collect_data(hps, model, sess, Env, e)

        # Process new (on-policy) data
        D.clear()  # clear training database
        D, D_sars, Qsa_norm, Qsa_sds = calculate_targets(data, D, D_sars, model, sess, hps,
                                                         hps.lambda_,
                                                         off_policy=hps.off_policy,
                                                         target_model=target_model)

        # Fill up database from replay
        if D_sars.size > hps.min_replay_size:
            extra_needed = hps.replay_frac * D.size
            more = hps.batch_size - (D.size + extra_needed) % hps.batch_size
            if more > 0:
                extra_needed += more  # to make full minibatches from rollout
            if (extra_needed > 0) and (extra_needed <= D_sars.size):
                # draw the samples from replay
                sb, ab, rb, s1b, tb = D_sars.sample_random_batch(extra_needed, True)
                # Process the extra data (off-policy)
                D, D_sars, Qsa_norm2 = calculate_targets_off_policy(sb, ab, rb, s1b, tb, D, D_sars,
                                                                    model, sess, hps, target_model)

        # Train
        gradient_norm, clipped_norm, loss = train(hps, model, sess, D)

        # # Put new data in replay database
        # if hps.replay_size > 0:
        #     for rollout in data:
        #         si, ai, ri, st = rollout.extract()
        #         tt = np.zeros(si.shape[0])
        #         tt[-1] = rollout.terminal
        #         si1 = np.concatenate([si[1:, ], st[None, :]], axis=0)
        #         D_sars.store_from_array(si, ai, ri, si1, tt)

        # # Send data to replay database
        # if hps.replay_frac > 0:
        #     if hps.replay_size > 0:
        #         for j, rollout_data in enumerate(data):
        #             rollout_data.seed = None  # seed becomes irrelevant
        #             D_replay.put(rollout_data, priority=prios)  # -1.0*prios[j])
        #
        #     if hps.replay_size > 0 and len(D_replay.q) > (hps.replay_frac * len(data) * 2):
        #         replay_data = D_replay.get(hps.replay_frac * len(data))
        #         # replay always off-policy (lambda_=0.0) for sample-based loss
        #         D, _ = calculate_targets(replay_data, D, model, sess, hps, lambda_=0.0,
        #                                  off_policy=hps.off_policy, target_model=target_model)
        #         new_prios = 0.0
        #         # D, _, new_prios = process(replay_data, D, model, sess, hps, target_model=target_model, ema=ema)
        #         train(hps, model, sess, D)
        #         D.clear()
        #         if hps.prioritized_frac > 0.0:
        #             for i in range(len(replay_data)):
        #                 D_replay.put(replay_data[i], priority=-1.0 * new_prios[i])

        # Counters
        _, _, t, ep = sess.run(
            [model.inc_t, model.inc_ep, model.global_t, model.global_ep],
            feed_dict={
                model.t: t_new,
                model.ep: len(ep_R)
            })

        if hps.level == 'debug':
            memory_use = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
            print('Process {} = worker {} loops in {} seconds, memory use {} Mb for replay size {}'.format(
                hps.agent_index, hps.job_index, time.time() - now, memory_use, D_sars.size))
        # else:
        #     print('Memory use {} Mb'.format(memory_use))

        if (not hps.distributed) or (hps.distributed and hps.job_index == 0):
            # Store
            episode_reward, average_reward = np.mean(ep_R), np.mean(av_R)
            time_result.add([ep - (len(ep_R) / 2)], [average_reward], [episode_reward],
                            [np.mean(Qsa_norm)], [gradient_norm], [loss], [int(t - t_new / 2)])
            # logger.info('Evaluated: Ep {:5d}, t {:5d}: Ep return {:4.2f}, Running mean {:4.2f}, Qsa_norm {:4.2f}, grad_norm {:4.2f}, clipped_norm {:4.2f}, loss {:4.2f}, episode_length {:3.1f}'.format(ep, t, episode_reward, running_mean, np.mean(Qsa_norm), gradient_norm, clipped_norm, loss, np.mean(t_)))
            ep_curve, av_R_curve, ep_R_curve, Qsa_norm_curve, grad_norm_curve, loss_curve, t_curve = \
                time_result.extract()
            downsample_store(hps, ep_curve, ep_R_curve, av_R_curve, Qsa_norm_curve,
                             grad_norm_curve, loss_curve, t_curve, out_len=1000)

        # Logging
        if (t // hps.log_interval > log_count) and (not hps.slurm):
            running_mean = frac * running_mean + (1 - frac) * np.mean(ep_R)
            logger.info(
                'Ep {:4d}, t {:5d}: Ep Return {:4.2f}, Run_mean {:4.2f}, Qsa_mean {:4.2f}, Qsa_sd {:4.2f}, '
                'grad_norm {:4.2f}, clip_norm {:4.2f}, loss {:4.2f}, ep_len {:3.1f}'.format(
                    ep, t, np.mean(ep_R), running_mean, np.mean(Qsa_norm), Qsa_sds,
                    gradient_norm, clipped_norm, loss, np.mean(t_)))
            log_count += 1

        # Copy target network
        if Copy_count.should_update(t, ep):
            # Target net
            if hps.target_net:
                if hps.distributed:
                    sess.run(global_copy_op)
                else:
                    sess.run(copy_op)

            # for uncer == 'log_bay' --> set new uncertainty estimates
            # sample larger X and Y batch:
            if hps.uncer == 'log_bay':
                D.clear()
                log_bay_sample = np.min([D_sars.size, 3000])
                sb, ab, rb, s1b, tb = D_sars.sample_random_batch(log_bay_sample, True)  # sample from replay
                # Process the extra data (off-policy)
                D, D_sars, Qsa_norm2 = calculate_targets_off_policy(sb, ab, rb, s1b, tb, D, D_sars,
                                                                    model, sess, hps, target_model)
                s_batch, a_batch, y_batch = D.sample_random_batch(log_bay_sample)
                # just need to feed something new for each batch
                seed = [np.random.randint(1e15), np.random.randint(1e15)]
                feed_dict = {
                    model.x: s_batch,
                    model.a: a_batch,
                    model.y: y_batch,
                    model.seed: seed
                }
                sess.run(model.lin_bay_update, feed_dict=feed_dict)

        # Visualize for Toy/Chain
        if hps.game == 'Toy' and hps.visualize:
            if Eval_count.should_update(t, ep):
                toy_plotter.update(sess, model, hps, ep)
                toy_domain_plotter.update(Env.counts)
        elif 'Chain' in hps.game and hps.visualize:
            if Eval_count.should_update(t, ep):
                chain_plotter.update(sess, model, hps, ep)
                chain_domain_plotter.update(Env.counts)
        elif hps.visualize:
            if Eval_count.should_update(t, ep):
                episode_reward, average_reward = evaluate(test_Env, hps, model, sess)

        # Store model
        # if rep == 0:
        #     saver.save(sess, hps.result_dir + make_name('', hps.item1, item1, hps.item2, item2,
        #                hps.item3, item3, hps.item4, item4) + 'model.ckpt')

        # if Eval_count.should_update(t, ep):
        #     if hps.distributed:
        #         sess.run(sync_op)
        #     episode_reward, average_reward = evaluate(test_Env, hps, model, sess)
        #     time_result.add([ep - (len(ep_R) / 2)], [average_reward], [episode_reward], [np.mean(ep_R)],
        #                     [np.mean(Qsa_norm)], [gradient_norm], [loss])
        #     logger.info('Evaluated: Ep {:5d}, t {:5d}: Ep return {:4.2f}, Running mean {:4.2f}, Qsa_norm {:4.2f}, grad_norm {:4.2f}, clipped_norm {:4.2f}, loss {:4.2f}, episode_length {:3.1f}'.format(ep, t, episode_reward, running_mean, np.mean(Qsa_norm), gradient_norm, clipped_norm, loss, np.mean(t_)))
        #     ep_curve, av_R_curve, ep_R_curve, Qsa_norm_curve, grad_norm_curve, loss_curve = time_result.extract()
        #     downsample_store(hps, ep_curve, ep_R_curve, av_R_curve, Qsa_norm_curve, grad_norm_curve, loss_curve, out_len=1000)

        if (ep > hps.max_ep) or (t > hps.t_max):
            if hps.distributed:
                sv.request_stop()
            break  # max number of episodes overrules max number of steps

    elapsed = (time.time() - overall_begin) / 60
    logger.info('Reached {} episodes in {} timesteps, took {} hours = {} minutes'.format(
        ep, t, elapsed / 60, elapsed))

    if (not hps.distributed) or (hps.distributed and hps.job_index == 0):
        ep_curve, av_R_curve, ep_R_curve, Qsa_norm_curve, grad_norm_curve, loss_curve, t_curve = \
            time_result.extract()
        downsample_store(hps, ep_curve, ep_R_curve, av_R_curve, Qsa_norm_curve,
                         grad_norm_curve, loss_curve, t_curve, out_len=1000)

        # Compute best result (last 10%)
        fraction = 0.1
        save_from_index = int(np.ceil(len(ep_curve) * fraction))
        ep_R_best = np.mean(ep_R_curve[save_from_index:])
        av_R_best = np.mean(av_R_curve[save_from_index:])
        np.savetxt(hps.result_dir + 'best_results.txt', np.append(ep_R_best, av_R_best), fmt='%.3g')
def agent(game, n_ep, n_mcts, max_ep_len, lr, c, gamma, data_size, batch_size,
          temp, n_hidden_layers, n_hidden_units):
    """ Outer training loop """
    seed_best = None
    a_best = None
    episode_returns = []  # storage
    timepoints = []

    # environments
    env = make_game(game)
    is_atari = is_atari_game(env)
    mcts_env = make_game(game) if is_atari else None

    database = Database(max_size=data_size, batch_size=batch_size)
    model = Model(env=env, lr=lr, n_hidden_layers=n_hidden_layers, n_hidden_units=n_hidden_units)
    t_total = 0  # total steps
    r_best = -np.Inf

    for ep in range(n_ep):
        start = time.time()
        s = env.reset()
        r2 = 0.0  # Total return counter
        a_store = []
        seed = np.random.randint(1e7)  # draw some env seed
        env.seed(seed)
        if is_atari:
            mcts_env.reset()
            mcts_env.seed(seed)

        # the object responsible for MCTS searches
        mcts = MCTS(root_index=s, model=model, na=model.action_dim, gamma=gamma)

        for t in range(max_ep_len):
            # MCTS step
            mcts.search(n_mcts=n_mcts, c=c, env=env, mcts_env=mcts_env)  # perform a forward search
            state, pi, v = mcts.return_results(temp)  # extract the root output
            database.store((state, v, pi))

            # Make the true step
            a = np.random.choice(len(pi), p=pi)
            a_store.append(a)
            s1, r, terminal, _ = env.step(a)
            r2 += r
            # total number of environment steps (counts the mcts steps)
            t_total += n_mcts

            if terminal:
                break
            else:
                mcts.forward(a, s1)

        # Finished episode
        episode_returns.append(r2)  # store the total episode return
        timepoints.append(t_total)  # store the timestep count of the episode return
        store_safely(os.getcwd(), 'result', {'r': episode_returns, 't': timepoints})

        if r2 > r_best:
            a_best = a_store
            seed_best = seed
            r_best = r2

        print('Finished episode {}, total return: {}, total time: {} sec'.format(
            ep, np.round(r2, 2), np.round((time.time() - start), 1)))

        # Train
        database.reshuffle()
        for epoch in range(1):
            for sb, v_batch, pi_batch in database:
                model.train(sb, v_batch, pi_batch)

    # return results
    return episode_returns, timepoints, a_best, seed_best, r_best
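# NOTE (usage sketch): a minimal way to invoke the training loop above, assuming it is
# importable from the surrounding module. The game name and hyperparameter values are
# illustrative placeholders, not values prescribed by this repository.
if __name__ == '__main__':
    returns, timesteps, best_actions, best_seed, best_return = agent(
        game='CartPole-v0',        # any game name accepted by make_game()
        n_ep=100,                  # number of training episodes
        n_mcts=32,                 # MCTS simulations per real environment step
        max_ep_len=200,
        lr=1e-3,
        c=1.5,                     # UCT exploration constant
        gamma=1.0,
        data_size=10000,
        batch_size=32,
        temp=1.0,
        n_hidden_layers=2,
        n_hidden_units=64)
    print('Best return seen during training:', best_return)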
def train_trpo(game, num_timesteps, eval_episodes, seed, horizon, out_dir='.',
               load_path=None, checkpoint_path_in=None, gamma=0.99,
               timesteps_per_batch=500, num_layers=0, num_hidden=32,
               checkpoint_freq=20, max_kl=0.01):
    start_time = time.time()
    clip = None
    dir = 'game'
    game_params = {}

    # Accept custom grid if the environment requires it
    if game == 'Taxi' or game == 'TaxiEasy':
        game_params['grid'] = args.grid
        game_params['box'] = True
    if game in ['RaceStrategy-v0', 'Cliff-v0']:
        game_params['horizon'] = horizon

    # env = Race(gamma=gamma, horizon=horizon)
    # env_eval = Race(gamma=gamma, horizon=horizon)
    env = make_game(game, game_params)
    env_eval = make_game(game, game_params)

    directory_output = (dir + '/trpo_' + str(num_layers) + '_' + str(num_hidden) + '_' + str(max_kl) + '/')

    def eval_policy_closure(**args):
        return eval_policy(env=env_eval, gamma=gamma, **args)

    tf.set_random_seed(seed)
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    time_str = str(start_time)
    if rank == 0:
        logger.configure(dir=out_dir + '/' + directory_output + '/logs',
                         format_strs=['stdout', 'csv'],
                         suffix=time_str)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    network = mlp(num_hidden=num_hidden, num_layers=num_layers)

    optimized_policy = trpo_mpi.learn(network=network,
                                      env=env,
                                      eval_policy=eval_policy_closure,
                                      timesteps_per_batch=timesteps_per_batch,
                                      max_kl=max_kl,
                                      cg_iters=10,
                                      cg_damping=1e-3,
                                      total_timesteps=num_timesteps,
                                      gamma=gamma,
                                      lam=1.0,
                                      vf_iters=3,
                                      vf_stepsize=1e-4,
                                      checkpoint_freq=checkpoint_freq,
                                      checkpoint_dir_out=out_dir + '/' + directory_output + '/models/' + time_str + '/',
                                      load_path=load_path,
                                      checkpoint_path_in=checkpoint_path_in,
                                      eval_episodes=eval_episodes,
                                      init_std=1,
                                      trainable_variance=True,
                                      trainable_bias=True,
                                      clip=clip)

    # Sweep the (one-dimensional) state space and plot the learned policy
    s = env.reset()
    done = False
    states = []
    actions = []
    s = 0
    delta_state = 0.2
    while s < env.dim[0]:
        a, _, _, _ = optimized_policy.step([s])
        states.append(s)
        actions.append(a[0])
        s += delta_state

    s = env.reset()
    plt.plot(states, actions)
    plt.show()

    print('TOTAL TIME:', time.time() - start_time)
    print("Time taken: %f sec" % (time.time() - start_time))
    print("Time taken: %f hours" % ((time.time() - start_time) / 3600))
    env.close()
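# NOTE (usage sketch): an illustrative call to train_trpo() above. The game name,
# budget, and network sizes are placeholder assumptions, not defaults taken from
# this repository.
if __name__ == '__main__':
    train_trpo(game='RaceStrategy-v0',
               num_timesteps=100000,
               eval_episodes=10,
               seed=0,
               horizon=20,
               out_dir='.',
               gamma=0.99,
               timesteps_per_batch=500,
               num_layers=2,
               num_hidden=32,
               checkpoint_freq=20,
               max_kl=0.01)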