def learn(policy, env, expert, seed, total_timesteps=int(40e6), gamma=0.99, lam=0.95,
          log_interval=1, nprocs=32, nsteps=20, nstack=1, ent_coef=0.01, vf_coef=0.5,
          vf_fisher_coef=1.0, lr=0.05, max_grad_norm=0.5, kfac_clip=0.001,
          save_interval=100, lrschedule='linear'):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    # NOTE: nsteps is hard-coded to 1024 here, so the nsteps argument above is unused.
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps,
                               nprocs=nprocs, nsteps=1024, nstack=nstack,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    # Behavioral cloning: fit the policy to batches of expert (observation, action) pairs.
    for _ in range(10000):
        e_obs, e_actions, _, _ = expert.get_next_batch(1024)
        # Convert per-agent one-hot expert actions into integer action indices.
        e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
        lld_loss = model.clone(e_obs, e_a)
        print(lld_loss)
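# The `expert` object above is assumed to expose get_next_batch(batch_size),
# returning per-agent lists of observations and one-hot actions (the
# np.argmax(..., axis=1) call implies one-hot encoding), plus two unused slots.
# A minimal sketch of such a buffer, with hypothetical names, might look like:
import numpy as np

class ExpertDataset:
    """Hypothetical expert-trajectory buffer matching the interface used above."""

    def __init__(self, obs, actions):
        # obs, actions: lists of arrays, one entry per agent;
        # actions[k] has shape (N, n_actions) and is one-hot encoded.
        self.obs = obs
        self.actions = actions
        self.n = obs[0].shape[0]

    def get_next_batch(self, batch_size):
        # Sample a random minibatch; the last two return values mirror the
        # slots unpacked but ignored in learn() above.
        idx = np.random.randint(0, self.n, size=batch_size)
        batch_obs = [o[idx] for o in self.obs]
        batch_act = [a[idx] for a in self.actions]
        return batch_obs, batch_act, None, None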
def create_env():
    env = make_env.make_env(env_id)
    env.seed(10)
    # env = bench.Monitor(env, '/tmp/', allow_early_resets=True)
    set_global_seeds(10)
    return env
def create_env():
    env = make_env.make_env('simple_spread')
    env.seed(3)
    env = bench.Monitor(env, '/tmp/', allow_early_resets=True)
    set_global_seeds(3)
    return env
lower_nb_actions = 2
TIME_STEPS = 10
TBD_total = 56
TBD_left = 38
TBD_straight = 56
TBD_right = 38
LSTM_HIDDEN = 128
ENCODE_LSTM_HIDDEN = 64

# Get the environment and extract the number of actions.
curr_path = os.path.dirname(__file__)
env = EndtoendEnv(setting_path=curr_path + '/LasVSim/Scenario/Highway_endtoend/',
                  plan_horizon=30, history_len=TIME_STEPS)
env = ObservationWrapper(env)
set_global_seeds(11)  # set of seeds: [42, 0, 1, 2]
env.seed(11)
# nb_actions = env.action_space.n


def build_models():
    # Build the upper model. NOTE: upper_nb_actions is referenced below but is
    # expected to be defined alongside lower_nb_actions above.
    upper_model = Sequential(name='upper_model')
    upper_model.add(layers.LSTM(128, input_shape=(TIME_STEPS, TBD_total)))  # expects a 3D tensor [batch, timesteps, input_dim]
    upper_model.add(layers.Dense(upper_nb_actions, activation='relu'))

    # Build the lower actor's shared part -----------------------------------
    actor_lstm_model = Sequential(name='shared_actor_lstm_model')
def learn(policy, env, expert, seed, total_timesteps=int(40e6), gamma=0.99, lam=0.95,
          log_interval=1, nprocs=4, nsteps=20, nstack=1, ent_coef=0.01, vf_coef=0.5,
          vf_fisher_coef=1.0, lr=0.05, max_grad_norm=0.5, kfac_clip=0.001,
          save_interval=1000, lrschedule='linear', batch_size=1024):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps,
                               nprocs=nprocs, nsteps=batch_size, nstack=nstack,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    tstart = time.time()
    coord = tf.train.Coordinator()
    # enqueue_threads = [q_runner.create_threads(model.sess, coord=coord, start=True)
    #                    for q_runner in model.q_runner]
    print("-------------------------------")
    print(total_timesteps // batch_size + 1)
    print("-------------------------------")
    for update in range(total_timesteps // batch_size + 1):
        e_obs, e_actions, _, _ = expert.get_next_batch(batch_size)
        # Convert per-agent one-hot expert actions into integer action indices.
        e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
        nseconds = time.time() - tstart
        fps = int((update * batch_size) / nseconds)
        lld_loss = model.clone(e_obs, e_a)[0]
        # print(lld_loss)
        if update % log_interval == 0 or update == 1:
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * batch_size)
            logger.record_tabular("fps", fps)
            for k in range(model.num_agents):
                logger.record_tabular("lld_loss %d" % k, float(lld_loss[k]))
            logger.dump_tabular()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
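# A minimal, self-contained check of the one-hot -> index conversion used in
# both learn() loops above: np.argmax over axis=1 recovers integer action ids
# from per-agent one-hot action batches. The toy arrays here are illustrative
# only, not data from the project.
import numpy as np

e_actions = [np.eye(5)[[0, 3, 1]],   # agent 0: actions 0, 3, 1 one-hot encoded
             np.eye(5)[[2, 2, 4]]]   # agent 1: actions 2, 2, 4 one-hot encoded
e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
print(e_a)  # [array([0, 3, 1]), array([2, 2, 4])]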