def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    # NOTE: by default build_env would also set normalize_observations=True.
    # The build_env arguments may need to be adjusted once the train/eval
    # details are finalized.
    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)

    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env, osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    # learn() is a generator here: it yields the model once per iteration
    # so checkpoints can be written as training progresses.
    iters = 0
    for model in learn(env=env,
                       env_id=env_id,
                       eval_env=eval_env,
                       make_eval_env=lambda: build_env(
                           args, normalize_ob=False, is_eval=True),
                       seed=seed,
                       total_timesteps=total_timesteps,
                       sil_update=args.sil_update,
                       sil_loss=args.sil_loss,
                       **alg_kwargs):
        if args.store_ckpt:
            save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
            model.save(save_path)
            if isinstance(env, VecNormalize):
                rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
                with open(rms_path, 'wb') as f:
                    rms = (env.ob_rms, env.ret_rms)
                    pickle.dump(rms, f)
            logger.log('Saved model {}'.format(iters + 1))
        iters += 1

    return model, env
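# The checkpoint loop above assumes learn() is a generator that yields the
# current model once per training iteration. A minimal runnable sketch of
# that contract (hypothetical names; not the actual baselines implementation):
class DummyModel:
    def save(self, path):
        print('saved to', path)

def generator_learn(env=None, total_timesteps=3, nbatch=1, **kwargs):
    model = DummyModel()
    for _ in range(total_timesteps // nbatch):
        # collect rollouts and update the model here
        yield model

# Each yielded reference can be checkpointed exactly like the loop above:
for it, model in enumerate(generator_learn(total_timesteps=3)):
    model.save('model-{}'.format(it))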
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)

    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env, osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    model = learn(env=env,
                  seed=seed,
                  total_timesteps=total_timesteps,
                  sil_update=args.sil_update,
                  sil_loss=args.sil_loss,
                  **alg_kwargs)

    return model, env
def main():
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args()
    extra_args = parse_cmdline_kwargs(unknown_args)

    # Load a trained model only (no training steps), then play the policy.
    args.num_timesteps = 0
    args.play = True
    args.env = 'YamaXRealForwardWalk-v0'

    model, env = train(args, extra_args)
    env.close()

    env = build_env(args)
    obs = env.reset()

    def initialize_placeholders(nlstm=128, **kwargs):
        # Zero-initialized recurrent state and "done" mask for model.step().
        return np.zeros((args.num_env or 1, 2 * nlstm)), np.zeros((1))

    state, dones = initialize_placeholders(**extra_args)
    while True:
        actions, _, state, _ = model.step(obs, S=state, M=dones)
        obs, _, done, _ = env.step(actions)
        env.render()
        done = done.any() if isinstance(done, np.ndarray) else done
        if done:
            obs = env.reset()

    env.close()  # unreachable: the loop above never breaks
def train_copos(args):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(args.log_path)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(args.log_path, format_strs=[])

    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return CompatibleMlpPolicy(name=name,
                                   ob_space=ob_space,
                                   ac_space=ac_space,
                                   hid_size=32,
                                   num_hid_layers=2)

    set_global_seeds(workerseed)
    env = build_env(args, normalize_ob=True)
    #env = gym.make(args.env)
    #env.seed(workerseed)

    timesteps_per_batch = 10000
    #timesteps_per_batch = 2048

    beta = -1
    if beta < 0:
        nr_episodes = int(args.num_timesteps) // timesteps_per_batch
        # Automatically compute beta from the initial policy entropy and
        # the number of iterations.
        tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space)
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1,) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob})
        beta = 2 * entropy / nr_episodes
        print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes))
        print("Automatically set beta: " + str(beta))

    copos_mpi.learn(env, policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    epsilon=0.01, beta=beta,
                    cg_iters=10, cg_damping=0.1,
                    max_timesteps=int(args.num_timesteps),
                    gamma=0.99, lam=0.98,
                    vf_iters=5, vf_stepsize=1e-3)
    env.close()
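# A minimal self-contained sketch of the beta heuristic above, assuming a
# unit-variance diagonal Gaussian policy (the helper name and values below
# are hypothetical, not from these snippets). For such a policy the entropy
# per action dimension is 0.5 * log(2 * pi * e) ~= 1.419.
import numpy as np

def initial_beta(action_dim, num_timesteps, timesteps_per_batch):
    """Replicates beta = 2 * entropy / nr_episodes for a unit-variance
    diagonal Gaussian policy."""
    entropy = action_dim * 0.5 * np.log(2.0 * np.pi * np.e)
    nr_episodes = num_timesteps // timesteps_per_batch
    return 2.0 * entropy / nr_episodes

# e.g. a 6-D action space, 1e6 steps, batches of 10000:
# entropy ~= 8.51, nr_episodes = 100, so beta ~= 0.17
print(initial_beta(6, int(1e6), 10000))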
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)
    #workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    #set_global_seeds(workerseed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False, normalize_ret=False)

    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env, osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    beta = -1
    if beta < 0:
        nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
        # Beta could be computed automatically from the initial entropy and
        # the number of iterations; here it is overridden with a constant 0.
        policy = build_policy(
            env, alg_kwargs['network'], value_network='copy',
            normalize_observations=alg_kwargs['normalize_observations'],
            copos=True)
        ob = observation_placeholder(env.observation_space)

        sess = U.single_threaded_session()
        sess.__enter__()
        with tf.variable_scope("tmp_pi"):
            tmp_pi = policy(observ_placeholder=ob)
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1,) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})
        #beta = 2 * entropy / nr_episodes
        beta = 0
        print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes))
        print("Constant beta: " + str(beta))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    model = learn(env=env, seed=seed, beta=beta,
                  total_timesteps=total_timesteps, **alg_kwargs)

    return model, env
def setUp(env, alg, load_path):
    args = Bunch({
        'env': env,
        'alg': alg,
        'num_timesteps': 0,
        'seed': None,
        'num_env': 1,
        'network': None
    })
    extra_args = {'load_path': load_path}

    # With num_timesteps=0, train() just restores the model from load_path.
    model, env = train(args, extra_args)
    env.close()
    env = build_env(args, extra_args)
    return env, model
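# Hedged usage sketch for setUp() above; the environment id, algorithm name,
# and checkpoint path are hypothetical placeholders.
env, model = setUp('HalfCheetah-v2', 'ppo2', './checkpoints/model-final')
obs = env.reset()
# model.step() follows the baselines convention of returning
# (actions, values, states, neglogpacs).
actions, values, states, neglogpacs = model.step(obs)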
def main():
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args()
    args.num_env = 1
    extra_args = parse_cmdline_kwargs(unknown_args)

    model, env = train(args, extra_args)
    env.close()

    logger.log("Running trained model")
    env = build_env(args)
    if not args.play:
        # Record a video instead of rendering to the screen.
        ts = time.gmtime()
        directory = time.strftime("./render/%s", ts)
        logger.log("Output video to directory:", directory)
        env.envs = [gym.wrappers.Monitor(env.envs[0], directory=directory)]

    obs = env.reset()

    def initialize_placeholders(nlstm=128, **kwargs):
        return np.zeros((args.num_env, 2 * nlstm)), np.zeros((1))

    state, dones = initialize_placeholders(**extra_args)

    NUM_VIDEO = 1
    while True:
        actions, _, state, _ = model.step(obs, S=state, M=dones)
        obs, _, done, _ = env.step(actions)
        if args.play:
            env.render()
        done = done.any() if isinstance(done, np.ndarray) else done
        if done:
            NUM_VIDEO -= 1
            if NUM_VIDEO <= 0:
                break
            obs = env.reset()

    env.close()
    'save_interval': 20,
    'log_interval': 1,
    'save_path': save_path,
    'model_load_path': model_load_path,
    'seed': 0,
    'reward_scale': 1,
    'flatten_dict_observations': True,
    'transfer_weights': False
}
args = SimpleNamespace(**args_dict)

# Prepare the environment and learning algorithm
env_type, env_id = get_env_type(args.env)
learn = get_learn_function(args.alg)
alg_kwargs = get_learn_function_defaults(args.alg, env_type)
env = build_env(args)
alg_kwargs['network'] = args.network

# The path where we will store the results of this experiment
full_path = args.save_path + '/' + args.env + '-' + args.alg

# Make the folders in which we will store the checkpoints, models
# and epoch results
if not os.path.exists(full_path):
    os.makedirs(full_path)
    os.makedirs(full_path + '/checkpoints')

print("About to start learning model")
model = learn(env=env,
              seed=args.seed,
              total_timesteps=args.total_timesteps,
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)

    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env, osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    beta = -1
    if beta < 0:
        nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
        # Beta could be computed automatically from the initial entropy and
        # the number of iterations; here it is overridden with a constant 0.
        policy = build_policy(
            env, alg_kwargs['network'], value_network='copy',
            normalize_observations=alg_kwargs['normalize_observations'],
            copos=True)
        ob = observation_placeholder(env.observation_space)

        sess = U.single_threaded_session()
        sess.__enter__()
        with tf.variable_scope("tmp_pi"):
            tmp_pi = policy(observ_placeholder=ob)
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1,) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})
        #beta = 2 * entropy / nr_episodes
        beta = 0
        print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes))
        print("Constant beta: " + str(beta))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    iters = 0
    for model in learn(env=env,
                       env_id=env_id,
                       eval_env=eval_env,
                       make_eval_env=lambda: build_env(
                           args, normalize_ob=False, is_eval=True),
                       seed=seed,
                       beta=beta,
                       total_timesteps=total_timesteps,
                       **alg_kwargs):
        if args.store_ckpt:
            save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
            model.save(save_path)
            if isinstance(env, VecNormalize):
                rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
                with open(rms_path, 'wb') as f:
                    rms = (env.ob_rms, env.ret_rms)
                    pickle.dump(rms, f)
            logger.log('Saved model {}'.format(iters + 1))
        iters += 1

    return model, env
    'noptepochs': 10,
    'save_interval': 20,
    'log_interval': 1,
    'save_path': save_path,
    'model_load_path': model_load_path,
    'seed': 0,
    'reward_scale': 1,
    'flatten_dict_observations': True
}
second_env_args = SimpleNamespace(**second_env_args_dict)

# Prepare the environment and learning algorithm
env_type, env_id = get_env_type(args.env)
learn = get_learn_function(args.alg)
alg_kwargs = get_learn_function_defaults(args.alg, env_type)
env = build_env(args)

# Prepare the second environment if needed
second_env = build_env(second_env_args)
alg_kwargs['network'] = args.network

# The path where we will store the results of this experiment
full_path = args.save_path + '/' + args.env + '-' + args.alg

# Make the folders in which we will store the checkpoints, models
# and epoch results
if not os.path.exists(full_path):
    os.makedirs(full_path)
    os.makedirs(full_path + '/checkpoints')

print("About to start learning model")
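# The two snippets above build their args objects from plain dicts.
# types.SimpleNamespace gives a dict attribute access, so it can stand in
# for parsed argparse args; a quick self-contained illustration (the field
# values are placeholders, not from these snippets):
from types import SimpleNamespace

args = SimpleNamespace(env='CartPole-v1', alg='ppo2', seed=0)
print(args.env, args.alg, args.seed)  # CartPole-v1 ppo2 0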