def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=1024,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       gamma=0.99, lam=0.97,  # gamma: 0.995 as default
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError
def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir, log_dir=log_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=1024,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       gamma=0.995, lam=0.97,
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError
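# Usage sketch for the train() variant above, wiring up the stock
# openai/baselines GAIL pieces (mlp_policy, TransitionClassifier, Mujoco_Dset).
# The environment id, expert path, hidden sizes, and hyperparameter values are
# illustrative assumptions, not settings taken from this code.
import gym
from baselines.common import tf_util as U
from baselines.gail import mlp_policy
from baselines.gail.adversary import TransitionClassifier
from baselines.gail.dataset.mujoco_dset import Mujoco_Dset


def _example_run():
    U.make_session(num_cpu=1).__enter__()
    env = gym.make('Hopper-v2')

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=100, num_hid_layers=2)

    dataset = Mujoco_Dset(expert_path='data/deterministic.trpo.Hopper.0.00.npz',
                          traj_limitation=-1)
    reward_giver = TransitionClassifier(env, hidden_size=100, entcoeff=1e-3)
    train(env, seed=0, policy_fn=policy_fn, reward_giver=reward_giver,
          dataset=dataset, algo='trpo', g_step=3, d_step=1, policy_entcoeff=0.0,
          num_timesteps=5e6, save_per_iter=100, checkpoint_dir='checkpoint',
          log_dir='log', pretrained=False, BC_max_iter=0, task_name='gail')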
def irl(env, trajectories, discount, seed, log_dir, *,
        tf_cfg, policy_cfg=None, gan_cfg=None, train_cfg=None):
    dataset = _make_dset(trajectories)

    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)

        policy_fn = _policy_factory(policy_cfg)

        gan_kwargs = {'hidden_size': 100}
        if gan_cfg is not None:
            gan_kwargs.update(gan_cfg)
        reward_giver = TransitionClassifier(env, **gan_kwargs)

        train_kwargs = {
            'pretrained': False,
            'BC_max_iter': 10000,
            'g_step': 3,  # number of steps to train policy in each epoch
            'd_step': 1,  # number of steps to train discriminator in each epoch
            'entcoeff': 0,  # entropy coefficient of policy
            'max_timesteps': 5e6,  # total number of training timesteps
            'timesteps_per_batch': 1024,
            'max_kl': 0.01,
            'cg_iters': 10,
            'cg_damping': 0.1,
            'lam': 0.97,
            'vf_iters': 5,
            'vf_stepsize': 1e-3,
        }
        if train_cfg is not None:
            train_kwargs.update(train_cfg)

        pretrained_weight = None
        bc_max_iter = train_kwargs.pop('BC_max_iter')
        if train_kwargs['pretrained']:
            # Pretrain with behavior cloning
            pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                     max_iters=bc_max_iter)

        ckpt_dir = osp.join(log_dir, 'checkpoints')
        with tf.Session(config=tf_cfg) as sess:
            trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank=0,
                           pretrained_weight=pretrained_weight,
                           ckpt_dir=ckpt_dir, log_dir=log_dir,
                           gamma=discount, save_per_iter=100,
                           task_name='gail', **train_kwargs)

            policy_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'pi')
            policy_serialised = sess.run(policy_vars)

    return None, policy_serialised
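# Hypothetical invocation sketch for irl() above. The trajectory format expected
# by _make_dset and the available policy_cfg keys are assumptions; the train_cfg
# keys shown are ones irl() itself provides defaults for.
import tensorflow as tf


def _example_irl(env, trajectories):
    tf_cfg = tf.ConfigProto(allow_soft_placement=True)
    tf_cfg.gpu_options.allow_growth = True  # allocate GPU memory on demand
    train_cfg = {
        'max_timesteps': 1e6,  # shrink the 5e6 default budget for a quick run
        'g_step': 3,
        'd_step': 1,
    }
    # GAIL recovers no standalone reward model, hence the None first element.
    _, policy_serialised = irl(env, trajectories, discount=0.99, seed=0,
                               log_dir='logs/gail', tf_cfg=tf_cfg,
                               train_cfg=train_cfg)
    return policy_serialised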
def train(env, seed, writer, policy_fn, med_fn, dataset,
          g_step, m_step, e_step, inner_iters, pi_stepsize, med_stepsize,
          num_timesteps, save_per_iter, checkpoint_dir, log_dir,
          pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env.seed(workerseed)

    learner.learn(env, policy_fn, med_fn, dataset, pretrained, pretrained_weight,
                  g_step, m_step, e_step, inner_iters,
                  save_per_iter, checkpoint_dir, log_dir,
                  med_stepsize=med_stepsize, pi_stepsize=pi_stepsize,
                  max_timesteps=num_timesteps,
                  timesteps_per_batch=1024,
                  task_name=task_name, writer=writer)
def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    # Object positions are initialized identically on every reset, so the
    # debug prints below are unnecessary.
    # logger.log("all positions: \n", env.reset())  # print the object positions
    # logger.log("all positions: \n", env.reset())  # print again to check they match

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        # env.seed(workerseed)  # removed since SawyerLift doesn't implement seed()

        # Adjusted TRPO hyperparameters for the scaled environment
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir, log_dir=log_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=15000,  # raised from 10000 for the scaled env
                       max_kl=0.001, cg_iters=50, cg_damping=0.1,  # max_kl was 0.01, cg_iters was 10, cg_damping stays 0.1
                       gamma=0.995, lam=0.97,  # lam unchanged at 0.97
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError
def train(env, seed, policy_fn, reward_giver, dataset, algo, g_step, d_step,
          policy_entcoeff, num_timesteps, save_per_iter, checkpoint_dir,
          pretrained, bc_max_iter, task_name=None):
    """
    Train GAIL on MuJoCo.

    :param env: (Gym Environment) the environment
    :param seed: (int) the initial random seed
    :param policy_fn: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator
    :param reward_giver: (TransitionClassifier) the reward predictor from observation and action
    :param dataset: (MujocoDset) the dataset manager
    :param algo: (str) the algorithm type (only 'trpo' is supported)
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param policy_entcoeff: (float) the weight of the entropy loss for the policy
    :param num_timesteps: (int) the number of timesteps to run
    :param save_per_iter: (int) the number of iterations before saving
    :param checkpoint_dir: (str) the location for saving checkpoints
    :param pretrained: (bool) use a pretrained behavior clone
    :param bc_max_iter: (int) the maximum number of training iterations for the behavior clone
    :param task_name: (str) the name of the task (can be None)
    """
    pretrained_weight = None
    if pretrained and (bc_max_iter > 0):
        # Pretrain with behavior cloning
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=bc_max_iter)

    if algo == 'trpo':
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01,
                       cg_iters=10, gamma=0.995, lam=0.97, entcoeff=policy_entcoeff,
                       cg_damping=0.1, vf_stepsize=1e-3, vf_iters=5,
                       max_timesteps=num_timesteps, pretrained_weight=pretrained_weight,
                       reward_giver=reward_giver, expert_dataset=dataset, rank=rank,
                       save_per_iter=save_per_iter, ckpt_dir=checkpoint_dir,
                       g_step=g_step, d_step=d_step, task_name=task_name)
    else:
        raise NotImplementedError
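# Minimal sketch of a policy_fn generator matching the docstring signature
# (name, ob_space, ac_space, reuse) -> MLPPolicy. The mlp_policy import path
# follows openai/baselines; the fork this file targets may use a different
# module or accept extra keyword arguments (e.g. sess), so treat this as an
# assumption rather than the confirmed API.
from baselines.gail import mlp_policy


def make_policy_fn(hid_size=100, num_hid_layers=2):
    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=hid_size,
                                    num_hid_layers=num_hid_layers)
    return policy_fn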
def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, log_dir, pretrained, BC_max_iter, rew_lambda,
          mix_reward=False, task_name=None, frame_stack=1):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    # Set up for MPI seed
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env.seed(workerseed)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir, log_dir=log_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=env.env.horizon * 5,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       gamma=0.995, lam=0.97,
                       vf_iters=5, vf_stepsize=1e-4,
                       mix_reward=mix_reward, r_lambda=rew_lambda,
                       task_name=task_name, frame_stack=frame_stack)
    elif algo == 'ppo':
        from baselines.gail import ppo_mpi
        ppo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                      pretrained=pretrained, pretrained_weight=pretrained_weight,
                      g_step=g_step, d_step=d_step,
                      entcoeff=policy_entcoeff,
                      max_timesteps=num_timesteps,
                      ckpt_dir=checkpoint_dir, log_dir=log_dir,
                      save_per_iter=save_per_iter,
                      timesteps_per_batch=env.env.horizon,  # env.env.horizon * 5
                      gamma=0.995, lam=0.97,
                      clip_param=0.2,
                      optim_epochs=50, optim_stepsize=1e-4, optim_batchsize=100,  # optimization hypers
                      mix_reward=mix_reward, r_lambda=rew_lambda,
                      task_name=task_name, frame_stack=frame_stack)
    else:
        raise NotImplementedError
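# Illustrative call of the mixed-reward variant above. The blending itself
# (r_lambda weighting the environment reward against the discriminator reward)
# lives inside trpo_mpi/ppo_mpi and is assumed, not shown, here; all values
# below are placeholder hyperparameters, not settings from this code.
def _example_mixed(env, policy_fn, reward_giver, dataset):
    train(env, seed=0, policy_fn=policy_fn, reward_giver=reward_giver,
          dataset=dataset, algo='ppo', g_step=3, d_step=1, policy_entcoeff=0.0,
          num_timesteps=5e6, save_per_iter=100, checkpoint_dir='checkpoint',
          log_dir='log', pretrained=False, BC_max_iter=0,
          rew_lambda=0.5, mix_reward=True, frame_stack=4)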
def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,  # no pretraining by default
                       g_step=g_step,  # number of steps to train policy in each epoch (default 3)
                       d_step=d_step,  # number of steps to train discriminator in each epoch (default 1)
                       entcoeff=policy_entcoeff,  # default 0
                       max_timesteps=num_timesteps,  # total number of training timesteps (default 5e6)
                       ckpt_dir=checkpoint_dir,
                       log_dir=log_dir,  # directory for log files (default 'log')
                       save_per_iter=save_per_iter,  # save the model every N iterations (default 100)
                       timesteps_per_batch=1024,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       gamma=0.995, lam=0.97,
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError