def train(env_id, num_timesteps, seed):
    from baselines.ppo_pnp import mlp_policy, pposgd_simple, interactive_ppo, ppo_gail
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=3)

    # env_id and seed are currently unused: the Jaco pick-and-place env is hard-coded.
    env = JacoEnv(64, 64, 1, 1.0)  # make_mujoco_env(env_id, seed)
    dataset = Mujoco_Dset(expert_path='data/pnp_demo.npz', traj_limitation=-1)
    reward_giver = TransitionClassifier(env, 100, entcoeff=1e-3)
    ppo_gail.learn(env, policy_fn, reward_giver, dataset,
                   max_timesteps=num_timesteps,
                   timesteps_per_actorbatch=2048,
                   clip_param=0.2, entcoeff=0.0,
                   optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                   gamma=0.99, lam=0.95, schedule='linear',
                   )
    env.close()
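# --- Illustrative usage (not from the original source) ---
# A minimal sketch of how train() above might be driven from a command line;
# the flag names and defaults are assumptions. Note that env_id and seed are
# accepted but not used by this version of train().
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='JacoPickAndPlace')  # hypothetical name
    parser.add_argument('--num_timesteps', type=int, default=int(1e6))
    parser.add_argument('--seed', type=int, default=0)
    cli_args = parser.parse_args()
    train(cli_args.env_id, cli_args.num_timesteps, cli_args.seed)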
def main(args):
    """
    Start training the model.

    :param args: (argparse.Namespace) the training arguments
    """
    with tf_util.make_session(num_cpu=1):
        set_global_seeds(args.seed)
        env = gym.make(args.env_id)

        def policy_fn(name, ob_space, ac_space, reuse=False, placeholders=None, sess=None):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                        sess=sess, hid_size=args.policy_hidden_size, num_hid_layers=2,
                                        placeholders=placeholders)

        env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)
        task_name = get_task_name(args)
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name)
        args.log_dir = os.path.join(args.log_dir, task_name)

        if args.task == 'train':
            dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
            reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                                entcoeff=args.adversary_entcoeff)
            train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step,
                  args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
                  args.pretrained, args.bc_max_iter, task_name)
        elif args.task == 'evaluate':
            runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024, number_trajs=10,
                   stochastic_policy=args.stochastic_policy, save=args.save_sample)
        else:
            raise NotImplementedError
        env.close()
def main(args):
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                    hid_size=args.policy_hidden_size, num_hid_layers=2)

    # env = bench.Monitor(env, logger.get_dir() and
    #                     osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)

    if args.log_dir != Log_dir:
        log_dir = osp.join(Log_dir, args.log_dir)
        save_dir = osp.join(Checkpoint_dir, args.log_dir)
    else:
        log_dir = Log_dir
        save_dir = Checkpoint_dir
    logger.configure(dir=log_dir, log_suffix=task_name, format_strs=["log", "stdout"])

    if args.task == 'train':
        log_dir, data_dir, policy_model_dir, __, _ = get_dirs(args)
        print("log_dir: ", log_dir)
        print("model_dir: ", policy_model_dir)
        # exp_data = get_exp_data2(osp.join(osp.dirname(osp.realpath(__file__)),
        #                                   "../../data/mujoco/%s.pkl" % args.env_id))
        data_path = data_dir + '/expert_sample'  # eric version
        exp_data = get_exp_data(data_path, args.num_trajs)
        dataset = Dataset(exp_data)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.num_iters, args.save_per_iter, save_dir,
              args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024, number_trajs=10,
               stochastic_policy=args.stochastic_policy, save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    # env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.CNNPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                    hid_size=args.policy_hidden_size, num_hid_layers=2)

    # env = bench.Monitor(env, logger.get_dir() and
    #                     osp.join(logger.get_dir(), "monitor.json"))
    env = make_vec_env(args.env_id, 'atari', 1, args.seed,
                       wrapper_kwargs={
                           'clip_rewards': False,
                           'episode_life': False,
                       })
    env = VecFrameStack(env, 4)
    # env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = LMDB_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024, number_trajs=10,
               stochastic_policy=args.stochastic_policy, save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    env = DelayRewardWrapper(env, args.delay_freq, args.max_path_length)
    eval_env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                    hid_size=args.policy_hidden_size, num_hid_layers=2,
                                    gaussian_fixed_var=args.gaussian_fixed_var)

    env.seed(args.seed)
    eval_env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env, eval_env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step,
              args.d_step, args.policy_entcoeff, args.save_per_iter, args.checkpoint_dir, args.log_dir,
              args.pretrained, args.BC_max_iter, args.num_epochs, args.evaluation_freq,
              args.timesteps_per_batch, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path, timesteps_per_batch=args.timesteps_per_batch,
               number_trajs=10, stochastic_policy=args.stochastic_policy, save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def irl(env, trajectories, discount, seed, log_dir, *,
        tf_cfg, policy_cfg=None, gan_cfg=None, train_cfg=None):
    dataset = _make_dset(trajectories)

    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)

        policy_fn = _policy_factory(policy_cfg)

        gan_kwargs = {'hidden_size': 100}
        if gan_cfg is not None:
            gan_kwargs.update(gan_cfg)
        reward_giver = TransitionClassifier(env, **gan_kwargs)

        train_kwargs = {
            'pretrained': False,
            'BC_max_iter': 10000,
            'g_step': 3,  # number of policy (generator) updates per iteration
            'd_step': 1,  # number of discriminator updates per iteration
            'entcoeff': 0,  # entropy coefficient of the policy
            'max_timesteps': 5e6,  # total number of environment timesteps for training
            'timesteps_per_batch': 1024,
            'max_kl': 0.01,
            'cg_iters': 10,
            'cg_damping': 0.1,
            'lam': 0.97,
            'vf_iters': 5,
            'vf_stepsize': 1e-3,
        }
        if train_cfg is not None:
            train_kwargs.update(train_cfg)

        pretrained_weight = None
        bc_max_iter = train_kwargs.pop('BC_max_iter')
        if train_kwargs['pretrained']:
            # Pretrain with behavior cloning
            pretrained_weight = behavior_clone.learn(env, policy_fn, dataset, max_iters=bc_max_iter)

        ckpt_dir = osp.join(log_dir, 'checkpoints')
        with tf.Session(config=tf_cfg) as sess:
            trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank=0,
                           pretrained_weight=pretrained_weight,
                           ckpt_dir=ckpt_dir, log_dir=log_dir,
                           gamma=discount, save_per_iter=100,
                           task_name='gail', **train_kwargs)
            policy_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'pi')
            policy_serialised = sess.run(policy_vars)

    return None, policy_serialised
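# --- Illustrative usage (not from the original source) ---
# A minimal sketch of calling irl() above. The environment, config values, and
# the `expert_trajectories` placeholder (demonstrations in whatever format
# _make_dset expects) are assumptions, not part of the original module.
import gym
import tensorflow as tf

env = gym.make('Hopper-v2')
tf_cfg = tf.ConfigProto(device_count={'GPU': 0})      # CPU-only session config
gan_cfg = {'hidden_size': 128, 'entcoeff': 1e-3}      # forwarded to TransitionClassifier
train_cfg = {'max_timesteps': 1e5, 'g_step': 1}       # overrides the train_kwargs defaults
_, policy_params = irl(env, expert_trajectories, discount=0.99, seed=0,
                       log_dir='/tmp/gail_irl', tf_cfg=tf_cfg,
                       gan_cfg=gan_cfg, train_cfg=train_cfg)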
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                    hid_size=args.policy_hidden_size, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = AtariDataset(data_path=args.expert_path, game='pinball',
                               max_nb_transitions=5)  # TODO: change max_nb_transitions
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path, timesteps_per_batch=256, number_trajs=10,
               stochastic_policy=args.stochastic_policy, save=args.save_sample, play=args.play)
    else:
        raise NotImplementedError
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                    hid_size=args.policy_hidden_size, num_hid_layers=2)

    if args.task == 'train':
        env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)
        task_name = get_task_name(args)

        os.makedirs(args.log_dir, exist_ok=True)
        with open(osp.join(args.log_dir, 'args.txt'), 'w') as f:
            f.write(str(args))
        args.checkpoint_dir = osp.join(args.log_dir, 'chckpts')
        os.makedirs(args.checkpoint_dir, exist_ok=True)

        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024, number_trajs=10,
               stochastic_policy=args.stochastic_policy, save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    # Switch from gym to robosuite; reward shaping is enabled so progress toward the reach goal
    # is visible. The commented-out box poses are alternative object placements (shift2 .. shift7).
    env = robosuite.make(args.env_id,
                         ignore_done=True,
                         use_camera_obs=False,
                         has_renderer=True,
                         control_freq=100,
                         gripper_visualization=True,
                         reward_shaping=True,
                         #box_pos=[0.63522776, -0.3287869, 0.82162434],  # shift2
                         #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
                         #box_pos=[0.23522776, 0.2287869, 0.82162434],  # shift3
                         #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203],  # shift3
                         #box_pos=[0.53522776, 0.3287869, 0.82162434],  # shift4
                         #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203],  # shift4
                         #box_pos=[0.53522776, 0.1287869, 0.82162434],  # shift5
                         #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203],  # shift5
                         #box_pos=[0.48522776, -0.187869, 0.82162434],  # shift6
                         #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203],  # shift6
                         box_pos=[0.43522776, -0.367869, 0.82162434],  # shift7
                         box_quat=[0.2775825618903728, 0, 0, 0.679425538604203],  # shift7
                         )
    env = GymWrapper(env)  # wrap in the gym environment
    # Environment joints should be clipped at 1 and -1 for the Sawyer

    # Task
    #task = 'train'
    task = 'evaluate'
    # parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train')

    # Expert path
    #expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/ac100/combined/combined_0.npz'  # path for 100 trajectories
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/120_shift7/combined/combined_0.npz'  # path to combined shift7 demonstrations
    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                           hid_size=args.policy_hidden_size, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    #env.seed(args.seed)  # Sawyer does not have a seed
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    #if not os.path.isdir(args.log_dir):
    #    os.makedirs(args.log_dir)
    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ", env.action_space)  # logging the action space

    if task == 'train':
        dataset = Mujoco_Dset(expert_path=expert_path, traj_limitation=args.traj_limitation)
        # Check dimensions of the dataset
        #print("dimension of inputs", dataset.dset.inputs.shape)  # dims seem correct
        #print("dimension of labels", dataset.dset.labels.shape)  # dims seem correct
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name)
    elif task == 'evaluate':
        # Create the playback environment
        play_env = robosuite.make(args.env_id,
                                  ignore_done=True,
                                  use_camera_obs=False,
                                  has_renderer=True,
                                  control_freq=100,
                                  gripper_visualization=True,
                                  #box_pos=[0.63522776, -0.3287869, 0.82162434],  # shift2
                                  #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
                                  #box_pos=[0.23522776, 0.2287869, 0.82162434],  # shift3
                                  #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203],  # shift3
                                  #box_pos=[0.53522776, 0.3287869, 0.82162434],  # shift4
                                  #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203],  # shift4
                                  #box_pos=[0.53522776, 0.1287869, 0.82162434],  # shift5
                                  #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203],  # shift5
                                  #box_pos=[0.48522776, -0.187869, 0.82162434],  # shift6
                                  #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203],  # shift6
                                  box_pos=[0.43522776, -0.367869, 0.82162434],  # shift7
                                  box_quat=[0.2775825618903728, 0, 0, 0.679425538604203],  # shift7
                                  )
        #play_env.viewer.set_camera(camera_id=2)  # Switch views for eval
        runner(env, play_env, policy_fn, args.load_model_path,
               timesteps_per_batch=4000,  # more reasonable number of timesteps per batch
               number_trajs=20,           # changed from 10 for evaluation
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                    hid_size=args.policy_hidden_size, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    #cmp_logdir = osp.join(args.log_dir, task_name)

    # Only the MPI rank-0 worker writes TensorBoard summaries.
    if MPI.COMM_WORLD.Get_rank() == 0:
        #writer = SummaryWriter(comment=task_name)
        writer = tf.summary.FileWriter(args.log_dir, U.get_session().graph)
    else:
        writer = None

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name=task_name, writer=writer)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024, number_trajs=10,
               stochastic_policy=args.stochastic_policy, save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    import MujocoManip as MM

    if args.task == 'train':
        env_name, user_name = osp.basename(args.expert_path).split('.')[0].split('_')
    else:
        env_name, user_name = osp.basename(args.load_model_path).split('.')[:2]
    wrapper = '%sWrapper' % env_name
    render = True if args.task == 'evaluate' else False

    if env_name == 'SawyerLiftEnv':
        env = MM.make(wrapper, ignore_done=False, use_eef_ctrl=False, gripper_visualization=True,
                      use_camera_obs=False, has_renderer=render, reward_shaping=True,
                      has_offscreen_renderer=render)
    elif env_name == 'SawyerBinsEnv':
        env = MM.make(wrapper, ignore_done=False, use_eef_ctrl=False, gripper_visualization=True,
                      use_camera_obs=False, has_renderer=render, reward_shaping=True,
                      single_object_mode=False if 'hard' in user_name.lower() else True,
                      has_offscreen_renderer=render)
    elif env_name == 'SawyerPegsEnv':
        env = MM.make(wrapper, ignore_done=False, use_eef_ctrl=False, gripper_visualization=True,
                      use_camera_obs=False, has_renderer=render, reward_shaping=True,
                      single_object_mode=False if 'hard' in user_name.lower() else True,
                      has_offscreen_renderer=render)
    else:
        raise NotImplementedError

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                    hid_size=args.policy_hidden_size, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(env_name, user_name) + '_%s_%s' % (
        args.algo, 1 if not args.mix_reward else args.rew_lambda)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    os.makedirs(args.log_dir, exist_ok=True)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, args.rew_lambda, args.mix_reward,
              task_name, args.frame_stack)
    elif args.task == 'evaluate':
        visualizer(env, policy_fn, args.load_model_path, timesteps_per_batch=env.env.horizon,
                   number_trajs=10, stochastic_policy=args.stochastic_policy, save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    # Switch from gym to robosuite; reward shaping is enabled so progress toward the reach goal is visible
    env = robosuite.make(args.env_id,
                         ignore_done=True,
                         use_camera_obs=False,
                         has_renderer=True,
                         control_freq=100,
                         gripper_visualization=True,
                         reward_shaping=True,
                         #box_pos=[0.63522776, -0.3287869, 0.82162434],  # shift2
                         #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
                         )
    env = GymWrapper(env)  # wrap in the gym environment

    #task = 'train'
    task = 'evaluate'

    # Expert path
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/150_grasp_shift2/combined/combined_0.npz'  # path to combined grasp demonstrations
    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                           hid_size=args.policy_hidden_size, num_hid_layers=2)

    # Note: removing the bench.Monitor wrapping allows rendering
    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    #env.seed(args.seed)  # Sawyer does not have a seed
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ", env.action_space)  # logging the action space

    #------- Run policy for reaching ---------#
    play_env = robosuite.make(args.env_id,
                              ignore_done=True,
                              use_camera_obs=False,
                              has_renderer=True,
                              control_freq=100,
                              gripper_visualization=True,
                              #box_pos=[0.63522776, -0.3287869, 0.82162434],  # shift2
                              #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
                              )
    play_env = GymWrapper(play_env)
    # Weights are loaded from reach model grasp_strange
    #play_env.viewer.set_camera(camera_id=2)  # Switch views for eval

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False)

    # Hack for loading policies using TensorFlow
    init_op = tf.compat.v1.global_variables_initializer()
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Load checkpoint
        ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Run one reach trajectory in the playback environment to obtain the starting state for grasping
        _, _, last_ob, last_jpos = runner_1_traj(play_env, pi_reach, None,
                                                 timesteps_per_batch=3500,
                                                 number_trajs=1,
                                                 stochastic_policy=args.stochastic_policy,
                                                 save=False)

    if task == 'train':
        play_env.close()
        dataset = Mujoco_Dset(expert_path=expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train_grasp(env, last_ob, last_jpos, args.seed, policy_fn, reward_giver, dataset, args.algo,
                    args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps,
                    args.save_per_iter, args.checkpoint_dir, args.log_dir, args.pretrained,
                    args.BC_max_iter, task_name)
    elif task == 'evaluate':
        pi_grasp = policy_fn("pi_grasp", ob_space, ac_space, reuse=False)
        saver_2 = tf.compat.v1.train.Saver(max_to_keep=5)
        with tf.compat.v1.Session() as sess:
            sess.run(init_op)
            ckpt_path_2 = './reach_and_grasp_weights/grasp_shift1_after_reach/grasptrpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
            ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2)
            saver_2.restore(sess, ckpt_2.model_checkpoint_path)

            # Play back the grasp policy from the end state of the reach rollout
            tt = 0
            ob = last_ob
            while True:
                ac, vpred = pi_grasp.act(False, ob)
                ob, rew, new, _ = play_env.step(ac)
                play_env.render()
                #logger.log("rendering for reach policy")
                if new or tt >= args.traj_limitation:
                    break
                tt += 1
        play_env.close()
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    logger.configure()
    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        from baselines.gail import mlp_policy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                        hid_size=args.policy_hidden_size, num_hid_layers=2)

        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        # Use the state-only discriminator when imitating from observations alone.
        if args.states_only:
            reward_giver = WeakTransitionClassifier(env, args.adversary_hidden_size,
                                                    entcoeff=args.adversary_entcoeff)
        else:
            reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                                entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name, args.states_only)
    elif args.task == 'evaluate':
        from baselines.gail import mlp_policy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                        hid_size=args.policy_hidden_size, num_hid_layers=2)

        runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024,
               number_trajs=args.traj_limitation, stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    elif args.task == 'expert_train':
        from baselines.trpo_mpi import trpo_mpi as original_trpo
        from baselines.ppo1.mlp_policy import MlpPolicy as OriginalMlpPolicy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return OriginalMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                     hid_size=args.policy_hidden_size, num_hid_layers=2)

        original_trpo.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                            cg_damping=0.1, max_timesteps=args.num_timesteps, gamma=0.99, lam=0.98,
                            vf_iters=5, vf_stepsize=1e-3)
        saver = tf.train.Saver()
        saver.save(tf.get_default_session(), args.save_model_path)
    elif args.task == 'expert_gen':
        from baselines.trpo_mpi import trpo_mpi as original_trpo
        from baselines.ppo1.mlp_policy import MlpPolicy as OriginalMlpPolicy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return OriginalMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                     hid_size=args.policy_hidden_size, num_hid_layers=2)

        runner(env, policy_fn, args.save_model_path, timesteps_per_batch=1024,
               number_trajs=args.traj_limitation, stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
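# --- Illustrative usage (not from the original source) ---
# A minimal sketch of driving main() above programmatically. The Namespace
# fields mirror the args.* attributes referenced in main(); the concrete values
# are assumptions, and fields not read by the chosen task can stay at dummies.
import argparse

gail_args = argparse.Namespace(
    seed=0, env_id='Hopper-v2', task='train', algo='trpo',
    expert_path='data/expert.npz', traj_limitation=-1, states_only=False,
    policy_hidden_size=100, adversary_hidden_size=100,
    policy_entcoeff=0.0, adversary_entcoeff=1e-3,
    g_step=3, d_step=1, num_timesteps=int(5e6),
    save_per_iter=100, checkpoint_dir='checkpoint', log_dir='log',
    pretrained=False, BC_max_iter=10000,
    load_model_path=None, save_model_path=None,
    stochastic_policy=False, save_sample=False,
)
main(gail_args)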
def setup_and_learn(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, actor, critic,
                    classifier, normalize_returns, normalize_observations, critic_l2_reg,
                    classifier_l2_reg, actor_lr, critic_lr, classifier_lr, action_noise, popart, gamma,
                    clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
                    fifomemory, tau=0.01, eval_env=None, callback=None, entropy_coeff=1.,
                    reward_giver=None, expert_dataset=None, g_step=4, d_step=1, d_stepsize=3e-4,
                    max_timesteps=0, max_iters=0, timesteps_per_batch=1024, adversary_hidden_size=100,
                    adversary_entcoeff=1e-3, task='train', expert_path=None):
    # TODO: max_episodes
    """
    Set up the learning agent and execute training.
    """
    logger.info('Initialize policy')
    logger.info('noisynet implementation of DDPG')
    assert task == 'train'
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG_paramnoise(actor, critic, classifier, memory, fifomemory,
                            env.observation_space.shape, env.action_space.shape,
                            gamma=gamma, tau=tau,
                            normalize_returns=normalize_returns,
                            normalize_observations=normalize_observations,
                            batch_size=batch_size, action_noise=action_noise,
                            critic_l2_reg=critic_l2_reg, classifier_l2_reg=classifier_l2_reg,
                            actor_lr=actor_lr, critic_lr=critic_lr, classifier_lr=classifier_lr,
                            enable_popart=popart, clip_norm=clip_norm,
                            reward_scale=reward_scale, entropy_coeff=entropy_coeff)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    logger.info('Initialize Discriminator')
    reward_giver = TransitionClassifier(env, adversary_hidden_size, entcoeff=adversary_entcoeff)
    d_adam = MpiAdam(reward_giver.get_trainable_variables())

    logger.info('Load Expert Data')
    dataset = Mujoco_Dset(expert_path=expert_path, traj_limitation=-1)  # TODO: customize

    logger.info('Start training')
    with U.single_threaded_session() as sess:
        # init agent
        agent.initialize(sess)
        # tf saver
        saver = tf.train.Saver()
        # finalize graph
        sess.graph.finalize()
        learn(env, agent, reward_giver, dataset, g_step, d_step,
              d_stepsize=d_stepsize,
              timesteps_per_batch=timesteps_per_batch,
              nb_train_steps=nb_train_steps,
              max_timesteps=max_timesteps,
              max_iters=max_iters,  # TODO: max_episodes
              callback=callback,
              d_adam=d_adam, sess=sess, saver=saver)
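# --- Illustrative sketch (not the learn() used above) ---
# Outline of the generator/discriminator alternation that the g_step and d_step
# arguments control: g_step policy updates against the discriminator-derived
# reward, then d_step discriminator updates on policy vs. expert transitions.
# All callables below are placeholders, not names from the original code.
def gail_iteration(policy_update, discriminator, expert_batch_fn, rollout_fn,
                   g_step=4, d_step=1):
    for _ in range(g_step):
        obs, acs = rollout_fn()                       # transitions from the current policy
        rewards = discriminator.get_reward(obs, acs)  # surrogate reward, e.g. -log(1 - D(s, a))
        policy_update(obs, acs, rewards)              # any RL update: TRPO, PPO, DDPG, ...
    for _ in range(d_step):
        exp_obs, exp_acs = expert_batch_fn()          # minibatch of expert transitions
        obs, acs = rollout_fn()
        discriminator.train(obs, acs, exp_obs, exp_acs)  # binary classification: policy vs. expert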