def experiment(variant):
    # Image-based Twin SAC on the fork-reacher task.
    ptu.set_gpu_mode(True, 0)
    imsize = variant['imsize']
    env = ImageForkReacher2dEnv(
        variant["arm_goal_distance_cost_coeff"],
        variant["arm_object_distance_cost_coeff"],
        [imsize, imsize, 3],
        goal_object_distance_cost_coeff=variant["goal_object_distance_cost_coeff"],
        ctrl_cost_coeff=variant["ctrl_cost_coeff"])

    partial_obs_size = env.obs_dim - imsize * imsize * 3
    print("partial dim was " + str(partial_obs_size))
    env = NormalizedBoxEnv(env)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf1 = MergedCNN(input_width=imsize, input_height=imsize, output_size=1,
        input_channels=3, added_fc_input_size=action_dim, **variant['cnn_params'])
    qf2 = MergedCNN(input_width=imsize, input_height=imsize, output_size=1,
        input_channels=3, added_fc_input_size=action_dim, **variant['cnn_params'])
    vf = CNN(input_width=imsize, input_height=imsize, output_size=1,
        input_channels=3, **variant['cnn_params'])
    policy = TanhCNNGaussianPolicy(input_width=imsize, input_height=imsize,
        output_size=action_dim, input_channels=3, **variant['cnn_params'])

    algorithm = TwinSAC(env=env, policy=policy, qf1=qf1, qf2=qf2, vf=vf,
        **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()

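# Hypothetical sketch of the `variant` dict the launcher above consumes, inferred from
# the keys it accesses. Every value, and the field names inside `cnn_params` and
# `algo_params`, is an illustrative assumption, not a setting from the original runs.
example_twin_sac_variant = dict(
    imsize=48,
    arm_goal_distance_cost_coeff=1.0,
    arm_object_distance_cost_coeff=0.0,
    goal_object_distance_cost_coeff=1.0,
    ctrl_cost_coeff=0.0,
    cnn_params=dict(  # forwarded to MergedCNN / CNN / TanhCNNGaussianPolicy
        kernel_sizes=[3, 3],
        n_channels=[16, 16],
        strides=[2, 2],
        paddings=[0, 0],
        hidden_sizes=[256, 256],
    ),
    algo_params=dict(  # forwarded to TwinSAC
        num_epochs=100,
        batch_size=128,
        discount=0.99,
    ),
)
# Usage (normally dispatched through the project's launcher utilities):
# experiment(example_twin_sac_variant)
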
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']

    env = Pusher2DEnv()  # gym.make(variant['env_id']).env
    env = NormalizedBoxEnv(ImageMujocoEnv(env, imsize=imsize, keep_prev=history - 1,
        init_camera=variant['init_camera']))
    # es = GaussianStrategy(
    #     action_space=env.action_space,
    # )
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf = MergedCNN(input_width=imsize, input_height=imsize, output_size=1,
        input_channels=history, added_fc_input_size=action_dim, **variant['cnn_params'])
    vf = CNN(input_width=imsize, input_height=imsize, output_size=1,
        input_channels=history, **variant['cnn_params'])
    policy = TanhCNNGaussianPolicy(input_width=imsize, input_height=imsize,
        output_size=action_dim, input_channels=history, **variant['cnn_params'])

    algorithm = SoftActorCritic(env=env, policy=policy, qf=qf, vf=vf,
        **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    # Supervised regression from Sawyer images to (wrapped) joint angles.
    from railrl.core import logger
    import railrl.torch.pytorch_util as ptu
    ptu.set_gpu_mode(True)
    info = dict()
    logger.save_extra_data(info)
    logger.get_snapshot_dir()

    net = CNN(**variant['cnn_kwargs'])
    net.cuda()

    num_divisions = variant['num_divisions']
    images = np.zeros((num_divisions * 10000, 21168))
    states = np.zeros((num_divisions * 10000, 7))
    for i in range(num_divisions):
        imgs = np.load('/home/murtaza/vae_data/sawyer_torque_control_images100000_'
            + str(i + 1) + '.npy')
        state = np.load('/home/murtaza/vae_data/sawyer_torque_control_states100000_'
            + str(i + 1) + '.npy')[:, :7] % (2 * np.pi)
        images[i * 10000:(i + 1) * 10000] = imgs
        states[i * 10000:(i + 1) * 10000] = state
        print(i)
    if variant['normalize']:
        std = np.std(states, axis=0)
        mu = np.mean(states, axis=0)
        states = np.divide((states - mu), std)
        print(mu, std)
    mid = int(num_divisions * 10000 * .9)
    train_images, test_images = images[:mid], images[mid:]
    train_labels, test_labels = states[:mid], states[mid:]

    algo = SupervisedAlgorithm(train_images, test_images, train_labels, test_labels, net,
        batch_size=variant['batch_size'], lr=variant['lr'],
        weight_decay=variant['weight_decay'])
    for epoch in range(variant['num_epochs']):
        algo.train_epoch(epoch)
        algo.test_epoch(epoch)

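# Hypothetical `variant` for the supervised launcher above. The flat image dimension it
# allocates (21168 = 84 * 84 * 3) suggests 84x84 RGB inputs; everything else here,
# including the field names inside `cnn_kwargs`, is an illustrative assumption.
example_supervised_variant = dict(
    cnn_kwargs=dict(
        input_width=84, input_height=84, input_channels=3,
        output_size=7,  # regress the 7 wrapped joint angles
        kernel_sizes=[5, 5], n_channels=[16, 32], strides=[2, 2], paddings=[0, 0],
        hidden_sizes=[400, 300],
    ),
    num_divisions=10,   # expects num_divisions x 10000 samples on disk
    normalize=True,
    batch_size=256,
    lr=1e-3,
    weight_decay=0,
    num_epochs=50,
)
# experiment(example_supervised_variant)
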
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']

    env = gym.make(variant['env_id']).env
    training_env = gym.make(variant['env_id']).env
    env = NormalizedBoxEnv(env)
    training_env = NormalizedBoxEnv(training_env)
    env = ImageMujocoEnv(env, imsize=imsize, keep_prev=history - 1,
        init_camera=variant['init_camera'])
    training_env = ImageMujocoEnv(training_env, imsize=imsize, keep_prev=history - 1,
        init_camera=variant['init_camera'])
    env = DiscretizeEnv(env, variant['bins'])
    training_env = DiscretizeEnv(training_env, variant['bins'])

    qf = CNN(output_size=env.action_space.n, input_width=imsize, input_height=imsize,
        input_channels=history, **variant['cnn_params'])
    qf_criterion = variant['qf_criterion_class']()
    algorithm = variant['algo_class'](env, training_env=training_env, qf=qf,
        qf_criterion=qf_criterion, **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    # Image-based SAC on the fixed-goal Point2D environment.
    import multiworld.envs.pygame
    env = gym.make('Point2DEnv-ImageFixedGoal-v0')
    input_width, input_height = env.image_shape

    action_dim = int(np.prod(env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=3,
        output_conv_channels=True,
        output_size=None,
    )
    if variant['shared_qf_conv']:
        # Both Q-functions share one conv torso; the targets share a second one.
        qf_cnn = CNN(**cnn_params)
        qf1 = MlpQfWithObsProcessor(obs_processor=qf_cnn, output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size, **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(obs_processor=qf_cnn, output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size, **variant['qf_kwargs'])
        target_qf_cnn = CNN(**cnn_params)
        target_qf1 = MlpQfWithObsProcessor(obs_processor=target_qf_cnn, output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size, **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(obs_processor=target_qf_cnn, output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size, **variant['qf_kwargs'])
    else:
        # Each Q-function (and target) gets its own conv torso.
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(obs_processor=qf1_cnn, output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        target_qf1 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
    action_dim = int(np.prod(env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy = TanhGaussianPolicyAdapter(policy_cnn, policy_cnn.conv_output_flat_size,
        action_dim, **variant['policy_kwargs'])

    eval_env = expl_env = env
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy,
        **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

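# Hypothetical `variant` for the pixel-based SAC launcher above; most of the online SAC
# launchers in this file reuse the same shape with minor changes. Values are illustrative
# placeholders, and the nested field names are assumptions except where the code reads
# them directly.
example_pixel_sac_variant = dict(
    shared_qf_conv=True,
    collection_mode='batch',  # or 'online'
    replay_buffer_size=int(1e5),
    cnn_params=dict(kernel_sizes=[3, 3], n_channels=[16, 16], strides=[2, 2],
        paddings=[0, 0], hidden_sizes=[]),
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    policy_kwargs=dict(hidden_sizes=[256, 256]),
    trainer_kwargs=dict(discount=0.99, soft_target_tau=5e-3, policy_lr=3e-4, qf_lr=3e-4),
    algo_kwargs=dict(num_epochs=100, batch_size=256, max_path_length=100,
        num_eval_steps_per_epoch=1000, num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000, min_num_steps_before_training=1000),
    eval_path_collector_kwargs=dict(),
    expl_path_collector_kwargs=dict(),
)
# experiment(example_pixel_sac_variant)
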
def experiment(variant):
    # Offline SAC on the d4rl CARLA lane-keeping task, trained from an hdf5 buffer.
    # expl_env = carla_env.CarlaObsDictEnv(args=variant['env_args'])
    import gym
    import d4rl.carla
    expl_env = gym.make('carla-lane-dict-v0')
    eval_env = expl_env
    # num_channels, img_width, img_height = eval_env._wrapped_env.image_shape
    num_channels, img_width, img_height = eval_env.image_shape
    # num_channels = 3
    action_dim = int(np.prod(eval_env.action_space.shape))
    # obs_dim = 11

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = action_dim + target_qf_cnn.conv_output_flat_size
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
        policy_cnn.conv_output_flat_size, action_dim, **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    observation_key = 'image'
    eval_path_collector = ObsDictPathCollector(eval_env, eval_policy,
        observation_key=observation_key, **variant['eval_path_collector_kwargs'])
    expl_path_collector = CustomObsDictPathCollector(expl_env,
        observation_key=observation_key)

    observation_key = 'image'
    replay_buffer = ObsDictReplayBuffer(variant['replay_buffer_size'], expl_env,
        observation_key=observation_key)
    load_hdf5(expl_env, replay_buffer)
    # load_buffer(buffer_path=variant['buffer'], replay_buffer=replay_buffer)
    # import ipdb; ipdb.set_trace()

    trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, behavior_policy=None,
        **variant['trainer_kwargs'])
    variant['algo_kwargs']['max_path_length'] = expl_env._max_episode_steps
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=True,
        **variant['algorithm_kwargs'])

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()

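# Hypothetical sketch of the variant for the offline CARLA launcher above. Note that the
# launcher touches both 'algo_kwargs' (it writes max_path_length there) and
# 'algorithm_kwargs' (the kwargs actually passed to the batch RL algorithm), so both keys
# must exist. Values and nested field names are illustrative assumptions.
example_carla_offline_variant = dict(
    cnn_params=dict(kernel_sizes=[3, 3], n_channels=[16, 16], strides=[2, 2],
        paddings=[0, 0], hidden_sizes=[]),
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    policy_kwargs=dict(hidden_sizes=[256, 256]),
    trainer_kwargs=dict(discount=0.99, policy_lr=3e-4, qf_lr=3e-4),
    algo_kwargs=dict(),  # max_path_length is written here by the launcher
    algorithm_kwargs=dict(max_path_length=250, num_epochs=100, batch_size=256,
        num_eval_steps_per_epoch=1000, num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000, min_num_steps_before_training=0),
    replay_buffer_size=int(1e6),
    eval_path_collector_kwargs=dict(),
)
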
def experiment(variant):
    # from softlearning.environments.gym import register_image_reach
    # register_image_reach()
    # env = gym.envs.make(
    #     'Pusher2d-ImageReach-v0',
    # )
    from softlearning.environments.gym.mujoco.image_pusher_2d import (
        ImageForkReacher2dEnv)

    env_kwargs = {
        'image_shape': (32, 32, 3),
        'arm_goal_distance_cost_coeff': 1.0,
        'arm_object_distance_cost_coeff': 0.0,
    }
    eval_env = ImageForkReacher2dEnv(**env_kwargs)
    expl_env = ImageForkReacher2dEnv(**env_kwargs)

    input_width, input_height, input_channels = eval_env.image_shape
    image_dim = input_width * input_height * input_channels

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=input_channels,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )
    non_image_dim = int(np.prod(eval_env.observation_space.shape)) - image_dim
    if variant['shared_qf_conv']:
        qf_cnn = CNN(**cnn_params)
        # Split the flat observation into image and state parts, flatten each, and
        # concatenate before the Q-function MLP.
        qf_obs_processor = nn.Sequential(
            Split(qf_cnn, identity, image_dim),
            FlattenEach(),
            Concat(),
        )
        qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
        qf_kwargs['obs_processor'] = qf_obs_processor
        qf_kwargs['output_size'] = 1
        qf_kwargs['input_size'] = (
            action_dim + qf_cnn.conv_output_flat_size + non_image_dim)
        qf1 = MlpQfWithObsProcessor(**qf_kwargs)
        qf2 = MlpQfWithObsProcessor(**qf_kwargs)

        target_qf_cnn = CNN(**cnn_params)
        target_qf_obs_processor = nn.Sequential(
            Split(target_qf_cnn, identity, image_dim),
            FlattenEach(),
            Concat(),
        )
        target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
        target_qf_kwargs['obs_processor'] = target_qf_obs_processor
        target_qf_kwargs['output_size'] = 1
        target_qf_kwargs['input_size'] = (
            action_dim + target_qf_cnn.conv_output_flat_size + non_image_dim)
        target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
        target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)
    else:
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(obs_processor=qf1_cnn, output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        target_qf1 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        Split(policy_cnn, identity, image_dim),
        FlattenEach(),
        Concat(),
    )
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
        policy_cnn.conv_output_flat_size + non_image_dim, action_dim,
        **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy,
        **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    expl_env = roboverse.make(variant['env'], gui=False, randomize=True,
        observation_mode=variant['obs'], reward_type='shaped', transpose_image=True)
    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = action_dim + target_qf_cnn.conv_output_flat_size
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = GaussianMixtureObsProcessorPolicy(
        obs_dim=policy_cnn.conv_output_flat_size,
        action_dim=action_dim,
        obs_processor=policy_obs_processor,
        **variant['policy_kwargs'])
    buffer_policy = GaussianMixtureObsProcessorPolicy(
        obs_dim=policy_cnn.conv_output_flat_size,
        action_dim=action_dim,
        obs_processor=policy_obs_processor,
        **variant['policy_kwargs'])
    # policy = TanhGaussianPolicyAdapter(
    #     policy_obs_processor,
    #     policy_cnn.conv_output_flat_size,
    #     action_dim,
    #     **variant['policy_kwargs']
    # )
    # buffer_policy = TanhGaussianPolicyAdapter(
    #     policy_obs_processor,
    #     policy_cnn.conv_output_flat_size,
    #     action_dim,
    #     **variant['policy_kwargs']
    # )

    observation_key = 'image'
    replay_buffer = ObsDictReplayBuffer(variant['replay_buffer_size'], expl_env,
        observation_key=observation_key)

    trainer = AWRSACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, buffer_policy=buffer_policy,
        **variant['trainer_kwargs'])

    expl_policy = policy
    expl_path_collector = ObsDictPathCollector(expl_env, expl_policy,
        observation_key=observation_key, **variant['expl_path_collector_kwargs'])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = ObsDictPathCollector(eval_env, eval_policy,
        observation_key=observation_key, **variant['eval_path_collector_kwargs'])

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        num_epochs=variant['num_epochs'],
        num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
        num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
        num_trains_per_train_loop=variant['num_trains_per_train_loop'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
    )
    algorithm.to(ptu.device)

    demo_train_buffer = ObsDictReplayBuffer(variant['replay_buffer_size'], expl_env,
        observation_key=observation_key)
    demo_test_buffer = ObsDictReplayBuffer(variant['replay_buffer_size'], expl_env,
        observation_key=observation_key)

    path_loader_kwargs = variant.get("path_loader_kwargs", {})

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    save_paths = None  # FIXME(avi)
    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)
    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs)
        path_loader.load_demos()
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()
    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))
    if variant.get('train_rl', True):
        algorithm.train()

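# Hypothetical sketch of the demo-loading / pretraining flags the launcher above reads.
# The flag names come from its variant.get(...) calls; the values and the contents of
# path_loader_kwargs are illustrative assumptions.
example_awr_pretrain_flags = dict(
    load_demos=True,
    path_loader_class=MDPPathLoader,
    path_loader_kwargs=dict(),      # forwarded to the path loader constructor
    pretrain_policy=True,           # behavior-cloning warm start for the policy
    pretrain_rl=True,               # Q-function pretraining on the demo data
    save_pretrained_algorithm=False,
    train_rl=True,                  # set False to stop after pretraining
)
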
def experiment(variant):
    ptu.set_gpu_mode(True, 0)
    from softlearning.environments.gym import register_image_reach
    register_image_reach()
    env = gym.make('Pusher2d-ImageReach-v0',
        arm_goal_distance_cost_coeff=1.0, arm_object_distance_cost_coeff=0.0)
    # import ipdb; ipdb.set_trace()
    input_width, input_height = env.image_shape

    action_dim = int(np.prod(env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=3,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )
    if variant['shared_qf_conv']:
        qf_cnn = CNN(**cnn_params)
        qf1 = MlpQfWithObsProcessor(obs_processor=qf_cnn, output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size, **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(obs_processor=qf_cnn, output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size, **variant['qf_kwargs'])
        target_qf_cnn = CNN(**cnn_params)
        target_qf1 = MlpQfWithObsProcessor(obs_processor=target_qf_cnn, output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size, **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(obs_processor=target_qf_cnn, output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size, **variant['qf_kwargs'])
    else:
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(obs_processor=qf1_cnn, output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        target_qf1 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
    action_dim = int(np.prod(env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy = TanhGaussianPolicyAdapter(policy_cnn, policy_cnn.conv_output_flat_size,
        action_dim)

    eval_env = expl_env = env
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy,
        **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'parallel':
        expl_path_collector = MdpPathCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchParallelRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    from multiworld.envs.mujoco import register_goal_example_envs
    register_goal_example_envs()
    eval_env = gym.make('Image48SawyerPushForwardEnv-v0')
    expl_env = gym.make('Image48SawyerPushForwardEnv-v0')
    # Hack for now
    eval_env.wrapped_env.transpose = True
    expl_env.wrapped_env.transpose = True
    # More hacks, use a dense reward instead
    eval_env.wrapped_env.wrapped_env.reward_type = 'puck_distance'
    expl_env.wrapped_env.wrapped_env.reward_type = 'puck_distance'

    img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )
    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = action_dim + target_qf_cnn.conv_output_flat_size
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
        policy_cnn.conv_output_flat_size, action_dim, **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy,
        **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    import multiworld
    multiworld.register_all_envs()
    env = gym.make('Image48SawyerReachXYEnv-v1')
    observation_key = 'image_proprio_observation'
    input_width, input_height = env.image_shape

    action_dim = int(np.prod(env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=3,
        added_fc_input_size=3,
        output_conv_channels=True,
        output_size=None,
    )
    if variant['shared_qf_conv']:
        qf_cnn = CNN(**cnn_params)
        qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(qf_cnn, Flatten()), output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size, **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(qf_cnn, Flatten()), output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size, **variant['qf_kwargs'])
        target_qf_cnn = CNN(**cnn_params)
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(target_qf_cnn, Flatten()), output_size=1,
            input_size=action_dim + target_qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(target_qf_cnn, Flatten()), output_size=1,
            input_size=action_dim + target_qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs'])
    else:
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(qf1_cnn, Flatten()), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(CNN(**cnn_params), Flatten()), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(CNN(**cnn_params), Flatten()), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(CNN(**cnn_params), Flatten()), output_size=1,
            input_size=action_dim + cnn_output_dim, **variant['qf_kwargs'])
    policy_cnn = CNN(**cnn_params)
    policy = TanhGaussianPolicyAdapter(nn.Sequential(policy_cnn, Flatten()),
        policy_cnn.conv_output_flat_size, action_dim, **variant['policy_kwargs'])

    eval_env = expl_env = env
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = ObsDictPathCollector(eval_env, eval_policy,
        observation_key=observation_key, **variant['eval_path_collector_kwargs'])
    replay_buffer = ObsDictReplayBuffer(variant['replay_buffer_size'], expl_env,
        observation_key=observation_key, **variant['replay_buffer_kwargs'])
    trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = ObsDictPathCollector(expl_env, policy,
            observation_key=observation_key, **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = ObsDictStepCollector(expl_env, policy,
            observation_key=observation_key, **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def HER_baseline_twin_sac_experiment(variant):
    import railrl.torch.pytorch_util as ptu
    from railrl.data_management.obs_dict_replay_buffer import \
        ObsDictRelabelingBuffer
    from railrl.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy)
    from railrl.torch.her.her_twin_sac import HerTwinSAC
    from railrl.torch.sac.policies import TanhCNNGaussianPolicy
    from railrl.torch.networks import MergedCNN, CNN
    import torch
    from multiworld.core.image_env import ImageEnv
    from railrl.misc.asset_loader import load_local_or_remote_file

    init_camera = variant.get("init_camera", None)
    presample_goals = variant.get('presample_goals', False)
    presampled_goals_path = get_presampled_goals_path(
        variant.get('presampled_goals_path', None))

    if 'env_id' in variant:
        import gym
        import multiworld
        multiworld.register_all_envs()
        env = gym.make(variant['env_id'])
    else:
        env = variant["env_class"](**variant['env_kwargs'])
    image_env = ImageEnv(
        env,
        variant.get('imsize'),
        reward_type='image_sparse',
        init_camera=init_camera,
        transpose=True,
        normalize=True,
    )
    if presample_goals:
        if presampled_goals_path is None:
            image_env.non_presampled_goal_img_is_garbage = True
            presampled_goals = variant['generate_goal_dataset_fctn'](
                env=image_env, **variant['goal_generation_kwargs'])
        else:
            presampled_goals = load_local_or_remote_file(presampled_goals_path).item()
        del image_env
        env = ImageEnv(
            env,
            variant.get('imsize'),
            reward_type='image_distance',
            init_camera=init_camera,
            transpose=True,
            normalize=True,
            presampled_goals=presampled_goals,
        )
    else:
        env = image_env

    es = get_exploration_strategy(variant, env)
    observation_key = variant.get('observation_key', 'image_observation')
    desired_goal_key = variant.get('desired_goal_key', 'image_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")

    imsize = variant['imsize']
    action_dim = env.action_space.low.size
    # The networks take the stacked (observation, goal) image pair, hence 3 * 2 channels.
    qf1 = MergedCNN(input_width=imsize, input_height=imsize, output_size=1,
        input_channels=3 * 2, added_fc_input_size=action_dim, **variant['cnn_params'])
    qf2 = MergedCNN(input_width=imsize, input_height=imsize, output_size=1,
        input_channels=3 * 2, added_fc_input_size=action_dim, **variant['cnn_params'])
    policy = TanhCNNGaussianPolicy(input_width=imsize, input_height=imsize,
        added_fc_input_size=0, output_size=action_dim, input_channels=3 * 2,
        output_activation=torch.tanh, **variant['cnn_params'])
    vf = CNN(input_width=imsize, input_height=imsize, output_size=1,
        input_channels=3 * 2, **variant['cnn_params'])
    target_vf = CNN(input_width=imsize, input_height=imsize, output_size=1,
        input_channels=3 * 2, **variant['cnn_params'])

    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['replay_buffer'] = replay_buffer
    base_kwargs = algo_kwargs['base_kwargs']
    base_kwargs['training_env'] = env
    base_kwargs['render'] = variant["render"]
    base_kwargs['render_during_eval'] = variant["render"]
    her_kwargs = algo_kwargs['her_kwargs']
    her_kwargs['observation_key'] = observation_key
    her_kwargs['desired_goal_key'] = desired_goal_key
    algorithm = HerTwinSAC(
        env,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        target_vf=target_vf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

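# Hypothetical sketch of the nested `algo_kwargs` structure the HER launcher above
# mutates: it requires 'base_kwargs' and 'her_kwargs' sub-dicts and writes the training
# env, render flags, and HER keys into them before constructing HerTwinSAC. The field
# names inside the sub-dicts and all values are illustrative assumptions.
example_her_algo_kwargs = dict(
    base_kwargs=dict(        # generic algorithm settings; training_env/render filled in above
        num_epochs=500,
        num_steps_per_epoch=1000,
        num_steps_per_eval=500,
        max_path_length=100,
        batch_size=128,
    ),
    her_kwargs=dict(),       # observation_key / desired_goal_key filled in above
)
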
def experiment(variant):
    expl_env = roboverse.make(variant['env'], gui=False, randomize=True,
        observation_mode=variant['obs'], reward_type='shaped', transpose_image=True)
    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    # obs_dim = 11
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )
    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = action_dim + target_qf_cnn.conv_output_flat_size
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
        policy_cnn.conv_output_flat_size, action_dim, **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    observation_key = 'image'
    eval_path_collector = ObsDictPathCollector(eval_env, eval_policy,
        observation_key=observation_key, **variant['eval_path_collector_kwargs'])
    expl_path_collector = CustomObsDictPathCollector(expl_env,
        observation_key=observation_key)

    with open(variant['buffer'], 'rb') as f:
        replay_buffer = pickle.load(f)

    trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, behavior_policy=None,
        **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'])

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    expl_env = roboverse.make(variant['env'], gui=False,
        randomize=variant['randomize_env'], observation_mode=variant['obs'],
        reward_type='shaped', transpose_image=True)
    if variant['obs'] == 'pixels_debug':
        robot_state_dims = 11
    elif variant['obs'] == 'pixels':
        robot_state_dims = 4
    else:
        raise NotImplementedError
    expl_env = FlatEnv(expl_env, use_robot_state=variant['use_robot_state'],
        robot_state_dims=robot_state_dims)
    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
    )
    if variant['use_robot_state']:
        cnn_params.update(
            added_fc_input_size=robot_state_dims,
            output_conv_channels=False,
            hidden_sizes=[400, 400],
            output_size=200,
        )
    else:
        cnn_params.update(
            added_fc_input_size=0,
            output_conv_channels=True,
            output_size=None,
        )

    qf_cnn = CNN(**cnn_params)
    if variant['use_robot_state']:
        qf_obs_processor = qf_cnn
    else:
        qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    if variant['use_robot_state']:
        qf_kwargs['input_size'] = action_dim + qf_cnn.output_size
    else:
        qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    if variant['use_robot_state']:
        target_qf_obs_processor = target_qf_cnn
    else:
        target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    if variant['use_robot_state']:
        target_qf_kwargs['input_size'] = action_dim + target_qf_cnn.output_size
    else:
        target_qf_kwargs['input_size'] = action_dim + target_qf_cnn.conv_output_flat_size
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    if variant['use_robot_state']:
        policy_obs_processor = policy_cnn
    else:
        policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    if variant['use_robot_state']:
        policy = TanhGaussianPolicyAdapter(policy_obs_processor,
            policy_cnn.output_size, action_dim, **variant['policy_kwargs'])
    else:
        policy = TanhGaussianPolicyAdapter(policy_obs_processor,
            policy_cnn.conv_output_flat_size, action_dim, **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy,
        **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    else:
        raise NotImplementedError

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    expl_env = FlatEnv(PointmassBaseEnv(observation_mode='pixels', transpose_image=True),
        use_robot_state=False)
    eval_env = expl_env
    img_width, img_height = (48, 48)
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )
    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = action_dim + target_qf_cnn.conv_output_flat_size
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
        policy_cnn.conv_output_flat_size, action_dim, **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy,
        **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(expl_env, policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    expl_env = gym.make('carla-lane-dict-v0')
    eval_env = expl_env
    num_channels, img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )
    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = action_dim + target_qf_cnn.conv_output_flat_size
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
        policy_cnn.conv_output_flat_size, action_dim, **variant['policy_kwargs'])

    cnn_vae_params = variant['cnn_vae_params']
    cnn_vae_params['conv_args'].update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
    )
    vae_policy = ConvVAEPolicy(
        representation_size=cnn_vae_params['representation_size'],
        architecture=cnn_vae_params,
        action_dim=action_dim,
        input_channels=3,
        imsize=img_width,
    )

    observation_key = 'image'
    eval_path_collector = CustomObsDictPathCollector(eval_env,
        observation_key=observation_key, **variant['eval_path_collector_kwargs'])
    vae_eval_path_collector = CustomObsDictPathCollector(eval_env,
        # eval_policy,
        observation_key=observation_key, **variant['eval_path_collector_kwargs'])

    # with open(variant['buffer'], 'rb') as f:
    #     replay_buffer = pickle.load(f)
    observation_key = 'image'
    replay_buffer = ObsDictReplayBuffer(variant['replay_buffer_size'], expl_env,
        observation_key=observation_key)
    load_hdf5(expl_env, replay_buffer)

    trainer = BEARTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, vae=vae_policy,
        **variant['trainer_kwargs'])
    expl_path_collector = ObsDictPathCollector(expl_env, policy,
        observation_key=observation_key, **variant['expl_path_collector_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        vae_evaluation_data_collector=vae_eval_path_collector,
        replay_buffer=replay_buffer,
        q_learning_alg=True,
        batch_rl=variant['batch_rl'],
        **variant['algo_kwargs'])

    video_func = VideoSaveFunctionBullet(variant)
    # dump_buffer_func = BufferSaveFunction(variant)
    algorithm.post_train_funcs.append(video_func)
    # algorithm.post_train_funcs.append(dump_buffer_func)

    algorithm.to(ptu.device)
    algorithm.train()

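# Hypothetical sketch of `cnn_vae_params` for the BEAR launcher above. The code only
# requires a 'representation_size' entry and a 'conv_args' sub-dict (whose image fields it
# fills in); the remaining field names and all values are illustrative assumptions about
# how the VAE architecture dict might be laid out.
example_cnn_vae_params = dict(
    representation_size=64,
    conv_args=dict(kernel_sizes=[3, 3], n_channels=[16, 16], strides=[2, 2]),
    conv_kwargs=dict(),   # assumed optional extras for the conv encoder
    deconv_args=dict(),   # assumed decoder mirror of conv_args
)
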
def experiment(variant):
    env_params = dict(
        block_random=0.3,
        camera_random=0,
        simple_observations=False,
        continuous=True,
        remove_height_hack=True,
        render_mode="DIRECT",
        # render_mode="GUI",
        num_objects=5,
        max_num_training_models=900,
        target=False,
        test=False,
    )
    expl_env = FlatEnv(KukaGraspingProceduralEnv(**env_params))
    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )
    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = action_dim + target_qf_cnn.conv_output_flat_size
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
        policy_cnn.conv_output_flat_size, action_dim, **variant['policy_kwargs'])

    observation_key = 'image'
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = ObsDictPathCollector(eval_env, eval_policy,
        observation_key=observation_key, **variant['eval_path_collector_kwargs'])
    replay_buffer = ObsDictReplayBuffer(variant['replay_buffer_size'], expl_env,
        observation_key=observation_key)
    trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
        target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = ObsDictPathCollector(expl_env, policy,
            observation_key=observation_key, **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = ObsDictStepCollector(expl_env, policy,
            observation_key=observation_key, **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    else:
        raise NotImplementedError

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)
    # dump_buffer_func = BufferSaveFunction(variant)
    # algorithm.post_train_funcs.append(dump_buffer_func)

    algorithm.to(ptu.device)
    algorithm.train()