def _setup(self, config):
    """
    config contains:

    gpu_id (int) -- (default 0)
    use_gpu (bool)
    init_algo_functions_and_log_fnames ((function, str)[]) -- each element
        of this list is a tuple of a function that returns the next
        algorithm to train and the corresponding log filename.
    algo_variant (dict) -- the variant to pass into each
        init_algo_function call. This dict is the same for all
        init_algo_function calls, so modifications to it will propagate
        to future init_algo_function calls.
    """
    gpu_id = config.get('gpu_id', 0)
    use_gpu = config['use_gpu']
    set_gpu_mode(use_gpu, gpu_id)
    logging.info('Using GPU mode={}'.format(use_gpu))
    # import torch
    # if 'cpu' in config['resources_per_trial']:
    #     num_threads = config['resources_per_trial']['cpu']
    #     torch.set_num_threads(num_threads)
    #     logging.info('Setting {} CPU threads'.format(num_threads))
    self.init_algo_functions_and_log_fnames = (
        config['init_algo_functions_and_log_fnames'])
    self.init_algo_functions = [
        init_func
        for init_func, _ in self.init_algo_functions_and_log_fnames
    ]
    self.log_fnames = [
        log_fname
        for _, log_fname in self.init_algo_functions_and_log_fnames
    ]
    self.init_algo_kwargs = config['algo_variant']
    self.cur_algo = None
    self.cur_algo_idx = -1
    self._setup_next_algo()
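
# A minimal sketch of the config this _setup expects. The factory functions
# and the algo_variant contents below are hypothetical placeholders, not part
# of the original code.
def make_phase1_algo(**algo_variant):
    """Hypothetical factory: build and return the first algorithm to train."""
    raise NotImplementedError


def make_phase2_algo(**algo_variant):
    """Hypothetical factory: build and return the follow-up algorithm."""
    raise NotImplementedError


example_config = dict(
    use_gpu=True,
    gpu_id=0,
    init_algo_functions_and_log_fnames=[
        (make_phase1_algo, 'phase1_progress.csv'),
        (make_phase2_algo, 'phase2_progress.csv'),
    ],
    # Shared by every factory call; in-place edits propagate to later phases.
    algo_variant=dict(num_epochs=100),
)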
def simulate_policy(args):
    ptu.set_gpu_mode(True)
    model = pickle.load(open(args.file, "rb"))  # joblib.load(args.file)
    model.to(ptu.device)
    imgs = np.load(args.imgfile)
    import ipdb; ipdb.set_trace()
    z = model.encode(ptu.np_to_var(imgs))
    samples = model.decode(z).cpu()
    recon_imgs = samples.data.view(
        64, model.input_channels, model.imsize, model.imsize)
    recon_imgs = recon_imgs.cpu()
    grid = make_grid(recon_imgs, nrow=8)
    ndarr = grid.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()
    im = Image.fromarray(ndarr)
    im.show()
    # cv2.imshow('img', im)
    # cv2.waitKey(1)
    # for sample in samples:
    #     tensor = tensor.cpu()
    #     img = ptu.get_numpy(tensor)
    # Match the reconstruction tensor's shape so torch.cat can stack the
    # inputs next to the reconstructions.
    input_imgs = ptu.np_to_var(imgs).data.view_as(recon_imgs).cpu()
    comparison = torch.cat([
        recon_imgs,
        input_imgs,
    ])
    # Save the comparison grid next to the snapshot; nrow matches the grid
    # width used above.
    save_dir = osp.join(logger.get_snapshot_dir(), 'reconstructions.png')
    save_image(comparison.data.cpu(), save_dir, nrow=8)
def experiment(variant): if variant['multitask']: env = MultitaskFullVAEPoint2DEnv( **variant['env_kwargs']) # used point2d-conv-sweep/run1/id4 env = MultitaskToFlatEnv(env) # else: # env = Pusher2DEnv(**variant['env_kwargs']) if variant['normalize']: env = NormalizedBoxEnv(env) exploration_type = variant['exploration_type'] if exploration_type == 'ou': es = OUStrategy(action_space=env.action_space) elif exploration_type == 'gaussian': es = GaussianStrategy( action_space=env.action_space, max_sigma=0.1, min_sigma=0.1, # Constant sigma ) elif exploration_type == 'epsilon': es = EpsilonGreedy( action_space=env.action_space, prob_random_action=0.1, ) else: raise Exception("Invalid type: " + exploration_type) obs_dim = env.observation_space.low.size action_dim = env.action_space.low.size qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[400, 300], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[400, 300], ) policy = TanhMlpPolicy( input_size=obs_dim, output_size=action_dim, hidden_sizes=[400, 300], ) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) algorithm = TD3(env, qf1=qf1, qf2=qf2, policy=policy, exploration_policy=exploration_policy, **variant['algo_kwargs']) if variant["use_gpu"]: gpu_id = variant["gpu_id"] ptu.set_gpu_mode(True) ptu.set_device(gpu_id) algorithm.to(ptu.device) env._wrapped_env.vae.to(ptu.device) algorithm.train()
def make_video(args): if args.pause: import ipdb ipdb.set_trace() data = pickle.load(open(args.file, "rb")) # joblib.load(args.file) if 'policy' in data: policy = data['policy'] elif 'evaluation/policy' in data: policy = data['evaluation/policy'] else: raise AttributeError if 'env' in data: env = data['env'] elif 'evaluation/env' in data: env = data['evaluation/env'] else: raise AttributeError if isinstance(env, RemoteRolloutEnv): env = env._wrapped_env print("Policy loaded") if args.gpu: ptu.set_gpu_mode(True) policy.to(ptu.device) else: ptu.set_gpu_mode(False) policy.to(ptu.device) if isinstance(env, VAEWrappedEnv): env.mode(args.mode) max_path_length = 100 observation_key = 'latent_observation' desired_goal_key = 'latent_desired_goal' rollout_function = rf.create_rollout_function( rf.multitask_rollout, observation_key=observation_key, desired_goal_key=desired_goal_key, ) env.mode(env._mode_map['video_env']) random_id = str(uuid.uuid4()).split('-')[0] dump_video( env, policy, 'rollouts_{}.mp4'.format(random_id), rollout_function, rows=3, columns=6, pad_length=0, pad_color=255, do_timer=True, horizon=max_path_length, dirname_to_save_images=None, subdirname="rollouts", imsize=48, )
def simulate_policy(args):
    data = pickle.load(open(args.file, "rb"))
    policy_key = args.policy_type + '/policy'
    if policy_key in data:
        policy = data[policy_key]
    else:
        raise Exception("No policy found in loaded dict. Keys: {}".format(
            data.keys()))
    env_key = args.env_type + '/env'
    if env_key in data:
        env = data[env_key]
    else:
        raise Exception("No environment found in loaded dict. Keys: {}".format(
            data.keys()))

    # robosuite-env-specific setup
    env._wrapped_env.has_renderer = True
    env.reset()
    env.viewer.set_camera(camera_id=0)

    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")
    if args.enable_render:
        # some environments need to be reconfigured for visualization
        env.enable_render()
    if args.gpu:
        ptu.set_gpu_mode(True)
        if hasattr(policy, "to"):
            policy.to(ptu.device)
        if hasattr(env, "vae"):
            env.vae.to(ptu.device)
    if args.pause:
        import ipdb; ipdb.set_trace()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    paths = []
    while True:
        paths.append(deprecated_rollout(
            env,
            policy,
            max_path_length=args.H,
            render=not args.hide,
        ))
        if args.log_diagnostics:
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics(paths, logger)
            for k, v in eval_util.get_generic_path_information(paths).items():
                logger.record_tabular(k, v)
            logger.dump_tabular()
def experiment(variant): ptu.set_gpu_mode(True, 0) imsize = variant['imsize'] env = ImageForkReacher2dEnv(variant["arm_goal_distance_cost_coeff"], variant["arm_object_distance_cost_coeff"], [imsize, imsize, 3], goal_object_distance_cost_coeff=variant[ "goal_object_distance_cost_coeff"], ctrl_cost_coeff=variant["ctrl_cost_coeff"]) partial_obs_size = env.obs_dim - imsize * imsize * 3 print("partial dim was " + str(partial_obs_size)) env = NormalizedBoxEnv(env) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) qf1 = MergedCNN(input_width=imsize, input_height=imsize, output_size=1, input_channels=3, added_fc_input_size=action_dim, **variant['cnn_params']) qf2 = MergedCNN(input_width=imsize, input_height=imsize, output_size=1, input_channels=3, added_fc_input_size=action_dim, **variant['cnn_params']) vf = CNN(input_width=imsize, input_height=imsize, output_size=1, input_channels=3, **variant['cnn_params']) policy = TanhCNNGaussianPolicy(input_width=imsize, input_height=imsize, output_size=action_dim, input_channels=3, **variant['cnn_params']) algorithm = TwinSAC(env=env, policy=policy, qf1=qf1, qf2=qf2, vf=vf, **variant['algo_params']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): if variant["use_gpu"]: ptu.set_gpu_mode(True) beta = variant["beta"] representation_size = variant["representation_size"] m = ConvVAE(representation_size, input_channels=3) t = ConvVAETrainer(train_data, test_data, m, beta=beta) for epoch in range(1001): t.train_epoch(epoch) t.test_epoch(epoch) t.dump_samples(epoch)
def load_path(path, param_path):
    # check that the parameter file exists
    if (param_path is None or not os.path.exists(param_path)
            or not os.path.isfile(param_path)):
        return
    env.reset()

    # load policy
    # torch.load(param_path, map_location='cuda:0')
    data = pickle.load(open(param_path, 'rb'))
    e_ex = False
    if 'epoch' in data:
        e_ex = True
        epoch = data['epoch']
    use_gpu = True
    gpu_id = 0
    ptu.set_gpu_mode(use_gpu, gpu_id)
    os.environ['gpu_id'] = str(gpu_id)
    policy = data['evaluation/policy'].stochastic_policy
    policy.cuda()
    policy.eval()

    # path collector
    eval_path_collector = MdpPathCollector(
        env,
        MakeDeterministic(policy),
        sparse_reward=False,
    )
    paths = eval_path_collector.collect_new_paths(
        max_path_length=250,
        num_steps=1000,
        discard_incomplete_paths=True,
    )

    # calculate average discounted return
    avg_return = 0
    for i in range(len(paths)):
        rewards = paths[i]['rewards']
        cum_rewards = np.cumsum(rewards)
        discounted_rewards = 0.9 ** np.arange(cum_rewards.shape[0])
        discounted_rewards = discounted_rewards * cum_rewards
        avg_return += np.sum(discounted_rewards)
    if e_ex:
        out[path].append((epoch, avg_return / len(paths)))
    else:
        out[path].append(avg_return / len(paths))
def experiment(variant): if variant["use_gpu"]: gpu_id = variant["gpu_id"] ptu.set_gpu_mode(True) ptu.set_device(gpu_id) beta = variant["beta"] representation_size = variant["representation_size"] train_data, test_data = get_data(10000) m = ConvVAE(representation_size) t = ConvVAETrainer(train_data, test_data, m, beta=beta, do_scatterplot=False) for epoch in range(101): t.train_epoch(epoch) t.test_epoch(epoch) t.dump_samples(epoch)
def experiment(variant): if variant["use_gpu"]: gpu_id = variant["gpu_id"] ptu.set_gpu_mode(True) ptu.set_device(gpu_id) beta = variant["beta"] representation_size = variant["representation_size"] train_data, test_data = get_data(10000) m = ConvVAE(representation_size, input_channels=3) t = ConvVAETrainer(train_data, test_data, m, beta=beta, use_cuda=True) for epoch in range(50): t.train_epoch(epoch) t.test_epoch(epoch) t.dump_samples(epoch)
def simulate_policy(args): if args.pause: import ipdb; ipdb.set_trace() data = pickle.load(open(args.file, "rb")) # joblib.load(args.file) if 'policy' in data: policy = data['policy'] elif 'evaluation/policy' in data: policy = data['evaluation/policy'] if 'env' in data: env = data['env'] elif 'evaluation/env' in data: env = data['evaluation/env'] if isinstance(env, RemoteRolloutEnv): env = env._wrapped_env print("Policy loaded") if args.gpu: ptu.set_gpu_mode(True) policy.to(ptu.device) else: ptu.set_gpu_mode(False) policy.to(ptu.device) if isinstance(env, VAEWrappedEnv): env.mode(args.mode) if args.enable_render or hasattr(env, 'enable_render'): # some environments need to be reconfigured for visualization env.enable_render() if args.multitaskpause: env.pause_on_goal = True if isinstance(policy, PyTorchModule): policy.train(False) paths = [] while True: paths.append(multitask_rollout( env, policy, max_path_length=args.H, render=not args.hide, observation_key=data.get('evaluation/observation_key', 'observation'), desired_goal_key=data.get('evaluation/desired_goal_key', 'desired_goal'), )) if hasattr(env, "log_diagnostics"): env.log_diagnostics(paths) if hasattr(env, "get_diagnostics"): for k, v in env.get_diagnostics(paths).items(): logger.record_tabular(k, v) logger.dump_tabular()
def simulate_policy(args):
    dir = args.path
    data = joblib.load("{}/params.pkl".format(dir))
    env = data['env']
    model_params = data['model_params']
    mpc_params = data['mpc_params']
    # dyn_model = NNDynamicsModel(env=env, **model_params)
    # mpc_controller = MPCcontroller(env=env,
    #                                dyn_model=dyn_model,
    #                                **mpc_params)
    tf_path_meta = "{}/tf_out-0.meta".format(dir)
    tf_path = "{}/tf_out-0".format(dir)
    with tf.Session() as sess:
        new_saver = tf.train.import_meta_graph(tf_path_meta)
        new_saver.restore(sess, tf_path)
    env = data['env']
    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")
    # NOTE: `policy` is never constructed in this function; it presumably has
    # to be built first (e.g. the commented-out MPC controller above) before
    # the rollout loop below can run.
    if args.gpu:
        set_gpu_mode(True)
        policy.to(ptu.device)
    if args.pause:
        import ipdb; ipdb.set_trace()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        try:
            path = rollout(
                env,
                policy,
                max_path_length=args.H,
                animated=True,
            )
            env.log_diagnostics([path])
            policy.log_diagnostics([path])
            logger.dump_tabular()
        # Hack for now. Not sure why rollout assumes that close is a
        # keyword argument.
        except TypeError as e:
            if (str(e) != "render() got an unexpected keyword "
                          "argument 'close'"):
                raise e
def experiment(variant): if variant["use_gpu"]: gpu_id = variant["gpu_id"] ptu.set_gpu_mode(True) ptu.set_device(gpu_id) beta = variant["beta"] representation_size = variant["representation_size"] train_data, test_data = get_data(10000) m = ConvVAE(representation_size, input_channels=3) t = ConvVAETrainer(train_data, test_data, m, beta_schedule=PiecewiseLinearSchedule([0, 400, 800], [0.5, 0.5, beta])) for epoch in range(1001): t.train_epoch(epoch) t.test_epoch(epoch) t.dump_samples(epoch)
def experiment(variant): from railrl.core import logger import railrl.torch.pytorch_util as ptu ptu.set_gpu_mode(True) info = dict() logger.save_extra_data(info) logger.get_snapshot_dir() net = CNN(**variant['cnn_kwargs']) net.cuda() num_divisions = variant['num_divisions'] images = np.zeros((num_divisions * 10000, 21168)) states = np.zeros((num_divisions * 10000, 7)) for i in range(num_divisions): imgs = np.load( '/home/murtaza/vae_data/sawyer_torque_control_images100000_' + str(i + 1) + '.npy') state = np.load( '/home/murtaza/vae_data/sawyer_torque_control_states100000_' + str(i + 1) + '.npy')[:, :7] % (2 * np.pi) images[i * 10000:(i + 1) * 10000] = imgs states[i * 10000:(i + 1) * 10000] = state print(i) if variant['normalize']: std = np.std(states, axis=0) mu = np.mean(states, axis=0) states = np.divide((states - mu), std) print(mu, std) mid = int(num_divisions * 10000 * .9) train_images, test_images = images[:mid], images[mid:] train_labels, test_labels = states[:mid], states[mid:] algo = SupervisedAlgorithm(train_images, test_images, train_labels, test_labels, net, batch_size=variant['batch_size'], lr=variant['lr'], weight_decay=variant['weight_decay']) for epoch in range(variant['num_epochs']): algo.train_epoch(epoch) algo.test_epoch(epoch)
def simulate_policy(args):
    ptu.set_gpu_mode(True)
    model = pickle.load(open(args.file, "rb"))  # joblib.load(args.file)
    model.to(ptu.device)
    import ipdb; ipdb.set_trace()
    samples = ptu.Variable(torch.randn(64, model.representation_size))
    samples = model.decode(samples).cpu()
    # for sample in samples:
    #     tensor = sample.data.view(64, model.input_channels, model.imsize,
    #                               model.imsize)
    #     tensor = tensor.cpu()
    #     img = ptu.get_numpy(tensor)
    #     cv2.imshow('img', img.reshape(3, 84, 84).transpose())
    #     cv2.waitKey(1)
    tensor = samples.data.view(64, model.input_channels, model.imsize,
                               model.imsize)
    tensor = tensor.cpu()
    grid = make_grid(tensor, nrow=8)
    ndarr = grid.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()
    im = Image.fromarray(ndarr)
    im.show()
def setup_experiment(
        variant,
        exp_name,
        base_log_dir,
        git_infos,
        script_name,
        use_gpu,
        gpu_id,
):
    logger_config = variant.get('logger_config', {})
    seed = variant.get('seed', random.randint(0, 999999))
    exp_id = variant.get('exp_id', random.randint(0, 999999))
    set_seed(seed)
    ptu.set_gpu_mode(use_gpu, gpu_id)
    os.environ['gpu_id'] = str(gpu_id)
    setup_logger(
        logger,
        exp_name=exp_name,
        base_log_dir=base_log_dir,
        variant=variant,
        git_infos=git_infos,
        script_name=script_name,
        seed=seed,
        exp_id=exp_id,
        **logger_config,
    )
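
# A minimal usage sketch for setup_experiment; every value below is a
# hypothetical placeholder.
setup_experiment(
    variant=dict(seed=0, logger_config=dict()),
    exp_name='example-run',
    base_log_dir='/tmp/example-logs',
    git_infos=None,
    script_name='example_script.py',
    use_gpu=True,
    gpu_id=0,
)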
def experiment(variant): ptu.set_gpu_mode(True, 0) from softlearning.environments.gym import register_image_reach register_image_reach() env = gym.make('Pusher2d-ImageReach-v0', arm_goal_distance_cost_coeff=1.0, arm_object_distance_cost_coeff=0.0) #import ipdb; ipdb.set_trace() input_width, input_height = env.image_shape action_dim = int(np.prod(env.action_space.shape)) cnn_params = variant['cnn_params'] cnn_params.update( input_width=input_width, input_height=input_height, input_channels=3, added_fc_input_size=4, output_conv_channels=True, output_size=None, ) if variant['shared_qf_conv']: qf_cnn = CNN(**cnn_params) qf1 = MlpQfWithObsProcessor( obs_processor=qf_cnn, output_size=1, input_size=action_dim+qf_cnn.conv_output_flat_size, **variant['qf_kwargs'] ) qf2 = MlpQfWithObsProcessor( obs_processor=qf_cnn, output_size=1, input_size=action_dim+qf_cnn.conv_output_flat_size, **variant['qf_kwargs'] ) target_qf_cnn = CNN(**cnn_params) target_qf1 = MlpQfWithObsProcessor( obs_processor=target_qf_cnn, output_size=1, input_size=action_dim+qf_cnn.conv_output_flat_size, **variant['qf_kwargs'] ) target_qf2 = MlpQfWithObsProcessor( obs_processor=target_qf_cnn, output_size=1, input_size=action_dim+qf_cnn.conv_output_flat_size, **variant['qf_kwargs'] ) else: qf1_cnn = CNN(**cnn_params) cnn_output_dim = qf1_cnn.conv_output_flat_size qf1 = MlpQfWithObsProcessor( obs_processor=qf1_cnn, output_size=1, input_size=action_dim+cnn_output_dim, **variant['qf_kwargs'] ) qf2 = MlpQfWithObsProcessor( obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim+cnn_output_dim, **variant['qf_kwargs'] ) target_qf1 = MlpQfWithObsProcessor( obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim+cnn_output_dim, **variant['qf_kwargs'] ) target_qf2 = MlpQfWithObsProcessor( obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim+cnn_output_dim, **variant['qf_kwargs'] ) action_dim = int(np.prod(env.action_space.shape)) policy_cnn = CNN(**cnn_params) policy = TanhGaussianPolicyAdapter( policy_cnn, policy_cnn.conv_output_flat_size, action_dim, ) eval_env = expl_env = env eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, **variant['eval_path_collector_kwargs'] ) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) trainer = SACTrainer( env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'] ) if variant['collection_mode'] == 'batch': expl_path_collector = MdpPathCollector( expl_env, policy, **variant['expl_path_collector_kwargs'] ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) elif variant['collection_mode'] == 'online': expl_path_collector = MdpStepCollector( expl_env, policy, **variant['expl_path_collector_kwargs'] ) algorithm = TorchOnlineRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) elif variant['collection_mode'] == 'parallel': expl_path_collector = MdpPathCollector( expl_env, policy, **variant['expl_path_collector_kwargs'] ) algorithm = TorchParallelRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, 
exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def main(): ptu.set_gpu_mode(True) obs_dim = 1 action_dim = 1 batch_size = 100 model = NAF(obs_dim, action_dim) # model = SeparateDuelingFF(obs_dim, action_dim) # model = ConcatFF(obs_dim, action_dim) # model = OuterProductFF(obs_dim, action_dim) version = model.__class__.__name__ version = "NAF-P-depends-on-embedded" optimizer = optim.SGD(model.parameters(), lr=1e-7, momentum=0.5) loss_fnct = nn.MSELoss() num_batches_per_print = 100 train_size = 100000 test_size = 10000 state_bounds = (-10, 10) action_bounds = (-10, 10) resolution = 20 base_dir = Path( "/home/vitchyr/git/rllab-rail/railrl/data/one-offs/polynomial-nn") base_dir = base_dir / version if not base_dir.exists(): base_dir.mkdir() report_path = str(base_dir / "report.html") report = HTMLReport(report_path, images_per_row=2) print("Saving report to: {}".format(report_path)) train_loader = data.DataLoader(FakeDataset(obs_dim, action_dim, train_size, state_bounds, action_bounds), batch_size=batch_size, shuffle=True) test_loader = data.DataLoader(FakeDataset(obs_dim, action_dim, test_size, state_bounds, action_bounds), batch_size=batch_size, shuffle=True) model.to(ptu.device) def eval_model(state, action): state = ptu.Variable(state, requires_grad=False) action = ptu.Variable(action, requires_grad=False) a, v = model(state, action) return a + v def train(epoch): for batch_idx, (state, action, q_target) in enumerate(train_loader): q_estim = eval_model(state, action) q_target = ptu.Variable(q_target, requires_grad=False) loss = loss_fnct(q_estim, q_target) optimizer.zero_grad() loss.backward() optimizer.step() if batch_idx % num_batches_per_print == 0: line_logger.print_over( 'Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format( epoch, batch_size * batch_idx, train_size, loss.data[0])) def test(epoch): test_losses = [] for state, action, q_target in test_loader: q_estim = eval_model(state, action) q_target = ptu.Variable(q_target, requires_grad=False) loss = loss_fnct(q_estim, q_target) test_losses.append(loss.data[0]) line_logger.newline() print('Test Epoch: {0}. Loss: {1}'.format(epoch, np.mean(test_losses))) report.add_header("Epoch = {}".format(epoch)) fig = visualize_model(q_function, "True Q Function") img = vu.save_image(fig) report.add_image(img, txt='True Q Function') fig = visualize_model(eval_model_np, "Estimated Q Function") img = vu.save_image(fig) report.add_image(img, txt='Estimated Q Function') report.new_row() def eval_model_np(state, action): state = ptu.Variable(ptu.FloatTensor([[state]]), requires_grad=False) action = ptu.Variable(ptu.FloatTensor([[action]]), requires_grad=False) a, v = model(state, action) q = a + v return ptu.get_numpy(q)[0] def visualize_model(eval, title): fig = plt.figure() ax = plt.gca() heatmap = vu.make_heat_map( eval, x_bounds=state_bounds, y_bounds=action_bounds, resolution=resolution, ) vu.plot_heatmap(heatmap, fig, ax) ax.set_xlabel("State") ax.set_ylabel("Action") ax.set_title(title) return fig for epoch in range(0, 10): model.train() train(epoch) model.eval() test(epoch) print("Report saved to: {}".format(report_path))
def build_env(env_id): ptu.set_gpu_mode(True) env = RLBenchEnv( task_class=OpenDrawer, fixed_goal=(), headless=False, camera=(500, 300), state_observation_type="task", stub=False, ) env = ImageEnv(env, recompute_reward=False, transpose=True, image_length=450000, reward_type="image_distance", # init_camera=sawyer_pusher_camera_upright_v2, ) variant = dict( model_path="/home/ashvin/data/s3doodad/facebook/models/rfeatures/multitask1/run2/id2/itr_4000.pt", desired_trajectory="/home/ashvin/code/railrl-private/gitignore/rlbench/demo_door_fixed2/demos5b_10_dict.npy", model_kwargs=dict( decoder_distribution='gaussian_identity_variance', input_channels=3, imsize=224, architecture=dict( hidden_sizes=[200, 200], ), delta_features=True, pretrained_features=False, ), reward_params_type="regression_distance", ) model_class = variant.get('model_class', TimestepPredictionModel) representation_size = 128 output_classes = 20 model = model_class( representation_size, # decoder_output_activation=decoder_activation, output_classes=output_classes, **variant['model_kwargs'], ) # model = torch.nn.DataParallel(model) model_path = variant.get("model_path") # model = load_local_or_remote_file(model_path) state_dict = torch.load(model_path) model.load_state_dict(state_dict) model.to(ptu.device) model.eval() traj = np.load(variant.get("desired_trajectory"), allow_pickle=True)[0] goal_image = traj["observations"][-1]["image_observation"] goal_image = goal_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0 # goal_image = goal_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0 # BECAUSE RLBENCH DEMOS ARENT IMAGE_ENV WRAPPED # goal_image = goal_image[:, :, :240, 60:500] goal_image = goal_image[:, :, 60:, 60:500] goal_image_pt = ptu.from_numpy(goal_image) # save_image(goal_image_pt.data.cpu(), 'demos/goal.png', nrow=1) goal_latent = model.encode(goal_image_pt).detach().cpu().numpy().flatten() initial_image = traj["observations"][0]["image_observation"] initial_image = initial_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0 # initial_image = initial_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0 # initial_image = initial_image[:, :, :240, 60:500] initial_image = initial_image[:, :, 60:, 60:500] initial_image_pt = ptu.from_numpy(initial_image) # save_image(initial_image_pt.data.cpu(), 'demos/initial.png', nrow=1) initial_latent = model.encode(initial_image_pt).detach().cpu().numpy().flatten() # Move these to td3_bc and bc_v3 (or at least type for reward_params) reward_params = dict( goal_latent=goal_latent, initial_latent=initial_latent, type=variant["reward_params_type"], ) config_params = variant.get("config_params") env = EncoderWrappedEnv( env, model, reward_params, config_params, **variant.get("encoder_wrapped_env_kwargs", dict()) ) env = FlatGoalEnv(env, obs_keys=["state_observation", ]) return env
""" Fine tune a trained policy/qf """ import argparse import joblib import torch import railrl.torch.pytorch_util as ptu if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('path', type=str, help='Path to snapshot file to fine tune.') args = parser.parse_args() ptu.set_gpu_mode(True) data = torch.load(args.path, "cuda") algo = data['algorithm'] # algo.to("cpu") # algo.to("cuda") algo.train()
def main(): parser = argparse.ArgumentParser() parser.add_argument('file', type=str, help='path to the snapshot file') parser.add_argument('--H', type=int, default=300, help='Max length of rollout') parser.add_argument('--nrolls', type=int, default=1, help='Number of rollout per eval') parser.add_argument('--verbose', action='store_true') parser.add_argument('--mtau', type=float, help='Max tau value') parser.add_argument('--grid', action='store_true') parser.add_argument('--gpu', action='store_true') parser.add_argument('--load', action='store_true') parser.add_argument('--hide', action='store_true') parser.add_argument('--pause', action='store_true') parser.add_argument('--cycle', help='cycle tau', action='store_true') args = parser.parse_args() data = joblib.load(args.file) env = data['env'] if 'policy' in data: policy = data['policy'] else: policy = data['exploration_policy'] qf = data['qf'] policy.train(False) qf.train(False) if args.pause: import ipdb ipdb.set_trace() if args.gpu: ptu.set_gpu_mode(True) policy.to(ptu.device) if args.mtau is None: print("Defaulting max tau to 10.") max_tau = 10 else: max_tau = args.mtau while True: paths = [] for _ in range(args.nrolls): goal = env.sample_goal_for_rollout() print("goal", goal) env.set_goal(goal) policy.set_goal(goal) policy.set_tau(max_tau) path = rollout( env, policy, qf, init_tau=max_tau, max_path_length=args.H, animated=not args.hide, cycle_tau=args.cycle, ) paths.append(path) env.log_diagnostics(paths) for key, value in get_generic_path_information(paths).items(): logger.record_tabular(key, value) logger.dump_tabular()
        add_demo_latents=True,
        bc_num_pretrain_steps=100,
    ),
    replay_buffer_kwargs=dict(
        max_size=100000,
        fraction_goals_rollout_goals=1.0,
        fraction_goals_env_goals=0.0,
    ),
    qf_kwargs=dict(
        hidden_sizes=[400, 300],
    ),
    policy_kwargs=dict(
        hidden_sizes=[400, 300],
    ),
    save_video=True,
    dump_video_kwargs=dict(
        save_period=1,
        # imsize=(3, 500, 300),
    ),
)

# set_gpu_mode expects a bool, not the string "gpu"
ptu.set_gpu_mode(True)

representation_size = 128
output_classes = 20

model_class = variant.get('model_class', TimestepPredictionModel)
model = model_class(
    representation_size,
    # decoder_output_activation=decoder_activation,
    output_classes=output_classes,
    **variant['model_kwargs'],
)
# model = torch.nn.DataParallel(model)

imagenets = [True, False]
reg_types = ["regression_distance", "latent_distance"]
def simulate_policy(args): data = joblib.load(args.file) if 'eval_policy' in data: policy = data['eval_policy'] elif 'policy' in data: policy = data['policy'] elif 'exploration_policy' in data: policy = data['exploration_policy'] else: raise Exception("No policy found in loaded dict. Keys: {}".format( data.keys())) env = data['env'] env.mode("video_env") env.decode_goals = True if hasattr(env, 'enable_render'): # some environments need to be reconfigured for visualization env.enable_render() if args.gpu: set_gpu_mode(True) policy.to(ptu.device) if hasattr(env, "vae"): env.vae.to(ptu.device) else: # make sure everything is on the CPU set_gpu_mode(False) policy.cpu() if hasattr(env, "vae"): env.vae.cpu() if args.pause: import ipdb ipdb.set_trace() if isinstance(policy, PyTorchModule): policy.train(False) ROWS = 3 COLUMNS = 6 dirname = osp.dirname(args.file) input_file_name = os.path.splitext(os.path.basename(args.file))[0] filename = osp.join(dirname, "video_{}.mp4".format(input_file_name)) rollout_function = create_rollout_function( multitask_rollout, observation_key='observation', desired_goal_key='desired_goal', ) paths = dump_video( env, policy, filename, rollout_function, ROWS=ROWS, COLUMNS=COLUMNS, horizon=args.H, dirname_to_save_images=dirname, subdirname="rollouts_" + input_file_name, ) if hasattr(env, "log_diagnostics"): env.log_diagnostics(paths) logger.dump_tabular()
def simulate_policy(args): data = joblib.load(args.file) if 'eval_policy' in data: policy = data['eval_policy'] elif 'policy' in data: policy = data['policy'] elif 'exploration_policy' in data: policy = data['exploration_policy'] elif 'naf_policy' in data: policy = data['naf_policy'] elif 'optimizable_qfunction' in data: qf = data['optimizable_qfunction'] policy = qf.implicit_policy else: raise Exception("No policy found in loaded dict. Keys: {}".format( data.keys())) env = data['env'] if isinstance(env, RemoteRolloutEnv): env = env._wrapped_env print("Policy loaded") env.mode("video_env") env.decode_goals = True image_env = ImageMujocoEnv( env._wrapped_env._wrapped_env, 84, init_camera=None, camera_name="topview", transpose=True, normalize=True, ) # env.image_env = image_env if args.enable_render: # some environments need to be reconfigured for visualization env.enable_render() if args.gpu: set_gpu_mode(True) policy.to(ptu.device) if hasattr(env, "vae"): env.vae.to(ptu.device) else: # make sure everything is on the CPU set_gpu_mode(False) policy.cpu() if hasattr(env, "vae"): env.vae.cpu() if args.pause: import ipdb ipdb.set_trace() if isinstance(policy, PyTorchModule): policy.train(False) ROWS = 3 COLUMNS = 6 dirname = osp.dirname(args.file) input_file_name = os.path.splitext(os.path.basename(args.file))[0] filename = osp.join(dirname, "video_{}.mp4".format(input_file_name)) paths = dump_video( env, policy, filename, ROWS=ROWS, COLUMNS=COLUMNS, horizon=args.H, image_env=image_env, dirname=dirname, subdirname="rollouts_" + input_file_name, ) if hasattr(env, "log_diagnostics"): env.log_diagnostics(paths) logger.dump_tabular()
def run_experiment_here(
        experiment_function,
        variant=None,
        exp_id=0,
        seed=0,
        use_gpu=True,
        # Logger params:
        exp_prefix="default",
        snapshot_mode='last',
        snapshot_gap=1,
        git_infos=None,
        script_name=None,
        logger=default_logger,
        trial_dir_suffix=None,
        randomize_seed=False,
        **setup_logger_kwargs
):
    """
    Run an experiment locally without any serialization.

    :param experiment_function: Function. `variant` will be passed in as its
        only argument.
    :param exp_prefix: Experiment prefix for the save file.
    :param variant: Dictionary passed in to `experiment_function`.
    :param exp_id: Experiment ID. Should be unique across all experiments.
        Note that one experiment may correspond to multiple seeds.
    :param seed: Seed used for this experiment.
    :param use_gpu: Run with GPU. True by default.
    :param script_name: Name of the running script.
    :param log_dir: If set, set the log directory to this. Otherwise, the
        directory will be auto-generated based on the exp_prefix.
    :return:
    """
    if variant is None:
        variant = {}
    variant['exp_id'] = str(exp_id)

    if randomize_seed or (seed is None and 'seed' not in variant):
        seed = random.randint(0, 100000)
        variant['seed'] = str(seed)
    reset_execution_environment(logger=logger)

    actual_log_dir = setup_logger(
        exp_prefix=exp_prefix,
        variant=variant,
        exp_id=exp_id,
        seed=seed,
        snapshot_mode=snapshot_mode,
        snapshot_gap=snapshot_gap,
        git_infos=git_infos,
        script_name=script_name,
        logger=logger,
        trial_dir_suffix=trial_dir_suffix,
        **setup_logger_kwargs
    )

    set_seed(seed)

    from railrl.torch.pytorch_util import set_gpu_mode
    set_gpu_mode(use_gpu)

    run_experiment_here_kwargs = dict(
        variant=variant,
        exp_id=exp_id,
        seed=seed,
        use_gpu=use_gpu,
        exp_prefix=exp_prefix,
        snapshot_mode=snapshot_mode,
        snapshot_gap=snapshot_gap,
        git_infos=git_infos,
        script_name=script_name,
        **setup_logger_kwargs
    )
    save_experiment_data(
        dict(run_experiment_here_kwargs=run_experiment_here_kwargs),
        actual_log_dir,
    )
    return experiment_function(variant)
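
# A minimal usage sketch for run_experiment_here; the experiment function and
# variant below are hypothetical placeholders.
def my_experiment(variant):
    print("learning rate:", variant["lr"])


run_experiment_here(
    my_experiment,
    variant=dict(lr=3e-4),
    exp_prefix='local-example',
    use_gpu=True,
    seed=0,
)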
parser.add_argument('--num_rollouts', type=int, default=5,
                    help='Number of rollouts per eval')
parser.add_argument('--discount', type=float, help='Discount Factor')
parser.add_argument('--gpu', action='store_true')
parser.add_argument('--hide', action='store_true')
parser.add_argument('--verbose', action='store_true')
args = parser.parse_args()

data = joblib.load(args.file)
env = data['env']
print("Environment Type = ", type(env))
qf = data['qf']
if args.gpu:
    set_gpu_mode(True)
    qf.to(ptu.device)
qf.train(False)

if 'discount' in data:
    discount = data['discount']
    if args.discount is not None:
        print("WARNING: you are overriding the saved discount factor.")
        discount = args.discount
else:
    discount = args.discount

num_samples = 1000
policy = SamplePolicyPartialOptimizer(qf, env, num_samples)
policy.set_tau(discount)
def experiment(variant): rdim = variant["rdim"] vae_paths = { 2: "/home/ashvin/data/s3doodad/ashvin/vae/point2d-conv-sweep2/run0/id1/params.pkl", 4: "/home/ashvin/data/s3doodad/ashvin/vae/point2d-conv-sweep2/run0/id4/params.pkl" } vae_path = vae_paths[rdim] vae = joblib.load(vae_path) print("loaded", vae_path) if variant['multitask']: env = MultitaskImagePoint2DEnv(**variant['env_kwargs']) env = VAEWrappedEnv(env, vae, use_vae_obs=True, use_vae_reward=False, use_vae_goals=False) env = MultitaskToFlatEnv(env) # else: # env = Pusher2DEnv(**variant['env_kwargs']) if variant['normalize']: env = NormalizedBoxEnv(env) exploration_type = variant['exploration_type'] if exploration_type == 'ou': es = OUStrategy(action_space=env.action_space) elif exploration_type == 'gaussian': es = GaussianStrategy( action_space=env.action_space, max_sigma=0.1, min_sigma=0.1, # Constant sigma ) elif exploration_type == 'epsilon': es = EpsilonGreedy( action_space=env.action_space, prob_random_action=0.1, ) else: raise Exception("Invalid type: " + exploration_type) obs_dim = env.observation_space.low.size action_dim = env.action_space.low.size qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[400, 300], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[400, 300], ) policy = TanhMlpPolicy( input_size=obs_dim, output_size=action_dim, hidden_sizes=[400, 300], ) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) algorithm = TD3(env, training_env=env, qf1=qf1, qf2=qf2, policy=policy, exploration_policy=exploration_policy, **variant['algo_kwargs']) print("use_gpu", variant["use_gpu"], bool(variant["use_gpu"])) if variant["use_gpu"]: gpu_id = variant["gpu_id"] ptu.set_gpu_mode(True) ptu.set_device(gpu_id) algorithm.to(ptu.device) env._wrapped_env.vae.to(ptu.device) algorithm.train()
def experiment(variant): rdim = variant["rdim"] vae_paths = { 2: "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id0/params.pkl", 4: "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id1/params.pkl", 8: "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id2/params.pkl", 16: "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id3/params.pkl" } vae_path = vae_paths[rdim] vae = torch.load(vae_path) print("loaded", vae_path) if variant['multitask']: env = FullPusher2DEnv(**variant["env_kwargs"]) env = ImageMujocoEnv(env, 84, camera_name="topview", transpose=True, normalize=True) env = VAEWrappedImageGoalEnv(env, vae, use_vae_obs=True, use_vae_reward=True, use_vae_goals=True, render_goals=True, render_rollouts=True, track_qpos_goal=5) env = MultitaskToFlatEnv(env) # else: # env = Pusher2DEnv(**variant['env_kwargs']) if variant['normalize']: env = NormalizedBoxEnv(env) exploration_type = variant['exploration_type'] if exploration_type == 'ou': es = OUStrategy(action_space=env.action_space) elif exploration_type == 'gaussian': es = GaussianStrategy( action_space=env.action_space, max_sigma=0.1, min_sigma=0.1, # Constant sigma ) elif exploration_type == 'epsilon': es = EpsilonGreedy( action_space=env.action_space, prob_random_action=0.1, ) else: raise Exception("Invalid type: " + exploration_type) obs_dim = env.observation_space.low.size action_dim = env.action_space.low.size qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[400, 300], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[400, 300], ) policy = TanhMlpPolicy( input_size=obs_dim, output_size=action_dim, hidden_sizes=[400, 300], ) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) algorithm = TD3(env, training_env=env, qf1=qf1, qf2=qf2, policy=policy, exploration_policy=exploration_policy, **variant['algo_kwargs']) print("use_gpu", variant["use_gpu"], bool(variant["use_gpu"])) if variant["use_gpu"]: gpu_id = variant["gpu_id"] ptu.set_gpu_mode(True) ptu.set_device(gpu_id) algorithm.to(ptu.device) env._wrapped_env.vae.to(ptu.device) algorithm.train()