    algorithm.to(ptu.device)
    algorithm.train()
    return 1


if __name__ == '__main__':
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--experiment', help='experiment specification file')
    parser.add_argument('-g', '--gpu', help='gpu id', type=int, default=0)
    args = parser.parse_args()
    with open(args.experiment, 'r') as spec_file:
        spec_string = spec_file.read()
        # recent PyYAML versions require an explicit Loader
        exp_specs = yaml.load(spec_string, Loader=yaml.FullLoader)

    # make all seeds the same.
    exp_specs['env_specs']['eval_env_seed'] = exp_specs['env_specs']['training_env_seed'] = exp_specs['seed']

    if exp_specs['num_gpu_per_worker'] > 0:
        print('\n\nUSING GPU\n\n')
        ptu.set_gpu_mode(True, args.gpu)
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    exp_prefix = exp_prefix + '--sigma-{}'.format(exp_specs['sigma'])
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix,
                 exp_id=exp_id,
                 variant=exp_specs,
                 seed=seed,
                 snapshot_mode="all")

    experiment(exp_specs)
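# Illustrative spec for the entry point above: a minimal sketch of the keys the
# script reads (exp_id, exp_name, seed, sigma, num_gpu_per_worker, env_specs).
# The values are placeholders, not taken from the original project; a real run
# loads them from the YAML file passed via --experiment.
example_exp_specs = {
    'exp_id': 0,
    'exp_name': 'my-experiment',
    'seed': 9783,
    'sigma': 0.2,
    'num_gpu_per_worker': 1,
    'env_specs': {'training_env_seed': 0, 'eval_env_seed': 0},
}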
def run_experiment_here(
        experiment_function,
        variant=None,
        exp_id=0,
        seed=None,
        use_gpu=True,
        # Logger params:
        exp_prefix="default",
        snapshot_mode='last',
        snapshot_gap=1,
        git_infos=None,
        script_name=None,
        base_log_dir=None,
        force_randomize_seed=False,
        log_dir=None,
        **setup_logger_kwargs
):
    """
    Run an experiment locally without any serialization.

    :param experiment_function: Function. `variant` will be passed in as its only argument.
    :param exp_prefix: Experiment prefix for the save file.
    :param variant: Dictionary passed in to `experiment_function`.
    :param exp_id: Experiment ID. Should be unique across all experiments.
        Note that one experiment may correspond to multiple seeds.
    :param seed: Seed used for this experiment.
    :param use_gpu: Run with GPU. True by default.
    :param script_name: Name of the running script.
    :param log_dir: If set, set the log directory to this. Otherwise,
        the directory will be auto-generated based on the exp_prefix.
    :return:
    """
    if variant is None:
        variant = {}
    variant['exp_id'] = str(exp_id)

    if force_randomize_seed or seed is None:
        seed = random.randint(0, 100000)
        variant['seed'] = str(seed)
    reset_execution_environment()

    actual_log_dir = setup_logger(
        exp_prefix=exp_prefix,
        variant=variant,
        exp_id=exp_id,
        seed=seed,
        snapshot_mode=snapshot_mode,
        snapshot_gap=snapshot_gap,
        base_log_dir=base_log_dir,
        log_dir=log_dir,
        git_infos=git_infos,
        script_name=script_name,
        **setup_logger_kwargs
    )

    set_seed(seed)
    set_gpu_mode(use_gpu)

    run_experiment_here_kwargs = dict(
        variant=variant,
        exp_id=exp_id,
        seed=seed,
        use_gpu=use_gpu,
        exp_prefix=exp_prefix,
        snapshot_mode=snapshot_mode,
        snapshot_gap=snapshot_gap,
        git_infos=git_infos,
        script_name=script_name,
        base_log_dir=base_log_dir,
        **setup_logger_kwargs
    )
    save_experiment_data(
        dict(
            run_experiment_here_kwargs=run_experiment_here_kwargs
        ),
        actual_log_dir
    )
    return experiment_function(variant)
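# Hypothetical usage of the launcher above: a minimal sketch, assuming
# `run_experiment_here` is importable from this module. The experiment function
# and variant below are toy placeholders, not part of the original project.
def toy_experiment(variant):
    print('running toy experiment with variant:', variant)


if __name__ == '__main__':
    run_experiment_here(
        toy_experiment,
        variant=dict(lr=3e-4, batch_size=256),
        exp_prefix='toy-example',
        seed=0,
        use_gpu=False,
    )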
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
    return 1


if __name__ == '__main__':
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--experiment', help='experiment specification file')
    args = parser.parse_args()
    with open(args.experiment, 'r') as spec_file:
        spec_string = spec_file.read()
        # recent PyYAML versions require an explicit Loader
        exp_specs = yaml.load(spec_string, Loader=yaml.FullLoader)

    if exp_specs['use_gpu']:
        print('\n\nUSING GPU\n\n')
        ptu.set_gpu_mode(True)
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    experiment(exp_specs)
def simulate_policy(args): data = joblib.load(args.file) cont = False if 'policies' in data: policy = data['policies'][0] else: policy = data['policy'] env = NormalizedBoxEnv(create_swingup()) #data['env'] print("Policy loaded") if args.gpu: set_gpu_mode(True) policy.cuda() data['qf1'].cuda() if isinstance(policy, PyTorchModule): policy.train(False) diayn = 'df' in data rnd = 'rf' in data if diayn: skills = len(data['eval_policy'].skill_vec) disc = data['df'] policy = OptionPolicy(policy, skills, cont) if args.gpu: disc.cuda() if isinstance(policy, PyTorchModule): disc.train(False) if rnd: data['rf'].cuda() data['pf'].cuda() data['qf1'].cuda() import cv2 video = cv2.VideoWriter('video.avi', cv2.VideoWriter_fourcc(*"H264"), 30, (640, 480)) index = 0 truth, pred = [], [] if cont: eps = 1 elif diayn: eps = skills * 2 else: eps = 5 Rs = [] for ep in range(eps): if diayn and not cont: z_index = ep // 2 policy.set_z(z_index) path = rollout( env, policy, max_path_length=args.H * skills if cont else args.H, animated=True, ) if hasattr(env, "log_diagnostics"): env.log_diagnostics([path]) logger.dump_tabular() total_r = 0 if diayn: predictions = F.log_softmax( disc(torch.FloatTensor(path['observations']).cuda()), 1).cpu().detach().numpy() probs = predictions.max(1) labels = predictions.argmax(1) if cont: for k in range(skills): truth.extend([k] * 100) else: truth.extend([z_index] * len(labels)) pred.extend(labels.tolist()) if rnd: random_feats = data['rf'](torch.FloatTensor( path['observations']).cuda()) pred_feats = data['pf'](torch.FloatTensor( path['observations']).cuda()) i_rewards = ((random_feats - pred_feats)**2.0).sum(1).cpu().data.numpy() q_pred = data['qf1'](torch.FloatTensor(path['observations']).cuda(), torch.FloatTensor( path['actions']).cuda()).cpu().data.numpy() for i, (img, r, s) in enumerate( zip(path['images'], path['rewards'], path['observations'])): #video.write(img[:,:,::-1].astype(np.uint8)) total_r += r[0] img = img.copy() img = np.rot90(img, 3).copy() col = (255, 0, 255) cv2.putText(img, "step: %d" % (i + 1), (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA) if diayn: if cont: cv2.putText(img, "z: %s" % str(truth[i]), (20, 80), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2, cv2.LINE_AA) else: cv2.putText(img, "z: %s" % str(z_index), (20, 80), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2, cv2.LINE_AA) cv2.putText(img, "disc_pred: %s (%.3f)" % (labels[i], probs[i]), (20, 120), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2, cv2.LINE_AA) cv2.putText(img, "reward: %.3f" % r[0], (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2, cv2.LINE_AA) cv2.putText(img, "total reward: %.1f" % total_r, (20, 200), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2, cv2.LINE_AA) cv2.putText(img, "action: %s" % path['actions'][i], (20, 240), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2, cv2.LINE_AA) else: cv2.putText(img, "reward: %.1f" % r[0], (20, 80), cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA) cv2.putText(img, "total reward: %.1f" % total_r, (20, 120), cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA) y = 120 if rnd: cv2.putText(img, "i reward (unscaled): %.3f" % i_rewards[i], (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA) #cv2.rectangle(img, (20, 180), (20 + int(q_pred[i, 0]), 200), (255, 0, 255), -1) cv2.rectangle(img, (20, 200), (20 + int(i_rewards[i] * 10), 220), (255, 255, 0), -1) y = 220 try: y += 40 cv2.putText(img, "Q: %.3f" % q_pred[i], (20, y), cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA) except: y += 40 
cv2.putText(img, "Q:" + str([q for q in q_pred[i]]), (20, y), cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA) y += 40 cv2.putText(img, str(["%.3f" % x for x in path['observations'][i]]), (20, y), cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA) try: cv2.imwrite("frames/%06d.png" % index, img[:, :, ::-1]) except: cv2.imwrite("frames/%06d.png" % index, img[:, :]) index += 1 if diayn: print(z_index, ":", total_r) Rs.append(total_r) print("best", np.argmax(Rs)) print("worst", np.argmin(Rs)) video.release() print("wrote video") if diayn: import sklearn from sklearn.metrics import confusion_matrix import matplotlib as mpl import itertools mpl.use('Agg') import matplotlib.pyplot as plt normalize = False classes = range(skills) cm = confusion_matrix(truth, pred) if normalize: cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] print("Normalized confusion matrix") else: print('Confusion matrix, without normalization') plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) plt.colorbar() tick_marks = np.arange(skills) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) """ fmt = '.2f' if normalize else 'd' thresh = cm.max() / 2. for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") """ plt.ylabel('True label') plt.xlabel('Predicted label') plt.tight_layout() plt.savefig("confusion.png")
def experiment(variant): setup_logger("name-of-experiment", variant=variant) ptu.set_gpu_mode(True) log_dir = os.path.expanduser(variant["log_dir"]) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) # missing - set torch seed and num threads=1 # expl_env = gym.make(variant["env_name"]) expl_envs = make_vec_envs( variant["env_name"], variant["seed"], variant["num_processes"], variant["gamma"], variant["log_dir"], # probably change this? ptu.device, False, pytorch=False, ) # eval_env = gym.make(variant["env_name"]) eval_envs = make_vec_envs( variant["env_name"], variant["seed"], variant["num_processes"], 1, variant["log_dir"], ptu.device, False, pytorch=False, ) obs_shape = expl_envs.observation_space.image.shape # if len(obs_shape) == 3 and obs_shape[2] in [1, 3]: # convert WxHxC into CxWxH # expl_env = TransposeImage(expl_env, op=[2, 0, 1]) # eval_env = TransposeImage(eval_env, op=[2, 0, 1]) # obs_shape = expl_env.observation_space.shape channels, obs_width, obs_height = obs_shape action_space = expl_envs.action_space base_kwargs = {"num_inputs": channels, "recurrent": variant["recurrent_policy"]} base = CNNBase(**base_kwargs) dist = create_output_distribution(action_space, base.output_size) eval_policy = LearnPlanPolicy( WrappedPolicy( obs_shape, action_space, ptu.device, base=base, deterministic=True, dist=dist, num_processes=variant["num_processes"], ), num_processes=variant["num_processes"], vectorised=True, ) expl_policy = LearnPlanPolicy( WrappedPolicy( obs_shape, action_space, ptu.device, base=base, deterministic=False, dist=dist, num_processes=variant["num_processes"], ), num_processes=variant["num_processes"], vectorised=True, ) # missing: at this stage, policy hasn't been sent to device, but happens later eval_path_collector = HierarchicalStepCollector( eval_envs, eval_policy, ptu.device, max_num_epoch_paths_saved=variant["algorithm_kwargs"][ "num_eval_steps_per_epoch" ], num_processes=variant["num_processes"], render=variant["render"], ) expl_path_collector = HierarchicalStepCollector( expl_envs, expl_policy, ptu.device, max_num_epoch_paths_saved=variant["num_steps"], num_processes=variant["num_processes"], render=variant["render"], ) # added: created rollout(5,1,(4,84,84),Discrete(6),1), reset env and added obs to rollout[step] trainer = A2CTrainer(actor_critic=expl_policy.learner, **variant["trainer_kwargs"]) # missing: by this point, rollout back in sync. replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_envs) # added: replay buffer is new algorithm = TorchIkostrikovRLAlgorithm( trainer=trainer, exploration_env=expl_envs, evaluation_env=eval_envs, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant["algorithm_kwargs"], # batch_size, # max_path_length, # num_epochs, # num_eval_steps_per_epoch, # num_expl_steps_per_train_loop, # num_trains_per_train_loop, # num_train_loops_per_epoch=1, # min_num_steps_before_training=0, ) algorithm.to(ptu.device) # missing: device back in sync algorithm.train()
def simulate_policy(args):
    # hyper-parameters
    fov, delta, num_ch = 33, 8, 3
    rad = fov // 2
    data = torch.load(args.file)
    color = [240, 128, 128]

    # load policy & env
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    print("Policy loaded")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    policy.reset()  # does nothing

    # load image
    img_volume = skio.imread('data/brainbow/training_sample_1.tif', plugin='tifffile')
    img_volume_copy = np.copy(img_volume)
    img_volume = img_volume.astype(np.float32)
    img_volume = (img_volume - 128) / 33

    # select specific z slice
    s_z = 54
    img_plane = img_volume[s_z].astype(np.float32)
    img_plane_copy = img_volume_copy[s_z]
    img_plane_shape = img_plane.shape

    '''
    # gather random starting point for each color
    # random starting point should be colored and also not FoV boundary
    s_point_list = []  # list of starting points
    unique_colors = np.unique(np.reshape(img_plane, (-1, 3)), axis=0)
    for color in unique_colors:
        if np.all(color == (-128. / 33)):
            continue
        color_index_list_tmp = np.argwhere(np.all(img_plane == color, axis=2))
        #print(color_index_list_tmp)
        color_index_list = []
        for index in color_index_list_tmp:
            if is_fov_boundary(img_plane_shape, rad, index):
                color_index_list.append(index)
        #print(color_index_list)
        #print(type(color_index_list))
        len_color_index_list = len(color_index_list)
        if len_color_index_list > 0:
            random_index = np.random.choice(len_color_index_list, 1)
            random_start_point = color_index_list[random_index[0]]
            #print(random_start_point)
            s_point_list.append(random_start_point)
    # print(s_point_list)
    '''

    coord = np.argwhere(np.any(img_plane_copy > 100, axis=2))
    coord = coord[np.all(coord >= rad, axis=1)
                  & (coord[:, 0] < img_plane_shape[0] - rad)
                  & (coord[:, 1] < img_plane_shape[1] - rad)]  # remove points too close to the image border
    np.random.shuffle(coord)
    s_point_list = coord

    # start skeletonization
    for i, s_point in enumerate(s_point_list):
        print('Skeletonizing', i + 1, 'th point:', s_point)

        # initialization
        img_plane_tmp = np.copy(img_plane_copy)
        s_y, s_x = s_point
        Q = deque([[s_y, s_x]])
        V = [[s_y, s_x]]

        # start skeletonization for some starting point
        while len(Q) > 0:
            c_y, c_x = Q.popleft()  # current y, x
            cur_p_t = img_plane[c_y - rad:c_y + rad + 1, c_x - rad:c_x + rad + 1]  # current patch top
            cur_p_l = cv2.rotate(cur_p_t, cv2.ROTATE_90_CLOCKWISE)  # current patch left
            cur_p_r = cv2.rotate(cur_p_t, cv2.ROTATE_90_COUNTERCLOCKWISE)  # current patch right
            cur_p_b = cv2.rotate(cur_p_t, cv2.ROTATE_180)  # current patch bottom

            a_t, _ = policy.get_action(np.moveaxis(cur_p_t, -1, 0).flatten())  # move top
            a_l, _ = policy.get_action(np.moveaxis(cur_p_l, -1, 0).flatten())  # move left
            a_r, _ = policy.get_action(np.moveaxis(cur_p_r, -1, 0).flatten())  # move right
            a_b, _ = policy.get_action(np.moveaxis(cur_p_b, -1, 0).flatten())  # move bottom

            top = [c_y - delta, c_x]
            left = [c_y, c_x - delta]
            right = [c_y, c_x + delta]
            bottom = [c_y + delta, c_x]

            if a_t == 1:
                if top not in V and is_fov_boundary(img_plane_shape, rad, top):
                    img_plane_tmp[c_y - delta:c_y + 1, c_x] = color
                    Q.append(top)
                    V.append(top)
            if a_l == 1:
                if left not in V and is_fov_boundary(img_plane_shape, rad, left):
                    img_plane_tmp[c_y, c_x - delta:c_x + 1] = color
                    Q.append(left)
                    V.append(left)
            if a_r == 1:
                if right not in V and is_fov_boundary(img_plane_shape, rad, right):
                    img_plane_tmp[c_y, c_x:c_x + delta + 1] = color
                    Q.append(right)
                    V.append(right)
            if a_b == 1:
                if bottom not in V and is_fov_boundary(img_plane_shape, rad, bottom):
                    img_plane_tmp[c_y:c_y + delta + 1, c_x] = color
                    Q.append(bottom)
                    V.append(bottom)

        # plot final result
        img_plane_tmp[s_y - 1:s_y + 2, s_x - 1:s_x + 2] = [252, 255, 51]  # color starting point
        fig = plt.figure(figsize=(10, 10))
        plt.imshow(img_plane_tmp)
        plt.show()
        plt.close()
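# The script above relies on an `is_fov_boundary` helper that is not shown.
# Below is a hypothetical reconstruction, assuming it returns True when a
# (2*rad+1)-sized field of view centred at `point` fits entirely inside the
# image, which is consistent with how the calls above use it to reject
# starting points and moves near the border.
def is_fov_boundary(img_shape, rad, point):
    y, x = int(point[0]), int(point[1])
    return (rad <= y < img_shape[0] - rad) and (rad <= x < img_shape[1] - rad)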
def main(
    env_name,
    exp_dir,
    seed,
    resume,
    mode,
    archi,
    epochs,
    reward_scale,
    hidden_dim,
    batch_size,
    learning_rate,
    n_layers,
    soft_target_tau,
    auto_alpha,
    alpha,
    frac_goal_replay,
    horizon,
    replay_buffer_size,
    snapshot_mode,
    snapshot_gap,
    cpu,
):
    valid_modes = ["vanilla", "her"]
    valid_archi = ["mlp", "cnn", "pointnet"]
    if mode not in valid_modes:
        raise ValueError(f"Unknown mode: {mode}")
    if archi not in valid_archi:
        raise ValueError(f"Unknown network archi: {archi}")

    machine_log_dir = settings.log_dir()
    exp_dir = os.path.join(machine_log_dir, exp_dir, f"seed{seed}")

    # multi-gpu and batch size scaling
    replay_buffer_size = replay_buffer_size
    num_expl_steps_per_train_loop = 1000
    num_eval_steps_per_epoch = 1000
    min_num_steps_before_training = 1000
    num_trains_per_train_loop = 1000

    # learning rate and soft update linear scaling
    policy_lr = learning_rate
    qf_lr = learning_rate

    variant = dict(
        env_name=env_name,
        algorithm="sac",
        version="normal",
        seed=seed,
        resume=resume,
        mode=mode,
        archi=archi,
        replay_buffer_kwargs=dict(max_replay_buffer_size=replay_buffer_size,),
        algorithm_kwargs=dict(
            batch_size=batch_size,
            num_epochs=epochs,
            num_eval_steps_per_epoch=num_eval_steps_per_epoch,
            num_expl_steps_per_train_loop=num_expl_steps_per_train_loop,
            num_trains_per_train_loop=num_trains_per_train_loop,
            min_num_steps_before_training=min_num_steps_before_training,
            max_path_length=horizon,
        ),
        trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=soft_target_tau,
            target_update_period=1,
            policy_lr=policy_lr,
            qf_lr=qf_lr,
            reward_scale=reward_scale,
            use_automatic_entropy_tuning=auto_alpha,
            alpha=alpha,
        ),
        qf_kwargs=dict(hidden_dim=hidden_dim, n_layers=n_layers),
        policy_kwargs=dict(hidden_dim=hidden_dim, n_layers=n_layers),
        log_dir=exp_dir,
    )

    if mode == "her":
        variant["replay_buffer_kwargs"].update(
            dict(
                fraction_goals_rollout_goals=1 - frac_goal_replay,  # equal to k = 4 in HER paper
                fraction_goals_env_goals=0,
            )
        )

    set_seed(seed)
    setup_logger_kwargs = {
        "exp_prefix": exp_dir,
        "variant": variant,
        "log_dir": exp_dir,
        "snapshot_mode": snapshot_mode,
        "snapshot_gap": snapshot_gap,
    }
    setup_logger(**setup_logger_kwargs)
    ptu.set_gpu_mode(not cpu, distributed_mode=False)

    print(f"Start training...")
    sac(variant)
        exp_dir = '{}_kl'.format(exp_dir)
        variant["KL"] = True
    else:
        # use bonus as KL: -\beta * b
        exp_dir = '{0}_{1:.2g}'.format(exp_dir, args.beta)
else:
    exp_dir = '{}/offline/{}_{}'.format(args.env, timestamp, args.seed)

# setup the logger
print('experiment dir:logs/{}'.format(exp_dir))
setup_logger(variant=variant, log_dir='logs/{}'.format(exp_dir))

# cuda setup
use_cuda = not args.no_cuda and torch.cuda.is_available()
if use_cuda:
    # optionally set the GPU (default=False)
    ptu.set_gpu_mode(True, gpu_id=args.device_id)
    print('using gpu:{}'.format(args.device_id))

    def map_location(storage, loc):
        return storage.cuda()
else:
    map_location = 'cpu'
    ptu.set_gpu_mode(False)  # optionally set the GPU (default=False)

experiment(variant)
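# Example of the device-aware checkpoint loading that `map_location` above
# enables; the path is a placeholder, not taken from the original script.
# `torch.load` accepts either the 'cpu' string or the callable defined above.
snapshot = torch.load('logs/example_run/params.pkl', map_location=map_location)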
def simulate_policy(args): if args.pause: import ipdb; ipdb.set_trace() data = pickle.load(open(args.file, "rb")) # joblib.load(args.file) if 'policy' in data: policy = data['policy'] elif 'evaluation/policy' in data: policy = data['evaluation/policy'] else: policy = data['evaluation/hard_init/policy'] if 'env' in data: env = data['env'] elif 'evaluation/env' in data: env = data['evaluation/env'] else: env = data['evaluation/hard_init/env'] if isinstance(env, RemoteRolloutEnv): env = env._wrapped_env print("Policy loaded") if args.gpu: ptu.set_gpu_mode(True) policy.to(ptu.device) else: ptu.set_gpu_mode(False) policy.to(ptu.device) if isinstance(env, VAEWrappedEnv): env.mode(args.mode) if args.enable_render or hasattr(env, 'enable_render'): # some environments need to be reconfigured for visualization env.enable_render() if args.multitaskpause: env.pause_on_goal = True if isinstance(policy, PyTorchModule): policy.train(False) paths = [] import torch def check(net): for name, param in net.named_parameters(): if torch.isnan(param).any(): print(name) qf = data['trainer/qf1'] # import ipdb; ipdb.set_trace() observation_key = data.get('evaluation/observation_key', 'observation') context_keys = data.get('evaluation/context_keys_for_policy', ['context']) context_keys = data.get('evaluation/hard_init/context_keys_for_policy') while True: paths.append(contextual_rollout( env, policy, max_path_length=args.H, render=not args.hide, observation_key=observation_key, context_keys_for_policy=context_keys, # context_keys_for_policy=['state_desired_goal'], )) if hasattr(env, "log_diagnostics"): env.log_diagnostics(paths) if hasattr(env, "get_diagnostics"): for k, v in env.get_diagnostics(paths).items(): logger.record_tabular(k, v) logger.dump_tabular()
def run_experiment(): # Define agent-specific arguments trainer_kwargs = None if args.agent == "SAC": trainer_kwargs = dict( discount=args.gamma, soft_target_tau=args.soft_target_tau, target_update_period=args.target_update_period, policy_lr=args.policy_lr, qf_lr=args.qf_lr, reward_scale=args.reward_scale, use_automatic_entropy_tuning=(not args.no_auto_entropy_tuning), ) elif args.agent == "TD3": trainer_kwargs = dict( target_policy_noise=args.target_policy_noise, discount=0.99, reward_scale=args.reward_scale, policy_learning_rate=args.policy_lr, qf_learning_rate=args.qf_lr, policy_and_target_update_period=args. policy_and_target_update_period, tau=args.tau, ) else: pass # Set random seed np.random.seed(args.seed) torch.manual_seed(args.seed) # Directory to place data THIS_DIR = os.path.dirname( args.variant) # os.path.dirname(os.path.abspath(__file__)) # Construct variant to train if args.variant is None: variant = dict( algorithm=args.agent, seed=args.seed, version="normal", replay_buffer_size=int(1E6), qf_kwargs=dict(hidden_sizes=args.qf_hidden_sizes, ), policy_kwargs=dict(hidden_sizes=args.policy_hidden_sizes, ), algorithm_kwargs=dict( num_epochs=args.n_epochs, num_eval_steps_per_epoch=args.eval_horizon * args.num_eval, num_trains_per_train_loop=args.trains_per_train_loop, num_expl_steps_per_train_loop=args.expl_horizon * args.expl_ep_per_train_loop, min_num_steps_before_training=args.steps_before_training, expl_max_path_length=args.expl_horizon, eval_max_path_length=args.eval_horizon, batch_size=args.batch_size, ), trainer_kwargs=trainer_kwargs, expl_environment_kwargs=get_expl_env_kwargs(args), eval_environment_kwargs=get_eval_env_kwargs(args), ) # Set logging tmp_file_prefix = "{}_{}_{}_SEED{}".format(args.env, "".join(args.robots), args.controller, args.seed) else: # This is a variant we want to load # Attempt to load the json file try: with open(args.variant) as f: variant = json.load(f) except FileNotFoundError: print("Error opening specified variant json at: {}. " "Please check filepath and try again.".format(variant)) # Set logging tmp_file_prefix = "{}_{}_{}_SEED{}".format( variant["expl_environment_kwargs"]["env_name"], "".join(variant["expl_environment_kwargs"]["robots"]), variant["expl_environment_kwargs"]["controller"], args.seed) # Set agent args.agent = variant["algorithm"] # Setup logger abs_root_dir = os.path.join(THIS_DIR, args.log_dir) tmp_dir = setup_logger(tmp_file_prefix, variant=variant, base_log_dir=abs_root_dir) ptu.set_gpu_mode( torch.cuda.is_available()) # optionally set the GPU (default=False # Run experiment experiment(variant, agent=args.agent)
def run_policy(file, eval_env, goal_env=False, use_color=True, cherrypick=False, fixed_length=False, verbose=False, render_kwargs=dict(height=128, width=128, camera_id=0)): ptu.set_gpu_mode(True, 0) with open(file, 'rb') as f: params = pickle.load(f) if goal_env: obs_dim = eval_env.observation_space.spaces['observation'].low.size action_dim = eval_env.action_space.low.size goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size else: obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size policy = params['exploration/policy'] # .to(ptu.device) policy = policy.eval() policy = MakeDeterministic(policy) if goal_env: r = [-1] step = 0 while 0 not in r or sum(r) == 0: step += 1 start = time.time() if goal_env: path = multitask_rollout_visualizer( eval_env, agent=policy, max_path_length=eval_env.max_steps, render=True, render_kwargs=render_kwargs, observation_key='observation', desired_goal_key='desired_goal', get_action_kwargs=None, return_dict_obs=True, use_color=use_color, fixed_length=fixed_length) r = path["rewards"] else: path = rollout_visualizer(eval_env, agent=policy, max_path_length=eval_env.max_steps, render=True, render_kwargs=render_kwargs, use_color=use_color) r = path["rewards"] if verbose: print(step, len(r), sum(r), end='\r') if not cherrypick: break return path, eval_env
def simulate_policy(args): # import torch # torch.manual_seed(6199) if args.pause: import ipdb ipdb.set_trace() data = pickle.load(open(args.file, "rb")) policy = data['algorithm'].policy num_blocks = 6 stack_only = True # env = data['env'] env = gym.make( F"FetchBlockConstruction_{num_blocks}Blocks_IncrementalReward_DictstateObs_42Rendersize_{stack_only}Stackonly_AllCase-v1" ) env = Monitor(env, force=True, directory="videos/", video_callable=lambda x: x) print("Policy and environment loaded") if args.gpu: ptu.set_gpu_mode(True) policy.to(ptu.device) if args.enable_render or hasattr(env, 'enable_render'): # some environments need to be reconfigured for visualization env.enable_render() policy.train(False) failures = [] successes = [] for path_idx in range(100): path = multitask_rollout( env, policy, max_path_length=num_blocks * 50, animated=not args.hide, observation_key='observation', desired_goal_key='desired_goal', get_action_kwargs=dict(mask=np.ones((1, num_blocks)), deterministic=True), ) if not is_solved(path, num_blocks): failures.append(path) print(F"Failed {path_idx}") else: print(F"Succeeded {path_idx}") successes.append(path) # if hasattr(env, "log_diagnostics"): # env.log_diagnostics(paths) # if hasattr(env, "get_diagnostics"): # for k, v in env.get_diagnostics(paths).items(): # logger.record_tabular(k, v) # logger.dump_tabular() print(f"Success rate {len(successes)/(len(successes) + len(failures))}") from rlkit.core.eval_util import get_generic_path_information path_info = get_generic_path_information(successes + failures, num_blocks=num_blocks) print(path_info)
import numpy as np
from gym.envs.mujoco import HalfCheetahEnv

import rlkit.torch.pytorch_util as ptu
from rlkit.envs.wrappers import NormalizedBoxEnv
from rlkit.launchers.launcher_util import setup_logger
from rlkit.torch.sac.policies import TanhGaussianPolicy
from rlkit.torch.sac.sac import SoftActorCritic
from rlkit.torch.networks import FlattenMlp
import rlkit.torch.pytorch_util as U
from rlkit.envs.mujoco_manip_env import MujocoManipEnv

# Sets the GPU mode.
USE_GPU = True
U.set_gpu_mode(USE_GPU)

EXPERIMENT_NAME = "cans-50-50-reward-scale-1"
#EXPERIMENT_NAME = "pegs-50-50-reward-scale-0.1"
#EXPERIMENT_NAME = "lift-lr-1e-4"

HORIZON = 250
UPDATES_PER_STEP = 1
REWARD_SCALE = 1

# DEMO_PATH = None
DEMO_PATH = "/home/robot/Downloads/test_extraction/bins-Can0-sars.pkl"
MIX_DEMO = True

ACTION_SKIP = 1
LR = 3E-4
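# A minimal sketch (not from the original script) of how the constants above
# might be gathered into a variant dict and registered with rlkit's logger;
# the variant keys are illustrative placeholders.
variant = dict(
    horizon=HORIZON,
    updates_per_step=UPDATES_PER_STEP,
    reward_scale=REWARD_SCALE,
    demo_path=DEMO_PATH,
    mix_demo=MIX_DEMO,
    action_skip=ACTION_SKIP,
    lr=LR,
)
setup_logger(EXPERIMENT_NAME, variant=variant)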
def experiment(exp_specs):
    ptu.set_gpu_mode(exp_specs['use_gpu'])

    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    img_save_path = 'junk_vis/debug_more_proper'

    # Prep the data -----------------------------------------------------------
    data_path = 'junk_vis/multi_mnist_data'
    canvas_size = 36
    (X_train, _), (X_test, _) = multi_mnist(data_path,
                                            max_digits=1,
                                            canvas_size=canvas_size,
                                            seed=42,
                                            use_max=True)
    X_train = X_train[:, None, ...]
    X_test = X_test[:, None, ...]
    X_train, X_test = torch.FloatTensor(X_train) / 255.0, torch.FloatTensor(X_test) / 255.0

    # np_imgs = np.load('/u/kamyar/dsprites-dataset/dsprites_ndarray_co1sh3sc6or40x32y32_64x64.npz')['imgs']
    # np_imgs = None

    X_train = torch.clamp(X_train, 0.05, 0.95)
    X_test = torch.clamp(X_test, 0.05, 0.95)
    train_ds = TensorDataset(X_train)
    val_ds = TensorDataset(X_test)

    # Model Definition --------------------------------------------------------
    if exp_specs['masked']:
        model = MaskedVAE(
            [1, canvas_size, canvas_size],
            exp_specs['vae_specs']['z_dim'],
            exp_specs['vae_specs']['encoder_specs'],
            exp_specs['vae_specs']['decoder_specs'],
        )
    else:
        model = VAE(
            [1, canvas_size, canvas_size],
            exp_specs['vae_specs']['z_dim'],
            exp_specs['vae_specs']['encoder_specs'],
            exp_specs['vae_specs']['decoder_specs'],
        )
    if ptu.gpu_enabled():
        model.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(model.parameters(),
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    global_iter = 0
    for epoch in range(exp_specs['epochs']):
        train_loader = DataLoader(train_ds,
                                  batch_size=exp_specs['batch_size'],
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=True,
                                  drop_last=True)
        for iter_num, img_batch in enumerate(train_loader):
            img_batch = img_batch[0]
            if ptu.gpu_enabled():
                img_batch = img_batch.cuda()

            z_mean, z_log_cov, recon_mean, recon_log_cov, enc_mask, dec_mask = model(img_batch)
            elbo, KL = model.compute_ELBO(z_mean,
                                          z_log_cov,
                                          recon_mean,
                                          recon_log_cov,
                                          img_batch,
                                          average_over_batch=True)
            loss = -1. * elbo
            model_optim.zero_grad()  # reset gradients before the backward pass
            loss.backward()
            model_optim.step()

            if global_iter % 1000 == 0:
                mse = ((recon_mean - img_batch)**2).mean()
                print('\nTraining Iter %d...' % global_iter)
                print('ELBO:\t%.4f' % elbo)
                print('MSE:\t%.4f' % mse)
                print('KL:\t%.4f' % KL)
                save_pytorch_tensor_as_img(
                    img_batch[0].data.cpu(),
                    os.path.join(img_save_path, '%d_train_img.png' % (global_iter)))
                save_pytorch_tensor_as_img(
                    recon_mean[0].data.cpu(),
                    os.path.join(img_save_path, '%d_train_recon.png' % (global_iter)))
                if exp_specs['masked']:
                    save_pytorch_tensor_as_img(
                        enc_mask[0].data.cpu(),
                        os.path.join(img_save_path, '%d_train_enc_mask.png' % (global_iter)))
                    # save_pytorch_tensor_as_img(dec_mask[0].data.cpu(), os.path.join(img_save_path, '%d_train_dec_mask.png'%(global_iter)))

            if global_iter % exp_specs['freq_val'] == 0:
                with torch.no_grad():
                    print('Validating Iter %d...' % global_iter)
                    model.eval()

                    idxs = np.random.choice(int(X_test.size(0)),
                                            size=exp_specs['batch_size'],
                                            replace=False)
                    img_batch = X_test[idxs]
                    if ptu.gpu_enabled():
                        img_batch = img_batch.cuda()

                    z_mean, z_log_cov, recon_mean, recon_log_cov, enc_mask, dec_mask = model(img_batch)
                    elbo, KL = model.compute_ELBO(z_mean,
                                                  z_log_cov,
                                                  recon_mean,
                                                  recon_log_cov,
                                                  img_batch,
                                                  average_over_batch=True)
                    mse = ((recon_mean - img_batch)**2).mean()

                    print('ELBO:\t%.4f' % elbo)
                    print('MSE:\t%.4f' % mse)
                    print('KL:\t%.4f' % KL)

                    for i in range(1):
                        save_pytorch_tensor_as_img(
                            img_batch[i].data.cpu(),
                            os.path.join(img_save_path, '%d_%d_img.png' % (global_iter, i)))
                        save_pytorch_tensor_as_img(
                            recon_mean[i].data.cpu(),
                            os.path.join(img_save_path, '%d_%d_recon.png' % (global_iter, i)))
                        if exp_specs['masked']:
                            save_pytorch_tensor_as_img(
                                enc_mask[i].data.cpu(),
                                os.path.join(img_save_path, '%d_%d_enc_mask.png' % (global_iter, i)))
                            # save_pytorch_tensor_as_img(dec_mask[i].data.cpu(), os.path.join(img_save_path, '%d_%d_dec_mask.png'%(global_iter, i)))

                    model.train()

            global_iter += 1
        discount=0.99,
        soft_target_tau=5e-3,
        target_update_period=1,
        policy_lr=3E-4,
        qf_lr=3E-4,
        reward_scale=1,
        use_automatic_entropy_tuning=True,
    ),
    replay_buffer_kwargs=dict(
        max_size=int(1E6),
        fraction_goals_rollout_goals=.2,
        fraction_goals_env_goals=0,
    ),
    qf_kwargs=dict(
        hidden_sizes=[400, 300],
    ),
    policy_kwargs=dict(
        hidden_sizes=[400, 300],
    ),
)


def get_name(v):
    name = '_'.join([v['env_name'], v['algorithm'], v['title']])
    return name


if variant['save']:
    name = get_name(variant)
    setup_logger(name, variant=variant)

# optionally set the GPU (default=False)
ptu.set_gpu_mode(True, gpu_id=0)
experiment(variant)
def offpolicy_inference(seed,
                        env_name,
                        det,
                        load_name,
                        evaluation,
                        render,
                        knob_noisy,
                        visionnet_input,
                        env_kwargs,
                        actor_critic=None,
                        verbose=True,
                        pos_control=True,
                        step_skip=4):
    import time
    from gym import wrappers

    print("evaluation started!")

    filename = str(uuid.uuid4())
    gpu = True

    env, _, _ = prepare_env(env_name, **env_kwargs)

    if not actor_critic:
        snapshot = torch.load(load_name)
        policy = snapshot['evaluation/policy']
    else:
        policy = actor_critic

    if env_name.find('doorenv') > -1:
        policy.knob_noisy = knob_noisy
        policy.nn = env._wrapped_env.nn
        policy.visionnet_input = env_kwargs['visionnet_input']

    epi_counter = 1
    dooropen_counter = 0
    total_time = 0
    test_num = 100
    start_time = int(time.mktime(time.localtime()))

    if gpu:
        set_gpu_mode(True)

    while True:
        # print("new env")
        if env_name.find('doorenv') > -1:
            if evaluation:
                path, door_opened, opening_time = rollout(
                    env,
                    policy,
                    max_path_length=512,
                    render=render,
                    evaluate=evaluation,
                    verbose=True,
                    doorenv=True,
                    pos_control=pos_control,
                    step_skip=step_skip,
                )
                if hasattr(env, "log_diagnostics"):
                    env.log_diagnostics([path])
                logger.dump_tabular()

                # if evaluation:
                #     print("1")
                env, _, _ = prepare_env(env_name, **env_kwargs)
                if door_opened:
                    dooropen_counter += 1
                    total_time += opening_time
                if verbose:
                    print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
                    eval_print(dooropen_counter, epi_counter, start_time, total_time)
            else:
                path = rollout(
                    env,
                    policy,
                    max_path_length=512,
                    render=render,
                    evaluate=evaluation,
                    verbose=True,
                    doorenv=True,
                    pos_control=pos_control,
                    step_skip=step_skip,
                )
                if hasattr(env, "log_diagnostics"):
                    env.log_diagnostics([path])
                logger.dump_tabular()
        else:
            path = rollout(
                env,
                policy,
                max_path_length=512,
                doorenv=False,
                render=render,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()

        if evaluation:
            if verbose:
                print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
                eval_print(dooropen_counter, epi_counter, start_time, total_time)
            epi_counter += 1

        if env_name.find('door') > -1 and epi_counter > test_num:
            if verbose:
                print("dooropening counter:", dooropen_counter, " epi counter:", epi_counter)
                eval_print(dooropen_counter, epi_counter, start_time, total_time)
            break

    opening_rate, opening_timeavg = eval_print(dooropen_counter, epi_counter - 1, start_time, total_time)
    return opening_rate, opening_timeavg
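# `eval_print` is called above but not defined in this snippet. Below is a
# hypothetical reconstruction, assuming it reports and returns the door-opening
# rate and the average opening time; the exact formatting is a guess.
def eval_print(dooropen_counter, epi_counter, start_time, total_time):
    import time
    opening_rate = 100.0 * dooropen_counter / max(epi_counter, 1)
    opening_timeavg = total_time / max(dooropen_counter, 1)
    elapsed = int(time.mktime(time.localtime())) - start_time
    print("opened {}/{} doors ({:.1f}%), avg opening time {:.2f}s, elapsed {}s".format(
        dooropen_counter, epi_counter, opening_rate, opening_timeavg, elapsed))
    return opening_rate, opening_timeavg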
def simulate_policy(args): data = joblib.load(args.file) if 'eval_policy' in data: policy = data['eval_policy'] elif 'policy' in data: policy = data['policy'] elif 'exploration_policy' in data: policy = data['exploration_policy'] else: raise Exception("No policy found in loaded dict. Keys: {}".format( data.keys())) max_tau = get_max_tau(args) env = data['env'] env.mode("video_env") env.decode_goals = True if hasattr(env, 'enable_render'): # some environments need to be reconfigured for visualization env.enable_render() if args.gpu: set_gpu_mode(True) policy.to(ptu.device) if hasattr(env, "vae"): env.vae.to(ptu.device) else: # make sure everything is on the CPU set_gpu_mode(False) policy.cpu() if hasattr(env, "vae"): env.vae.cpu() if args.pause: import ipdb ipdb.set_trace() if isinstance(policy, PyTorchModule): policy.train(False) ROWS = 3 COLUMNS = 6 dirname = osp.dirname(args.file) input_file_name = os.path.splitext(os.path.basename(args.file))[0] filename = osp.join(dirname, "video_{}.mp4".format(input_file_name)) rollout_function = create_rollout_function( tdm_rollout, init_tau=max_tau, observation_key='observation', desired_goal_key='desired_goal', ) paths = dump_video( env, policy, filename, rollout_function, ROWS=ROWS, COLUMNS=COLUMNS, horizon=args.H, dirname_to_save_images=dirname, subdirname="rollouts_" + input_file_name, ) if hasattr(env, "log_diagnostics"): env.log_diagnostics(paths) logger.dump_tabular()
    ),
    qf_kwargs=dict(
        hidden_sizes=[400, 300],
    ),
    policy_kwargs=dict(
        hidden_sizes=[400, 300],
    ),
    save_video=True,
    dump_video_kwargs=dict(
        save_period=1,
        # imsize=(3, 500, 300),
    )
)

ptu.set_gpu_mode(True)  # set_gpu_mode expects a boolean flag

representation_size = 128
output_classes = 20

model_class = variant.get('model_class', TimestepPredictionModel)
model = model_class(
    representation_size,
    # decoder_output_activation=decoder_activation,
    output_classes=output_classes,
    **variant['model_kwargs'],
)
# model = torch.nn.DataParallel(model)

imagenets = [True, False]
reg_types = ["regression_distance", "latent_distance"]
def experiment(variant): args.grayscale = variant['grayscale'] def make_my_env(args, rank): def thunk(): _env = grounding_env.GroundingEnv(args, args.seed + rank, img_encoder=None, fixed=False, manual_set_task=True, n_stack=variant['n_stack']) _env.game_init() _env.tasks = _env.sample_tasks(variant['task_params']['n_tasks'], variants=variant['all_tasks']) return _env return thunk task_params = variant['task_params'] # env = NormalizedBoxEnv(AntGoalEnv(n_tasks=task_params['n_tasks'], use_low_gear_ratio=task_params['low_gear'])) env = make_my_env(args, 0)() # import time # def make_envs(): # t0 = time.time() # envs = SubprocVecEnv([make_my_env(args, i) for i in range(10)]) # print('TIMING', time.time() - t0) # import pdb; pdb.set_trace() ptu.set_gpu_mode(variant['use_gpu'], variant['gpu_id']) tasks = env.get_all_task_idx() pix_dim = int(np.prod(env.observation_space.shape)) obs_dim = variant['algo_params']['obs_emb_dim'] action_dim = env.action_space.n # int(np.prod(env.action_space.shape)) latent_dim = 5 task_enc_output_dim = latent_dim * 2 if variant['algo_params'][ 'use_information_bottleneck'] else latent_dim reward_dim = 1 net_size = variant['net_size'] # start with linear task encoding recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder nchan = 1 if variant['grayscale'] else 3 n_layers = variant['n_layers'] cnn_enc = CNNEncoder( 64, 64, nchan * variant['n_stack'], obs_dim, [8, 4, 3, 3], #kernels [256, 64, 64, 64], #channels [2, 2, 2, 2], # strides [1, 1, 1, 1], # padding # hidden_sizes=[256], added_fc_input_size=0, batch_norm_conv=False, batch_norm_fc=False, init_w=1e-4, # hidden_init=nn.init.xavier_uniform_, # hidden_activation=nn.ReLU(), # output_activation=identity, ) task_enc = encoder_model( hidden_sizes=[200] * n_layers, # deeper net + higher dim space generalize better input_size=obs_dim + action_dim + reward_dim, output_size=task_enc_output_dim, ) qf1 = FlattenMlp( hidden_sizes=[net_size] * n_layers, input_size=obs_dim + latent_dim, output_size=action_dim, ) qf2 = FlattenMlp( hidden_sizes=[net_size] * n_layers, input_size=obs_dim + latent_dim, output_size=action_dim, ) vf = FlattenMlp( hidden_sizes=[net_size] * n_layers, #, net_size], input_size=obs_dim + latent_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size] * n_layers, # net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=action_dim, ) agent = ProtoAgent(latent_dim, [task_enc, cnn_enc, policy, qf1, qf2, vf], **variant['algo_params']) n_eval_tasks = int(variant['task_params']['n_tasks'] * 0.3) algorithm = ProtoSoftActorCritic( env=env, train_tasks=list(tasks[:-n_eval_tasks]), eval_tasks=list(tasks[-n_eval_tasks:]), nets=[agent, task_enc, policy, qf1, qf2, vf], latent_dim=latent_dim, **variant['algo_params']) if ptu.gpu_enabled(): algorithm.to() algorithm.train()
def experiment(variant): # create multi-task environment and sample tasks env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params'])) tasks = env.get_all_task_idx() obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) reward_dim = 1 # instantiate networks latent_dim = variant['latent_size'] context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[ 'algo_params'][ 'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][ 'use_information_bottleneck'] else latent_dim net_size = variant['net_size'] recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder context_encoder = encoder_model( hidden_sizes=[200, 200, 200], input_size=context_encoder_input_dim, output_size=context_encoder_output_dim, ) #low Qs first and then high Qs q_list = [[ FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=2 * obs_dim + action_dim, output_size=1, ), FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=2 * obs_dim + action_dim, output_size=1, ) ], [ FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ), FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) ]] #low vf first and then high vf vf_list = [ FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=2 * obs_dim, output_size=1, ), FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + latent_dim, output_size=1, ) ] #NOTE: Reduced number of hidden layers in h_policy from 3 to 2 (idea being it's not doing as much as the whole policy in PEARL) h_policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=obs_dim, ) #NOTE: Kept the 3 layers because f**k it it'll get tons of data l_policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size, net_size, net_size], obs_dim=2 * obs_dim, latent_dim=0, action_dim=action_dim, ) #TODO Implement BernAgent agent = BURNAgent(latent_dim, context_encoder, h_policy, l_policy, c=2, **variant['algo_params']) algorithm = BURNSoftActorCritic( env=env, train_tasks=list(tasks[:variant['n_train_tasks']]), eval_tasks=list(tasks[-variant['n_eval_tasks']:]), nets=[agent, q_list, vf_list], latent_dim=latent_dim, **variant['algo_params']) # optionally load pre-trained weights #TODO Make sure weights are properly saved if variant['path_to_weights'] is not None: path = variant['path_to_weights'] context_encoder.load_state_dict( torch.load(os.path.join(path, 'context_encoder.pth'))) q_list[0][0].load_state_dict( torch.load(os.path.join(path, 'l_qf1.pth'))) q_list[0][1].load_state_dict( torch.load(os.path.join(path, 'l_qf2.pth'))) q_list[1][0].load_state_dict( torch.load(os.path.join(path, 'h_qf1.pth'))) q_list[1][1].load_state_dict( torch.load(os.path.join(path, 'h_qf2.pth'))) vf_list[0].load_state_dict(torch.load(os.path.join(path, 'l_vf.pth'))) vf_list[1].load_state_dict(torch.load(os.path.join(path, 'h_vf.pth'))) # TODO hacky, revisit after model refactor algorithm.networks[-2].load_state_dict( torch.load(os.path.join(path, 'target_vf.pth'))) h_policy.load_state_dict(torch.load(os.path.join(path, 'h_policy.pth'))) l_policy.load_state_dict(torch.load(os.path.join(path, 'l_policy.pth'))) # optional GPU mode ptu.set_gpu_mode(variant['util_params']['use_gpu'], 
variant['util_params']['gpu_id']) if ptu.gpu_enabled(): algorithm.to() # debugging triggers a lot of printing and logs to a debug directory DEBUG = variant['util_params']['debug'] os.environ['DEBUG'] = str(int(DEBUG)) # create logging directory # TODO support Docker exp_id = 'debug' if DEBUG else None experiment_log_dir = setup_logger( variant['env_name'], variant=variant, exp_id=exp_id, base_log_dir=variant['util_params']['base_log_dir']) # optionally save eval trajectories as pkl files if variant['algo_params']['dump_eval_paths']: pickle_dir = experiment_log_dir + '/eval_trajectories' pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True) # run the algorithm algorithm.train()
def visualize_policy(args): variant_overwrite = dict( params_pkl=args.params_pkl, num_historical_policies=args.num_historical_policies, env_kwargs=dict( reward_type='indicator', sample_goal=False, shape_rewards=False, distance_threshold=0.1, terminate_upon_success=False, terminate_upon_failure=False, )) if args.logdir == '': variant = variant_overwrite env = NormalizedBoxEnv( ManipulationEnv(**variant_overwrite['env_kwargs'])) eval_policy = RandomPolicy(env.action_space) else: env, _, data, variant = load_experiment(args.logdir, variant_overwrite) eval_policy = data[ 'eval_policy'] if args.use_deterministic_policy else data['policy'] if not args.cpu: set_gpu_mode(True) eval_policy.cuda() print("Loaded policy:", eval_policy) if 'smm_kwargs' in variant: # Iterate through each latent-conditioned policy. num_skills = variant['smm_kwargs']['num_skills'] print('Running SMM policy with {} skills.'.format(num_skills)) import rlkit.torch.smm.utils as utils class PartialPolicy: def __init__(polself, policy): polself._policy = policy polself._num_skills = num_skills polself._z = -1 polself.reset() def get_action(polself, ob): aug_ob = utils.concat_ob_z(ob, polself._z, polself._num_skills) return polself._policy.get_action(aug_ob) def sample_skill(polself): z = np.random.choice(polself._num_skills) return z def reset(polself): polself._z = (polself._z + 1) % polself._num_skills print("Using skill z:", polself._z) return polself._policy.reset() eval_policy = PartialPolicy(eval_policy) paths = [] for _ in range(args.num_episodes): eval_policy.reset() path = rollout( env, eval_policy, max_path_length=args.max_path_length, animated=(not args.norender), ) paths.append(path) if hasattr(env, "log_diagnostics"): env.log_diagnostics(paths) if hasattr(env, "get_diagnostics"): diagnostics = env.get_diagnostics(paths) for key, val in diagnostics.items(): logger.record_tabular(key, val) logger.dump_tabular(with_prefix=False, with_timestamp=False) if hasattr(env, "draw"): env.draw(paths, save_dir="")
def experiment(variant, seed=None): # create multi-task environment and sample tasks, normalize obs if provided with 'normalizer.npz' if 'normalizer.npz' in os.listdir(variant['algo_params']['data_dir']): obs_absmax = np.load(os.path.join(variant['algo_params']['data_dir'], 'normalizer.npz'))['abs_max'] env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']), obs_absmax=obs_absmax) else: env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params'])) if seed is not None: global_seed(seed) env.seed(seed) tasks = env.get_all_task_idx() obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) reward_dim = 1 # instantiate networks latent_dim = variant['latent_size'] context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant['algo_params']['use_next_obs_in_context'] else obs_dim + action_dim + reward_dim context_encoder_output_dim = latent_dim * 2 if variant['algo_params']['use_information_bottleneck'] else latent_dim net_size = variant['net_size'] recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder context_encoder = encoder_model( hidden_sizes=[200, 200, 200], input_size=context_encoder_input_dim, output_size=context_encoder_output_dim, output_activation=torch.tanh, ) qf1 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + latent_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size, net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=action_dim, ) agent = PEARLAgent( latent_dim, context_encoder, policy, **variant['algo_params'] ) if variant['algo_type'] == 'FOCAL': # critic network for divergence in dual form (see BRAC paper https://arxiv.org/abs/1911.11361) c = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1 ) if 'randomize_tasks' in variant.keys() and variant['randomize_tasks']: rng = default_rng() train_tasks = rng.choice(len(tasks), size=variant['n_train_tasks'], replace=False) eval_tasks = set(range(len(tasks))).difference(train_tasks) if 'goal_radius' in variant['env_params']: algorithm = FOCALSoftActorCritic( env=env, train_tasks=train_tasks, eval_tasks=eval_tasks, nets=[agent, qf1, qf2, vf, c], latent_dim=latent_dim, goal_radius=variant['env_params']['goal_radius'], **variant['algo_params'] ) else: algorithm = FOCALSoftActorCritic( env=env, train_tasks=list(tasks[:variant['n_train_tasks']]), eval_tasks=list(tasks[-variant['n_eval_tasks']:]), nets=[agent, qf1, qf2, vf, c], latent_dim=latent_dim, **variant['algo_params'] ) else: if 'goal_radius' in variant['env_params']: algorithm = FOCALSoftActorCritic( env=env, train_tasks=list(tasks[:variant['n_train_tasks']]), eval_tasks=list(tasks[-variant['n_eval_tasks']:]), nets=[agent, qf1, qf2, vf, c], latent_dim=latent_dim, goal_radius=variant['env_params']['goal_radius'], **variant['algo_params'] ) else: algorithm = FOCALSoftActorCritic( env=env, train_tasks=list(tasks[:variant['n_train_tasks']]), eval_tasks=list(tasks[-variant['n_eval_tasks']:]), nets=[agent, qf1, qf2, vf, c], latent_dim=latent_dim, **variant['algo_params'] ) else: NotImplemented # optional GPU mode 
ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['util_params']['gpu_id']) if ptu.gpu_enabled(): algorithm.to() # debugging triggers a lot of printing and logs to a debug directory DEBUG = variant['util_params']['debug'] os.environ['DEBUG'] = str(int(DEBUG)) # create logging directory # TODO support Docker exp_id = 'debug' if DEBUG else None experiment_log_dir = setup_logger( variant['env_name'], variant=variant, exp_id=exp_id, base_log_dir=variant['util_params']['base_log_dir'], seed=seed, snapshot_mode="all" ) # optionally save eval trajectories as pkl files if variant['algo_params']['dump_eval_paths']: pickle_dir = experiment_log_dir + '/eval_trajectories' pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True) # run the algorithm algorithm.train()
    version="normal",
    env_name=args.env,
    layer_size=256,
    replay_buffer_size=int(1E6),
    algorithm_kwargs=dict(
        num_epochs=3000,
        num_eval_steps_per_epoch=5000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=1000,
        batch_size=256,
    ),
    trainer_kwargs=dict(
        discount=0.99,
        soft_target_tau=5e-3,
        target_update_period=1,
        policy_lr=3E-4,
        qf_lr=3E-4,
        reward_scale=1,
        use_automatic_entropy_tuning=True,
    ),
)

exp_dir = '{}'.format(args.env)
print('experiment dir:logs/{}'.format(exp_dir))
setup_logger(variant=variant, log_dir='logs/{}'.format(exp_dir))

ptu.set_gpu_mode(True, gpu_id=args.device_id)
print('using gpu:{}'.format(args.device_id))
# ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

experiment(variant)
def setup_and_run(variant): ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['seed'] % variant['util_params']['num_gpus']) #setup env env_name = variant['env_name'] env_params = variant['env_params'] env_params['n_tasks'] = variant["n_train_tasks"] + variant["n_eval_tasks"] env = NormalizedBoxEnv(ENVS[env_name](**env_params)) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) latent_dim = variant['latent_size'] reward_dim = 1 #setup encoder context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[ 'algo_params'][ 'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][ 'use_information_bottleneck'] else latent_dim net_size = variant['net_size'] recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder context_encoder = encoder_model( hidden_sizes=[200, 200, 200], input_size=context_encoder_input_dim, output_size=context_encoder_output_dim, ) #setup actor, critic qf1 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) target_qf1 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) target_qf2 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size, net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=action_dim, ) agent = PEARLAgent(latent_dim, context_encoder, policy, **variant['algo_params']) algorithm = PEARLSoftActorCritic( env=env, train_tasks=list(np.arange(variant['n_train_tasks'])), eval_tasks=list( np.arange(variant['n_train_tasks'], variant['n_train_tasks'] + variant['n_eval_tasks'])), nets=[agent, qf1, qf2, target_qf1, target_qf2], latent_dim=latent_dim, **variant['algo_params']) # optionally load pre-trained weights if variant['path_to_weights'] is not None: path = variant['path_to_weights'] context_encoder.load_state_dict( torch.load(os.path.join(path, 'context_encoder.pth'))) qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth'))) qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth'))) target_qf1.load_state_dict( torch.load(os.path.join(path, 'target_qf1.pth'))) target_qf2.load_state_dict( torch.load(os.path.join(path, 'target_qf2.pth'))) # TODO hacky, revisit after model refactor policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth'))) if ptu.gpu_enabled(): algorithm.to() os.environ['DEBUG'] = str(int(variant['util_params']['debug'])) #setup logger run_mode = variant['run_mode'] exp_log_name = os.path.join( variant['env_name'], run_mode, variant['log_annotation'] + variant['variant_name'], 'seed-' + str(variant['seed'])) setup_logger(exp_log_name, variant=variant, exp_id=None, base_log_dir=os.environ.get('PEARL_DATA_PATH'), snapshot_mode='gap', snapshot_gap=10) # run the algorithm if run_mode == 'TRAIN': algorithm.train() elif run_mode == 'EVAL': assert variant['algo_params']['dump_eval_paths'] == True algorithm._try_to_eval() else: algorithm.eval_with_loaded_latent()
def experiment(variant): # create multi-task environment and sample tasks env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params'])) tasks = env.get_all_task_idx() obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) reward_dim = 1 # instantiate networks latent_dim = variant['latent_size'] context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[ 'algo_params'][ 'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][ 'use_information_bottleneck'] else latent_dim net_size = variant['net_size'] recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder context_encoder = encoder_model( hidden_sizes=[200, 200, 200], input_size=context_encoder_input_dim, output_size=context_encoder_output_dim, ) qf1 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + latent_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size, net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=action_dim, ) agent = PEARLAgent(latent_dim, context_encoder, policy, **variant['algo_params']) algorithm = PEARLSoftActorCritic( env=env, train_tasks=list(tasks[:variant['n_train_tasks']]), eval_tasks=list(tasks[-variant['n_eval_tasks']:]), nets=[agent, qf1, qf2, vf], latent_dim=latent_dim, **variant['algo_params']) # optionally load pre-trained weights if variant['path_to_weights'] is not None: path = variant['path_to_weights'] context_encoder.load_state_dict( torch.load(os.path.join(path, 'context_encoder.pth'))) qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth'))) qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth'))) vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth'))) # TODO hacky, revisit after model refactor algorithm.networks[-2].load_state_dict( torch.load(os.path.join(path, 'target_vf.pth'))) policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth'))) # optional GPU mode ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['util_params']['gpu_id']) if ptu.gpu_enabled(): algorithm.to() # debugging triggers a lot of printing and logs to a debug directory DEBUG = variant['util_params']['debug'] os.environ['DEBUG'] = str(int(DEBUG)) # create logging directory # TODO support Docker exp_id = 'debug' if DEBUG else None experiment_log_dir = setup_logger( variant['env_name'], variant=variant, exp_id=exp_id, base_log_dir=variant['util_params']['base_log_dir']) # optionally save eval trajectories as pkl files if variant['algo_params']['dump_eval_paths']: pickle_dir = experiment_log_dir + '/eval_trajectories' pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True) # run the algorithm algorithm.train()
def experiment(variant): task_params = variant['task_params'] env = NormalizedBoxEnv( AntGoalEnv(n_tasks=task_params['n_tasks'], use_low_gear_ratio=task_params['low_gear'])) ptu.set_gpu_mode(variant['use_gpu'], variant['gpu_id']) tasks = env.get_all_task_idx() obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) latent_dim = 5 task_enc_output_dim = latent_dim * 2 if variant['algo_params'][ 'use_information_bottleneck'] else latent_dim reward_dim = 1 net_size = variant['net_size'] # start with linear task encoding recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder task_enc = encoder_model( hidden_sizes=[200, 200, 200], # deeper net + higher dim space generalize better input_size=obs_dim + action_dim + reward_dim, output_size=task_enc_output_dim, ) qf1 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + latent_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size, net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=action_dim, ) agent = ProtoAgent(latent_dim, [task_enc, policy, qf1, qf2, vf], **variant['algo_params']) algorithm = ProtoSoftActorCritic( env=env, train_tasks=list(tasks[:-30]), eval_tasks=list(tasks[-30:]), nets=[agent, task_enc, policy, qf1, qf2, vf], latent_dim=latent_dim, **variant['algo_params']) if ptu.gpu_enabled(): algorithm.to() algorithm.train()
    disc_kwargs=dict(
        batch_size=256,
        num_batches_per_fit=1,
        num_skills=args.num_skills,
        sampling_strategy=sampling_strategy,
        sampling_window=10,
    ),
    env_kwargs=dict(
        reward_params=dict(type=algo),
        unsupervised_reward_weight=args.unsupervised_reward_weight,
        reward_weight=args.environment_reward_weight),
    net_size=300,
    experiment=args.algo,
)

ptu.set_gpu_mode(True, 0)  # optionally set the GPU (default=False)

if algo == 'wrapped_env':
    setup_logger(
        'CAMERA_READY_EXPERIMENTS/{}/env_weight_{}/seed{}/replay_buffer_size_{}/num_skills_{}/target_entropy_multiplier_{}/action_noise_{}'
        .format(args.env, args.environment_reward_weight, args.seed,
                args.replay_buffer_size, args.num_skills,
                args.target_entropy_multiplier, args.noise_scale),
        variant=variant)
elif algo == 'diayn':
    setup_logger(
        'CAMERA_READY_EXPERIMENTS/{}/unsupervised_weight_{}/seed{}/replay_buffer_size_{}/num_skills_{}/target_entropy_multiplier_{}/action_noise_{}'
        .format(args.env, args.unsupervised_reward_weight, args.seed,
                args.replay_buffer_size, args.num_skills,
                args.target_entropy_multiplier, args.noise_scale),
        variant=variant)
else:
                           labels[num_context_points:]).type(torch.FloatTensor).mean()
    print('Meta-Test Loss: %.4f' % loss)
    print('Meta-Test Acc Ctxt: %.4f' % context_accuracy)
    print('Meta-Test Acc Test: %.4f' % test_accuracy)
    model.train()

    return 1


if __name__ == '__main__':
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--experiment', help='experiment specification file')
    args = parser.parse_args()
    with open(args.experiment, 'r') as spec_file:
        spec_string = spec_file.read()
        # recent PyYAML versions require an explicit Loader
        exp_specs = yaml.load(spec_string, Loader=yaml.FullLoader)

    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    if exp_specs['use_gpu']:
        ptu.set_gpu_mode(True)

    experiment(exp_specs)
    ),
    policy_kwargs=dict(
        hidden_dim=args.hidden,
        num_layer=args.layer,
    ),
    replay_buffer_size=int(1E6),
)

import os
if not os.path.isdir(log_dir):
    os.makedirs(log_dir)
with open(osp.join(log_dir, 'variant.json'), 'w') as out_json:
    import json
    json.dump(variant, out_json, indent=2)

import sys
cmd_input = 'python ' + ' '.join(sys.argv) + '\n'
with open(osp.join(log_dir, 'cmd_input.txt'), 'a') as f:
    f.write(cmd_input)

setup_logger(args.exp_name + '/' + main_dir,
             variant=variant,
             snapshot_mode=args.snapshot_mode,
             snapshot_gap=args.snapshot_gap,
             log_dir=log_dir)

import numpy as np
import torch
np.random.seed(args.seed)
torch.manual_seed(args.seed)

if isinstance(args.gpu, int):
    print('using gpu ', args.gpu)
    ptu.set_gpu_mode(True, gpu_id=args.gpu)

experiment(variant)
from agent.agent import Agent  # TODO better naming here
from rlkit.torch.sac.sac import SACTrainer
from rlkit.torch.networks import FlattenMlp
from rlkit.torch.sac.policies import TanhGaussianPolicy
import torch
import rlkit.torch.pytorch_util as torch_util
from agent.mem import Mem
import os
import pickle
import config as run_config

if torch.cuda.is_available():
    torch_util.set_gpu_mode(True)

log = run_config.log()


class SAC(Agent):
    def __init__(self, env, eval_env, mem, nets, train_step_params):
        super().__init__(env, eval_env, mem, nets, train_step_params)
        self._mem = mem
        self._env = env
        self._eval_env = eval_env
        self._policy_net, self._q1_net, self._q2_net, self._target_q1_net, \
            self._target_q2_net = nets['policy_net'], nets['q1_net'], nets['q2_net'], \
            nets['target_q1_net'], nets['target_q2_net']
        self._train_step_params = train_step_params
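# A minimal sketch of how the `nets` dict consumed by SAC.__init__ above might
# be constructed with standard rlkit networks; obs_dim, action_dim and the
# hidden sizes are placeholder assumptions, not values from this project.
def build_nets(obs_dim, action_dim, hidden_sizes=(256, 256)):
    def make_q():
        # state-action value network: concatenates (obs, action) and outputs a scalar
        return FlattenMlp(input_size=obs_dim + action_dim,
                          output_size=1,
                          hidden_sizes=list(hidden_sizes))

    policy_net = TanhGaussianPolicy(obs_dim=obs_dim,
                                    action_dim=action_dim,
                                    hidden_sizes=list(hidden_sizes))
    return dict(policy_net=policy_net,
                q1_net=make_q(),
                q2_net=make_q(),
                target_q1_net=make_q(),
                target_q2_net=make_q())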