def simulate_policy(args):
    #data = torch.load(args.file)
    variant, data = doc.load_rklit_file(args.session_name)
    if args.mode == 'eval':
        policy = data['evaluation/policy']
    elif args.mode == 'expl':
        policy = data['exploration/policy']
    else:
        raise ValueError("Unknown mode '" + args.mode + "', expected 'eval' or 'expl'")

    #env = data['evaluation/env']
    environment = stuff.NormalizedActions(
        env.DeepBuilderEnv(args.session_name, 6, 7, 20, 12))
    environment.env.is_simulation = args.simulation == 1
    print("Policy loaded")

    set_gpu_mode(True)
    policy.cuda()

    # Roll out the loaded policy indefinitely, logging diagnostics after each episode.
    while True:
        path = rollout(
            environment,
            policy,
            #max_path_length=args.H,
            render=False,
        )
        if hasattr(environment, "log_diagnostics"):
            environment.log_diagnostics([path])
        logger.dump_tabular()
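
# Hedged usage sketch (not part of the original script): simulate_policy reads
# args.session_name, args.mode and args.simulation, so a minimal launcher could
# build them with argparse as below. The flag names and defaults are assumptions
# made for illustration; the project's real launcher is not shown in this file.
def _example_simulate_policy_cli():
    import argparse
    parser = argparse.ArgumentParser(
        description="Replay a trained policy on the DeepBuilder environment.")
    parser.add_argument("--session_name", type=str, required=True,
                        help="Session whose rklit snapshot contains the policy.")
    parser.add_argument("--mode", type=str, default="eval", choices=["eval", "expl"],
                        help="Load 'evaluation/policy' or 'exploration/policy'.")
    parser.add_argument("--simulation", type=int, default=1,
                        help="1 = run against the simulated env, 0 = physical setup.")
    simulate_policy(parser.parse_args())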
def simulate_policy(args):
    #policy = data['evaluation/policy']
    #env = data['evaluation/env']
    environment = stuff.NormalizedActions(
        env.DeepBuilderEnv("replay", 6, 7, 20, 12))
    environment.env.is_simulation = args.simulation == 1
    environment.reset()

    trajectory = good_path.td3
    for i in range(int(args.start_at), len(trajectory)):
        action = trajectory[i]
        actiont = torch.FloatTensor(action)
        try:
            environment.step(actiont)
        except ValueError as err:
            if err.args[0] == "GH_OUT":
                print("Lost connection to GH, will need to play this episode again")
            else:
                raise
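
# Hedged sketch (assumption, not from the original code): good_path.td3 is used
# above as an indexable sequence of recorded actions, one per step, each
# convertible to a torch.FloatTensor. A stand-in with that shape could look like
# this, using 6-dimensional actions to match act_dim=6 in the env constructed
# above; the real recorded trajectory is not reproduced here.
class _ExampleGoodPath:
    # Two placeholder actions, purely illustrative values.
    td3 = [
        [0.1, -0.3, 0.5, 0.0, 0.2, -0.1],
        [0.0, 0.4, -0.2, 0.3, -0.5, 0.1],
    ]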
def experiment(args, variant):
    #eval_env = gym.make('FetchReach-v1')
    #expl_env = gym.make('FetchReach-v1')
    core_env = env.DeepBuilderEnv(args.session_name, args.act_dim, args.box_dim,
                                  args.max_num_boxes, args.height_field_dim)
    eval_env = stuff.NormalizedActions(core_env)
    expl_env = stuff.NormalizedActions(core_env)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    resumed = args.resume == 1
    if resumed:
        # Resume from a saved session: reuse its variant and trained parameters
        # and skip the initial warm-up steps.
        variant, params = doc.load_rklit_file(args.session_name)
        variant['algorithm_kwargs']['min_num_steps_before_training'] = 0

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ) if not resumed else params['trainer/qf1']
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ) if not resumed else params['trainer/qf2']
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ) if not resumed else params['trainer/target_qf1']
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ) if not resumed else params['trainer/target_qf2']
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    ) if not resumed else params['trainer/policy']
    eval_policy = MakeDeterministic(policy) if not resumed else params['evaluation/policy']

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer_expl = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    # The evaluation buffer is scaled by the ratio of eval to expl plays.
    replay_buffer_eval = EnvReplayBuffer(
        int(variant['replay_buffer_size'] *
            (float(args.num_plays_eval) / float(args.num_plays_expl))),
        eval_env,
    )

    if resumed:
        # Restore both replay buffers from the saved session.
        replay_buffer_expl._actions = params['replay_buffer_expl/actions']
        replay_buffer_expl._env_infos = params['replay_buffer_expl/env_infos']
        replay_buffer_expl._next_obs = params['replay_buffer_expl/next_obs']
        replay_buffer_expl._observations = params['replay_buffer_expl/observations']
        replay_buffer_expl._rewards = params['replay_buffer_expl/rewards']
        replay_buffer_expl._size = params['replay_buffer_expl/size']
        replay_buffer_expl._terminals = params['replay_buffer_expl/terminals']
        replay_buffer_expl._top = params['replay_buffer_expl/top']

        replay_buffer_eval._actions = params['replay_buffer_eval/actions']
        replay_buffer_eval._env_infos = params['replay_buffer_eval/env_infos']
        replay_buffer_eval._next_obs = params['replay_buffer_eval/next_obs']
        replay_buffer_eval._observations = params['replay_buffer_eval/observations']
        replay_buffer_eval._rewards = params['replay_buffer_eval/rewards']
        replay_buffer_eval._size = params['replay_buffer_eval/size']
        replay_buffer_eval._terminals = params['replay_buffer_eval/terminals']
        replay_buffer_eval._top = params['replay_buffer_eval/top']
    elif args.replay_add_sess_name != '':
        # Seed the exploration buffer with samples from another session,
        # skipping all-zero (empty) entries.
        _, other_params = doc.load_rklit_file(args.replay_add_sess_name)
        num_samples = int(args.replay_add_num_samples)
        replay_buffer_expl._size = 0
        replay_buffer_expl._top = 0
        print("Loading " + str(num_samples) + " batch samples from session " +
              args.replay_add_sess_name)
        zeroes = []
        offset = 0
        for i in range(num_samples):
            act = other_params['replay_buffer_expl/actions'][i]
            obs = other_params['replay_buffer_expl/observations'][i]
            if act.min() == 0.0 and act.max() == 0.0 and obs.min() == 0.0 and obs.max() == 0.0:
                zeroes.append(i)
                continue
            replay_buffer_expl._actions[offset] = copy.deepcopy(act.tolist())
            replay_buffer_expl._next_obs[offset] = copy.deepcopy(
                other_params['replay_buffer_expl/next_obs'][i].tolist())
            replay_buffer_expl._observations[offset] = copy.deepcopy(obs.tolist())
            replay_buffer_expl._rewards[offset] = copy.deepcopy(
                other_params['replay_buffer_expl/rewards'][i].tolist())
            replay_buffer_expl._terminals[offset] = copy.deepcopy(
                other_params['replay_buffer_expl/terminals'][i].tolist())
            replay_buffer_expl._size += 1
            replay_buffer_expl._top += 1
            offset += 1
        print("Detected and ignored " + str(len(zeroes)) +
              " zero samples in replay buffer. Total num samples loaded into replay buffer: " +
              str(replay_buffer_expl._size))
        other_params = {}

    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'],
        starting_train_steps=0 if not resumed else
        (params['replay_buffer_expl/top'] *
         variant['algorithm_kwargs']['num_trains_per_train_loop']),
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer_eval=replay_buffer_eval,
        replay_buffer_expl=replay_buffer_expl,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
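
# Hedged sketch of the variant dict consumed by experiment() above. The
# top-level key names ('layer_size', 'replay_buffer_size', 'trainer_kwargs',
# 'algorithm_kwargs') and the two algorithm_kwargs entries the function touches
# ('min_num_steps_before_training', 'num_trains_per_train_loop') are taken from
# the code; all other nested entries and numeric values are illustrative
# placeholders in the style of rlkit's SAC examples, not this project's actual
# configuration.
def _example_sac_variant():
    return dict(
        layer_size=256,
        replay_buffer_size=int(1e6),
        trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=5e-3,
            target_update_period=1,
            policy_lr=3e-4,
            qf_lr=3e-4,
            reward_scale=1,
            use_automatic_entropy_tuning=True,
        ),
        algorithm_kwargs=dict(
            num_epochs=100,
            num_eval_steps_per_epoch=100,
            num_trains_per_train_loop=100,
            num_expl_steps_per_train_loop=100,
            min_num_steps_before_training=100,
            max_path_length=20,
            batch_size=256,
        ),
    )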
def experiment(args, variant):
    core_env = env.DeepBuilderEnv(args.session_name, args.act_dim, args.box_dim,
                                  args.max_num_boxes, args.height_field_dim)
    eval_env = stuff.NormalizedActions(core_env)
    expl_env = stuff.NormalizedActions(core_env)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    # Exploration buffer sized to hold the pre-loaded samples from both sessions.
    replay_buffer = EnvReplayBuffer(
        222726 + 21,
        expl_env,
    )
    replay_buffer_eval = EnvReplayBuffer(21, eval_env)

    offset = 0
    other_params = {}
    if args.replay_add_sess_name_1 != '':
        # Seed the exploration buffer with samples from the first session,
        # skipping all-zero (empty) entries.
        _, other_params = doc.load_rklit_file(args.replay_add_sess_name_1)
        num_samples = int(args.replay_add_num_samples_1)
        replay_buffer._size = 0
        replay_buffer._top = 0
        print("Loading " + str(num_samples) + " batch samples from session " +
              args.replay_add_sess_name_1)
        for i in range(num_samples):
            act = other_params['replay_buffer_expl/actions'][i]
            obs = other_params['replay_buffer_expl/observations'][i]
            if not (act.min() == 0.0 and act.max() == 0.0 and
                    obs.min() == 0.0 and obs.max() == 0.0):
                replay_buffer._actions[offset] = act
                replay_buffer._next_obs[offset] = other_params['replay_buffer_expl/next_obs'][i]
                replay_buffer._observations[offset] = obs
                replay_buffer._rewards[offset] = other_params['replay_buffer_expl/rewards'][i]
                replay_buffer._terminals[offset] = other_params['replay_buffer_expl/terminals'][i]
                replay_buffer._size += 1
                replay_buffer._top += 1
                offset += 1

    if args.replay_add_sess_name_2 != '':
        # Append samples from the second session directly after the first batch.
        _, other_params = doc.load_rklit_file(args.replay_add_sess_name_2)
        num_samples = int(args.replay_add_num_samples_2)
        print("Loading " + str(num_samples) + " batch samples from session " +
              args.replay_add_sess_name_2)
        for i in range(21021, num_samples):
            act = other_params['replay_buffer_expl/actions'][i]
            obs = other_params['replay_buffer_expl/observations'][i]
            if not (act.min() == 0.0 and act.max() == 0.0 and
                    obs.min() == 0.0 and obs.max() == 0.0):
                replay_buffer._actions[offset] = act
                replay_buffer._next_obs[offset] = other_params['replay_buffer_expl/next_obs'][i]
                replay_buffer._observations[offset] = obs
                replay_buffer._rewards[offset] = other_params['replay_buffer_expl/rewards'][i]
                replay_buffer._terminals[offset] = other_params['replay_buffer_expl/terminals'][i]
                replay_buffer._size += 1
                replay_buffer._top += 1
                offset += 1
    '''
    if args.replay_add_sess_name_3 != args.replay_add_sess_name_2:
        #_, other_params = doc.load_rklit_file(args.replay_add_sess_name_3)
        num_samples = int(args.replay_add_num_samples_3)
        print("Loading " + str(num_samples) + " batch samples from session " +
              args.replay_add_sess_name_3)
        for i in range(num_samples):
            act = other_params['replay_buffer_eval/actions'][i]
            obs = other_params['replay_buffer_eval/observations'][i]
            if not (act.min() == 0.0 and act.max() == 0.0 and
                    obs.min() == 0.0 and obs.max() == 0.0):
                replay_buffer._actions[offset] = act
                replay_buffer._next_obs[offset] = other_params['replay_buffer_eval/next_obs'][i]
                replay_buffer._observations[offset] = obs
                replay_buffer._rewards[offset] = other_params['replay_buffer_eval/rewards'][i]
                replay_buffer._terminals[offset] = other_params['replay_buffer_eval/terminals'][i]
                replay_buffer._size += 1
                replay_buffer._top += 1
                offset += 1
    '''
    del other_params
    print("Detected and removed " +
          str(replay_buffer._max_replay_buffer_size - replay_buffer._size) +
          " zero samples. Final size of replay buffer: " + str(replay_buffer._size))

    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer_expl=replay_buffer,
        replay_buffer_eval=replay_buffer_eval,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
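
# Hedged helper sketch (not in the original script): the loading loops above
# skip all-zero transitions, so the hard-coded buffer size (222726 + 21)
# presumably reflects a manual count of usable samples. A utility like this
# could compute such a count from a saved session instead; it relies only on
# the same 'replay_buffer_expl/...' keys the loops already read.
def _count_nonzero_samples(session_name):
    _, params = doc.load_rklit_file(session_name)
    actions = params['replay_buffer_expl/actions']
    observations = params['replay_buffer_expl/observations']
    count = 0
    for act, obs in zip(actions, observations):
        if not (act.min() == 0.0 and act.max() == 0.0 and
                obs.min() == 0.0 and obs.max() == 0.0):
            count += 1
    return count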
def experiment(args, variant):
    #eval_env = gym.make('FetchReach-v1')
    #expl_env = gym.make('FetchReach-v1')
    core_env = sim.DeepBuilderEnv(args.session_name, args.act_dim, args.box_dim,
                                  args.max_num_boxes, args.height_field_dim)
    eval_env = stuff.NormalizedActions(core_env)
    expl_env = stuff.NormalizedActions(core_env)

    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'])

    obs_dim = eval_env.observation_space.spaces['observation'].low.size
    action_dim = eval_env.action_space.low.size
    goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size
    qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    eval_policy = MakeDeterministic(policy)

    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['sac_trainer_kwargs'])
    trainer = HERTrainer(trainer)

    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
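
# Hedged sketch of the variant dict consumed by the goal-conditioned (HER)
# experiment() above. Only the top-level key names ('replay_buffer_kwargs',
# 'qf_kwargs', 'policy_kwargs', 'sac_trainer_kwargs', 'algo_kwargs') are taken
# from the function; the nested entries and values are illustrative
# placeholders in the style of rlkit's HER+SAC examples, not this project's
# actual settings.
def _example_her_variant():
    return dict(
        replay_buffer_kwargs=dict(
            max_size=int(1e6),
            fraction_goals_rollout_goals=0.2,
            fraction_goals_env_goals=0.0,
        ),
        qf_kwargs=dict(hidden_sizes=[256, 256]),
        policy_kwargs=dict(hidden_sizes=[256, 256]),
        sac_trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=5e-3,
            target_update_period=1,
            policy_lr=3e-4,
            qf_lr=3e-4,
            reward_scale=1,
            use_automatic_entropy_tuning=True,
        ),
        algo_kwargs=dict(
            num_epochs=100,
            num_eval_steps_per_epoch=100,
            num_trains_per_train_loop=100,
            num_expl_steps_per_train_loop=100,
            min_num_steps_before_training=100,
            max_path_length=20,
            batch_size=256,
        ),
    )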