def experiment(variant):
    cuda = True
    from gym.envs.mujoco import HalfCheetahEnv
    from mujoco_torch.core.bridge import MjCudaRender
    R = 84
    env = HalfCheetahEnv()
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()
    gt.stamp("start")
    for i in range(100):
        img = env.sim.render(R, R, device_id=1)
    gt.stamp("warmstart")
    for i in gt.timed_for(range(1000)):
        env.step(np.random.rand(6))
        gt.stamp('step')
        img = env.sim.render(R, R, device_id=1)
        gt.stamp('render')
        x = np_to_var(img)
        if cuda:
            x = x.cuda()
        torch.cuda.synchronize()
        gt.stamp('transfer')
        # cv2.imshow("img", img)
        # cv2.waitKey(1)
    gt.stamp("end")
    print(img)
    print(gt.report(include_itrs=False))
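
# Hypothetical Convnet definition, consistent with how it is used in these
# snippets (6 tanh outputs, 3 input channels, 84x84 frames) and with the
# forward-pass fragment that appears further down in this collection. A sketch
# under those assumptions, not the original class: spatial sizes go
# 84 -> 20 -> 9 -> 1, ending in 128 channels so that x.view(-1, 128) is valid.
import torch
import torch.nn as nn
import torch.nn.functional as F


class Convnet(nn.Module):
    def __init__(self, output_size, output_activation=torch.tanh,
                 input_channels=3):
        super().__init__()
        self.conv1 = nn.Conv2d(input_channels, 32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=9, stride=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.fc1 = nn.Linear(128, output_size)
        self.output_activation = output_activation

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        h = x.view(-1, 128)  # flatten to a 128-dim feature vector
        return self.output_activation(self.fc1(h))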
def __init__(self, noise_type='uniform', noise_scale=0.0, init_scale=0.0):
    self.noise_type = noise_type
    assert self.noise_type in ['normal', 'uniform']
    self.noise_scale = noise_scale
    self.init_scale = init_scale
    HalfCheetahEnv.__init__(self)
def experiment(variant):
    '''
    1. Build the experiment environments (eval, expl).
    2. Determine the input/output dimensions and build the qf and policy networks.
    3. Copy them to create the target qf and target policy networks.
    4. Build a path collector for evaluation.
    5. For training, build the exploration strategy, path collector, and replay buffer.
    6. Build the DDPGTrainer (qf, policy).
    7. Build the algorithm (trainer, envs, replay buffer, path collectors, plus the evaluation parts).
    8. Start training.
    :param variant: config parameters
    :return:
    '''
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           **variant['policy_kwargs'])
    # Create the target networks by copying.
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    # Evaluation
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # Training (exploration strategy, path collector, replay buffer)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf, target_qf=target_qf, policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    # Move all networks to the configured device.
    algorithm.to(ptu.device)
    algorithm.train()
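
# A minimal, illustrative variant dict for the DDPG experiment above. The
# values are assumptions (not settings taken from the source), and the kwargs
# keys mirror the signatures of rlkit's FlattenMlp/TanhMlpPolicy, DDPGTrainer,
# and TorchBatchRLAlgorithm, which may differ between rlkit versions.
ddpg_variant = dict(
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        use_soft_update=True,
        tau=1e-2,
        discount=0.99,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    ),
    algorithm_kwargs=dict(
        num_epochs=1000,
        num_eval_steps_per_epoch=1000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=1000,
        batch_size=128,
    ),
)
# experiment(ddpg_variant)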
def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     **variant["qf_kwargs"])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     **variant["qf_kwargs"])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            **variant["qf_kwargs"])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            **variant["qf_kwargs"])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           **variant["policy_kwargs"])
    target_policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                                  **variant["policy_kwargs"])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es, policy=policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from gym.envs.mujoco import HalfCheetahEnv
    from mujoco_torch.core.bridge import MjCudaRender
    renderer = MjCudaRender(32, 32)
    env = HalfCheetahEnv()
    renderer.get_cuda_tensor(env.sim)
def experiment(variant):
    env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
def simulate_policy(args):
    data = torch.load(str(args.file))
    # data = joblib.load(str(args.file))
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if args.collect:
        data = []
    for trial in tqdm(range(100)):
        path = rollout(
            env,
            policy,
            max_path_length=args.H + 1,
            render=not args.collect,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
        if args.collect:
            data.append([path['actions'], path['next_observations']])
    if args.collect:
        import pickle
        with open("data/expert.pkl", mode='wb') as f:
            pickle.dump(data, f)
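
# Hypothetical command-line driver for simulate_policy above (a sketch; the
# original launcher is not part of this snippet). The flags mirror the
# attributes the function reads: file, H, gpu, and collect.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to the saved snapshot')
    parser.add_argument('--H', type=int, default=1000, help='max path length')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--collect', action='store_true',
                        help='save (actions, next_observations) pairs to data/expert.pkl')
    args = parser.parse_args()
    simulate_policy(args)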
def experiment(variant):
    env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    import mujoco_py
    import torch
    logger.log(torch.__version__)
    date_format = '%m/%d/%Y %H:%M:%S %Z'
    date = datetime.now(tz=pytz.utc)
    logger.log("start")
    logger.log('Current date & time is: {}'.format(date.strftime(date_format)))
    if torch.cuda.is_available():
        x = torch.randn(3)
        logger.log(str(x.to(ptu.device)))
    date = date.astimezone(timezone('US/Pacific'))
    logger.log('Local date & time is: {}'.format(date.strftime(date_format)))
    for i in range(variant['num_seconds']):
        logger.log("Tick, {}".format(i))
        time.sleep(1)
    logger.log("end")
    logger.log('Local date & time is: {}'.format(date.strftime(date_format)))
    logger.log("start mujoco")
    from gym.envs.mujoco import HalfCheetahEnv
    e = HalfCheetahEnv()
    img = e.sim.render(32, 32)
    logger.log(str(sum(img)))
    logger.log("end mujoco")
def experiment(variant):
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    root = 0
    E = 20
    R = 84
    U = 6
    cuda = True
    envs = []
    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()
    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(i, stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in envs:
                e.reset()
        for e in envs:
            img = e.sim.render(R, R, device_id=0).transpose()
            imgs.append(img)
        gt.stamp('render') if stamp else 0
        imgs = np.array(imgs)
        torch_img = np_to_var(imgs)
        if cuda:
            torch_img = torch_img.cuda()
        torch.cuda.synchronize()
        gt.stamp('transfer') if stamp else 0
        u = get_numpy(c.forward(torch_img).cpu())
        torch.cuda.synchronize()
        gt.stamp('forward') if stamp else 0
        for i, e in enumerate(envs):
            e.step(u[i, :])
        gt.stamp('step') if stamp else 0

    for i in range(10):
        step(i, False)
    gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step(i)
    gt.stamp('end')
    print(gt.report(include_itrs=False, format_options=dict(itr_num_width=10)))
def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant["layer_size"]
    qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     hidden_sizes=[M, M])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     hidden_sizes=[M, M])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            hidden_sizes=[M, M])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            hidden_sizes=[M, M])
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim,
                                hidden_sizes=[M, M])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
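
# An illustrative variant for the SAC experiment above (assumed values, not
# settings from the source); trainer_kwargs follow rlkit's SACTrainer
# signature, which may vary across rlkit versions.
sac_variant = dict(
    layer_size=256,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        discount=0.99,
        soft_target_tau=5e-3,
        target_update_period=1,
        policy_lr=3e-4,
        qf_lr=3e-4,
        reward_scale=1,
        use_automatic_entropy_tuning=True,
    ),
    algorithm_kwargs=dict(
        num_epochs=3000,
        num_eval_steps_per_epoch=5000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=1000,
        batch_size=256,
    ),
)
# experiment(sac_variant)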
def test_trpo_pipeline():
    with LocalRunner() as runner:
        env = GarageEnv(HalfCheetahEnv())
        baseline = LinearFeatureBaseline()
        policy = GaussianMLPPolicy(env_spec=env.spec)
        algo = TRPO(policy=policy, baseline=baseline)
        runner.setup(algo=algo, env=env)
        runner.train(n_epochs=100, batch_size=512)
def run_exp(snapshot_config, *_):
    with LocalRunner(snapshot_config) as runner:
        env = GarageEnv(HalfCheetahEnv())
        baseline = LinearFeatureBaseline()
        policy = GaussianMLPPolicy(env_spec=env.spec)
        algo = TRPO(policy=policy, baseline=baseline)
        runner.setup(algo=algo, env=env)
        runner.train(n_epochs=100, batch_size=512)
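
# One possible launcher for run_exp above, assuming a garage version (circa
# 2019) that provides run_experiment and passes a snapshot_config into the
# callback. A sketch under that assumption, not the original entry point.
from garage.experiment import run_experiment

run_experiment(
    run_exp,
    snapshot_mode='last',
    seed=1,
)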
def experiment(variant):
    E = 10
    R = 84
    cuda = True
    envs = []
    renderer = MjCudaRender(R, R)
    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()
    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in range(E):
                envs[e].reset()
        for e in range(E):
            # img = renderer.get_cuda_tensor(envs[e].sim)
            img = envs[e].sim.render(R, R, device_id=1).transpose()
        gt.stamp('render') if stamp else 0
        # imgs = np.array(imgs)
        # torch_img = np_to_var(imgs)
        # if cuda:
        #     torch_img = torch_img.cuda()
        #     torch.cuda.synchronize()
        # gt.stamp('transfer') if stamp else 0
        # u = get_numpy(c.forward(torch_img).cpu())
        # torch.cuda.synchronize()
        # gt.stamp('forward') if stamp else 0
        # for e in range(E):
        #     envs[e].step(u[e, :])
        # gt.stamp('step') if stamp else 0

    for i in range(10):
        step(False)
    gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step()
    gt.stamp('end')
def experiment(variant):
    from gym.envs.mujoco import HalfCheetahEnv
    from mujoco_torch.core.bridge import MjCudaRender
    renderer = MjCudaRender(84, 84)
    env = HalfCheetahEnv()
    gt.stamp("start")
    for i in range(100):
        tensor, img = renderer.get_cuda_tensor(env.sim, False)
    gt.stamp("warmstart")
    for i in range(1000):
        env.step(np.random.rand(6))
        tensor, img = renderer.get_cuda_tensor(env.sim, True)
        x = np_to_var(img).cuda()
        torch.cuda.synchronize()
        # cv2.imshow("img", img)
        # cv2.waitKey(1)
    gt.stamp("end")
    print(img)
    print(gt.report())
def experiment(variant):
    from gym.envs.mujoco import HalfCheetahEnv
    from mujoco_torch.core.bridge import MjCudaRender
    renderer = MjCudaRender(84, 84)
    env = HalfCheetahEnv()
    gt.stamp("start")
    for i in range(100):
        tensor, img = renderer.get_cuda_tensor(env.sim, False)
    gt.stamp("warmstart")
    for i in gt.timed_for(range(1000)):
        env.step(np.random.rand(6))
        gt.stamp('step')
        tensor, img = renderer.get_cuda_tensor(env.sim, False)
        gt.stamp('render')
        # cv2.imshow("img", img)
        # cv2.waitKey(1)
    gt.stamp("end")
    print(img)
    print(gt.report(include_itrs=False))
def run_exp(*_):
    with LocalRunner() as runner:
        env = GarageEnv(HalfCheetahEnv())
        # q-functions
        qf1 = ContinuousMLPQFunction(env_spec=env.spec)
        qf2 = ContinuousMLPQFunction(env_spec=env.spec)
        # replay buffer
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
        # policy
        policy = GaussianMLPPolicy(env_spec=env.spec)
        # algorithm
        algo = SAC(
            env_spec=env.spec,
            policy=policy,
            qfs=[qf1, qf2],
            replay_buffer=replay_buffer,
        )
        # setup and train
        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=1000)
def simulate_policy(args):
    data = torch.load(str(args.file))
    # data = joblib.load(str(args.file))
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def example(variant):
    import torch
    import rlkit.torch.pytorch_util as ptu
    print("Starting")
    logger.log(torch.__version__)
    date_format = "%m/%d/%Y %H:%M:%S %Z"
    date = datetime.now(tz=pytz.utc)
    logger.log("start")
    logger.log("Current date & time is: {}".format(date.strftime(date_format)))
    logger.log("Cuda available: {}".format(torch.cuda.is_available()))
    if torch.cuda.is_available():
        x = torch.randn(3)
        logger.log(str(x.to(ptu.device)))
    date = date.astimezone(timezone("US/Pacific"))
    logger.log("Local date & time is: {}".format(date.strftime(date_format)))
    for i in range(variant["num_seconds"]):
        logger.log("Tick, {}".format(i))
        time.sleep(1)
    logger.log("end")
    logger.log("Local date & time is: {}".format(date.strftime(date_format)))
    logger.log("start mujoco")
    from gym.envs.mujoco import HalfCheetahEnv
    e = HalfCheetahEnv()
    img = e.sim.render(32, 32)
    logger.log(str(sum(img)))
    logger.log("end mujoco")
    logger.record_tabular("Epoch", 1)
    logger.dump_tabular()
    logger.record_tabular("Epoch", 2)
    logger.dump_tabular()
    logger.record_tabular("Epoch", 3)
    logger.dump_tabular()
    print("Done")
def experiment(variant):
    env = NormalizedBoxEnv(HalfCheetahEnv())
    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = PERTD3(env,
                       qf1=qf1,
                       qf2=qf2,
                       policy=policy,
                       exploration_policy=exploration_policy,
                       **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def __init__(self, **kwargs):
    HalfCheetahEnv.__init__(self)
    offline_env.OfflineEnv.__init__(self, **kwargs)
        # Tail of Convnet.forward (fragment): three conv + batch-norm blocks,
        # a flatten to 128 features, then a final fully connected layer.
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        h = x.view(-1, 128)  # flatten
        return self.output_activation(self.fc1(h))


if __name__ == "__main__":
    E = 10
    R = 84
    cuda = True
    envs = []
    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()
    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in range(E):
                envs[e].reset()
        for e in range(E):
            img = envs[e].sim.render(R, R, device_id=1).transpose()
def experiment(variant):
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    skills_dim = variant['skills_dim']

    # Define the networks
    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + skills_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + skills_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + skills_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + skills_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + skills_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    higher_level_policy = categorical_mlp.CategoricalMLPPolicy(
        input_size=obs_dim,
        output_size=skills_dim,
        hidden_sizes=(M, M),
    )
    value_function = FlattenMlp(
        hidden_sizes=[M, M],
        input_size=obs_dim,
        output_size=1,
    )
    discriminator_function = FlattenMlp(
        hidden_sizes=[M, M],
        input_size=obs_dim,
        output_size=skills_dim,
    )
    target_vf = FlattenMlp(
        hidden_sizes=[M, M],
        input_size=obs_dim,
        output_size=1,
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        higher_level_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
        higher_level_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = emp_skills_trainer.EmpowermentSkillsTrainer(
        env=eval_env,
        higher_level_policy=higher_level_policy,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_vf=target_vf,
        value_function=value_function,
        discriminator=discriminator_function,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # expl_env = NormalizedBoxEnv(gym.make('activesearchrl-v0'))
    # eval_env = NormalizedBoxEnv(gym.make('activesearchrl-v0'))
    # obs_dim = expl_env.observation_space.low.size
    # action_dim = eval_env.action_space.low.size
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    # policy = TanhGaussianPolicy(
    #     obs_dim=obs_dim,
    #     action_dim=action_dim,
    #     hidden_sizes=[M, M],
    # )
    policy = GaussianMixturePolicy(obs_dim=obs_dim,
                                   action_dim=action_dim,
                                   hidden_sizes=[M, M],
                                   num_gaussians=2)
    # data = torch.load('/Users/conor/Documents/PHD_RESEARCH/ACTIVE_SEARCH_AS_RL/rlkit/data/tabular-active-search-k1/tabular_active_search_k1_2020_11_10_16_18_25_0000--s-0/params.pkl')
    # qf1 = data['trainer/qf1']
    # qf2 = data['trainer/qf2']
    # target_qf1 = data['trainer/target_qf1']
    # target_qf2 = data['trainer/target_qf2']
    # policy = data['trainer/policy']
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def __init__(self):
    self._base_pos = 15.
    HalfCheetahEnv.__init__(self)
def experiment(variant):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    n_proc = comm.Get_size()
    root = 0
    gpus = GPUtil.getGPUs()
    n_gpu = len(gpus)
    torch.distributed.init_process_group(backend='mpi', world_size=n_proc)
    E = 20
    R = 84
    U = 6
    cuda = True
    envs = []
    for e in range(rank, E, n_proc):
        env = HalfCheetahEnv()
        envs.append(env)
    sendcounts = np.array(comm.gather(len(envs), root))
    i_sendcounts = None
    u_sendcounts = None
    if rank == root:
        i_sendcounts = sendcounts * 3 * R * R
        u_sendcounts = sendcounts * U
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    c = torch.nn.parallel.DistributedDataParallel(c)
    if cuda:
        c.cuda()
    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(i, stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in envs:
                e.reset()
        for e in envs:
            img = e.sim.render(R, R, device_id=rank % n_gpu).transpose()
            imgs.append(img)
        comm.Barrier()
        if rank == 0:
            gt.stamp('render') if stamp else 0
        imgs = np.array(imgs)
        r_imgs = None
        if rank == 0:
            r_imgs = np.empty((E, 3, R, R), dtype='uint8')
        comm.Gatherv(sendbuf=imgs, recvbuf=(r_imgs, i_sendcounts), root=root)
        if rank == 0:
            gt.stamp('comm1') if stamp else 0
        u = None
        if rank == 0:
            torch_img = np_to_var(r_imgs)
            if cuda:
                torch_img = torch_img.cuda()
            torch.cuda.synchronize()
            gt.stamp('transfer') if stamp else 0
            u = get_numpy(c.forward(torch_img).cpu())
            torch.cuda.synchronize()
            gt.stamp('forward') if stamp else 0
        r_u = np.empty((len(envs), U), dtype='float32')
        comm.Scatterv(sendbuf=(u, u_sendcounts), recvbuf=r_u, root=root)
        if rank == 0:
            gt.stamp('comm2') if stamp else 0
        for i, e in enumerate(envs):
            e.step(r_u[i, :])
        comm.Barrier()
        if rank == 0:
            gt.stamp('step') if stamp else 0

    for i in range(10):
        step(i, False)
    if rank == 0:
        gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step(i)
    if rank == 0:
        gt.stamp('end')
        print(
            gt.report(include_itrs=False, format_options=dict(itr_num_width=10)))
def experiment(variant):
    # Or for a specific version (Daniel: doesn't work):
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    if 'Ant' in args.env:
        expl_env = NormalizedBoxEnv(AntEnv())
        eval_env = NormalizedBoxEnv(AntEnv())
    elif 'InvertedPendulum' in args.env:
        expl_env = NormalizedBoxEnv(InvertedPendulumEnv())
        eval_env = NormalizedBoxEnv(InvertedPendulumEnv())
    elif 'HalfCheetah' in args.env:
        expl_env = NormalizedBoxEnv(HalfCheetahEnv())
        eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    elif 'Hopper' in args.env:
        expl_env = NormalizedBoxEnv(HopperEnv())
        eval_env = NormalizedBoxEnv(HopperEnv())
    elif 'Reacher' in args.env:
        expl_env = NormalizedBoxEnv(ReacherEnv())
        eval_env = NormalizedBoxEnv(ReacherEnv())
    elif 'Swimmer' in args.env:
        expl_env = NormalizedBoxEnv(SwimmerEnv())
        eval_env = NormalizedBoxEnv(SwimmerEnv())
    elif 'Walker2d' in args.env:
        expl_env = NormalizedBoxEnv(Walker2dEnv())
        eval_env = NormalizedBoxEnv(Walker2dEnv())
    else:
        raise ValueError(args.env)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf, target_qf=target_qf, policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    th = 1.8
    g_max = 0.1
    # delta = 1e-7
    if args.env == 'CartPole':
        # CartPole
        env = TfEnv(normalize(CartPoleEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 5000
        max_length = 100
        n_timestep = 5e5
        n_counts = 5
        name = 'CartPole'
        grad_factor = 5
        th = 1.2
        # batchsize: 1
        # lr = 0.1
        # w = 2
        # c = 50
        # batchsize: 50
        lr = 0.75
        c = 3
        w = 2
        discount = 0.995
        path = './init/CartPole_policy.pth'
    if args.env == 'Walker':
        # Walker_2d
        env = TfEnv(normalize(Walker2dEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 2
        c = 12
        grad_factor = 6
        discount = 0.999
        name = 'Walk'
        path = './init/Walk_policy.pth'
    if args.env == 'HalfCheetah':
        env = TfEnv(normalize(HalfCheetahEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500
        n_timestep = 1e7
        n_counts = 5
        lr = 0.6
        w = 1
        c = 4
        grad_factor = 5
        th = 1.2
        g_max = 0.06
        discount = 0.999
        name = 'HalfCheetah'
        path = './init/HalfCheetah_policy.pth'
    if args.env == 'Hopper':
        # Hopper
        env = TfEnv(normalize(HopperEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 1000
        th = 1.5
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 1
        c = 3
        grad_factor = 6
        g_max = 0.15
        discount = 0.999
        name = 'Hopper'
        path = './init/Hopper_policy.pth'

    for i in range(n_counts):
        # print(env.spec)
        if args.env == 'CartPole':
            policy = CategoricalMLPPolicy(env.spec,
                                          hidden_sizes=[8, 8],
                                          hidden_nonlinearity=torch.tanh,
                                          output_nonlinearity=None)
        else:
            policy = GaussianMLPPolicy(env.spec,
                                       hidden_sizes=[64, 64],
                                       hidden_nonlinearity=torch.tanh,
                                       output_nonlinearity=None)
        policy.load_state_dict(torch.load(path))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = MBPG_HA(env_spec=env.spec,
                       env=env,
                       env_name=name,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_length,
                       discount=discount,
                       grad_factor=grad_factor,
                       policy_lr=lr,
                       c=c,
                       w=w,
                       th=th,
                       g_max=g_max,
                       n_timestep=n_timestep,
                       batch_size=batch_size,
                       center_adv=True,
                       # delta=delta
                       # decay_learning_rate=d_lr,
                       )
        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
def experiment(variant):
    cuda = True
    ingpu = False
    R = 84
    E = 100
    N = 100
    if ingpu:
        from mujoco_torch.core.bridge import MjCudaRender
        renderer = MjCudaRender(84, 84, E)
    envs = []
    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    def step(stamp=True):
        for e in range(E):
            env = envs[e]
            env.step(np.random.rand(6))
        gt.stamp('step') if stamp else 0
        if ingpu:
            sims = [env.sim for env in envs]
            env = envs[e]
            tensor, img = renderer.get_batch_cuda_tensor(sims, False)
            tensor = Variable(tensor).float()
            gt.stamp('render') if stamp else 0
        else:
            imgs = []
            for e in range(E):
                env = envs[e]
                img = env.sim.render(R, R, device_id=1)
                imgs.append(img)
            gt.stamp('render') if stamp else 0
            imgs = np.array(imgs)
            tensor = np_to_var(imgs)
            if cuda:
                tensor = tensor.cuda()
            torch.cuda.synchronize()
            gt.stamp('transfer') if stamp else 0
        u = get_numpy(c.forward(tensor).cpu())
        torch.cuda.synchronize()
        gt.stamp('forward') if stamp else 0
        # cv2.imshow("img", img)
        # cv2.waitKey(1)

    gt.stamp("start")
    for i in range(10):
        step(False)
    gt.stamp("warmstart")
    for i in gt.timed_for(range(N)):
        step()
    gt.stamp("end")
    print(gt.report(include_itrs=False))