def generate_vae_dataset(
        N=10000,
        test_p=0.9,
        use_cached=False,
        imsize=84,
        show=False,
        dataset_path=None,
        env_class=SawyerReachTorqueEnv,
        env_kwargs=None,
        init_camera=sawyer_torque_reacher_camera,
):
    """Load or generate a flat-image dataset and split it in two.

    The dataset comes from one of three sources, tried in this order:

    1. ``dataset_path`` (resolved through
       ``local_path_from_s3_or_local_path``) when it is not ``None``;
    2. the cache file ``/tmp/sawyer_torque_data<N>.npy`` when ``use_cached``
       is true and that file exists;
    3. a fresh rollout of a random policy with OU noise in ``env_class``,
       which is then saved to the cache file.

    :param N: number of images to generate (also part of the cache name).
    :param test_p: fraction of rows returned as the FIRST (training) split;
        despite the name, ``int(N * test_p)`` rows go to the train set.
    :param use_cached: reuse the cache file when it exists.
    :param imsize: side length of the square images.
    :param show: display each generated image with OpenCV.
    :param dataset_path: optional path of a pre-built ``.npy`` dataset.
    :param env_class: environment class to instantiate when generating.
    :param env_kwargs: keyword arguments for ``env_class`` (``None`` = none).
    :param init_camera: camera initializer handed to ``ImageEnv``.
    :return: ``(train_dataset, test_dataset, info)``; ``info['env']`` is set
        only when the data was freshly generated.
    """
    filename = "/tmp/sawyer_torque_data" + str(N) + ".npy"
    info = {}
    if dataset_path is not None:
        filename = local_path_from_s3_or_local_path(dataset_path)
        dataset = np.load(filename)
    elif use_cached and osp.isfile(filename):
        dataset = np.load(filename)
        print("loaded data from saved file", filename)
    else:
        now = time.time()
        if env_kwargs is None:
            env_kwargs = dict()
        env = env_class(**env_kwargs)
        env = ImageEnv(
            env,
            imsize,
            transpose=True,
            init_camera=init_camera,
            normalize=True,
        )
        info['env'] = env
        policy = RandomPolicy(env.action_space)
        es = OUStrategy(action_space=env.action_space, theta=0)
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )
        dataset = np.zeros((N, imsize * imsize * 3), dtype=np.uint8)
        for i in range(N):
            # Restart the episode (and the noise process) every 50 samples.
            if i % 50 == 0:
                print('Reset')
                env.reset_model()
                exploration_policy.reset()
            # One scaled-down random action per recorded frame.
            action = exploration_policy.get_action()[0] * 1 / 10
            env.wrapped_env.step(action)
            img = env._get_flat_img()
            # Stored as uint8, hence the un-normalization of the image.
            dataset[i, :] = unormalize_image(img)
            if show:
                # Use imsize rather than a hard-coded 84 so that previewing
                # works for any requested image size.
                cv2.imshow('img', img.reshape(3, imsize, imsize).transpose())
                cv2.waitKey(1)
            print(i)
        print("done making training data", time.time() - now)
        np.save(filename, dataset)
    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info
def generate_vae_dataset(
        N=10000,
        test_p=0.9,
        use_cached=True,
        imsize=84,
        show=False,
        dataset_path=None,
        env_class=None,
        env_kwargs=None,
        init_camera=sawyer_door_env_camera,
):
    """Load or generate a flat-image dataset and split it in two.

    The dataset is loaded from ``dataset_path`` when given, else from the
    cache file ``/tmp/sawyer_door_push_open_and_reach<N>.npy`` when
    ``use_cached`` is true and the file exists. Otherwise it is generated:
    the first ``int(N / 2)`` images come from setting the environment to
    sampled goals, the rest from rolling out a random policy with OU noise.

    NOTE(review): unlike the torque-reacher variant, this function never
    writes the generated dataset to the cache file - confirm whether a
    ``np.save`` is intended here.

    :param N: number of images to generate (also part of the cache name).
    :param test_p: fraction of rows returned as the FIRST (training) split.
    :param use_cached: reuse the cache file when it exists.
    :param imsize: side length of the square images.
    :param show: display each generated image with OpenCV.
    :param dataset_path: optional path of a pre-built ``.npy`` dataset.
    :param env_class: environment class; required when data is generated.
    :param env_kwargs: keyword arguments for ``env_class`` (``None`` = none).
    :param init_camera: camera initializer handed to ``ImageEnv``.
    :return: ``(train_dataset, test_dataset, info)``; ``info`` is always an
        empty dict here.
    :raises TypeError: if data must be generated and ``env_class`` is None.
    """
    filename = "/tmp/sawyer_door_push_open_and_reach" + str(N) + ".npy"
    info = {}
    if dataset_path is not None:
        filename = local_path_from_s3_or_local_path(dataset_path)
        dataset = np.load(filename)
    elif use_cached and osp.isfile(filename):
        dataset = np.load(filename)
        print("loaded data from saved file", filename)
    else:
        if env_class is None:
            # Same exception type as calling None, but with a usable message.
            raise TypeError(
                "env_class must be provided to generate a new dataset"
            )
        if env_kwargs is None:
            # The default used to crash on ``**None``.
            env_kwargs = dict()
        env = env_class(**env_kwargs)
        env = ImageEnv(
            env,
            imsize,
            transpose=True,
            init_camera=init_camera,
            normalize=True,
        )
        oracle_sampled_data = int(N / 2)
        dataset = np.zeros((N, imsize * imsize * 3))

        print('Goal Space Sampling')
        for i in range(oracle_sampled_data):
            goal = env.sample_goal()
            env.set_to_goal(goal)
            img = env._get_flat_img()
            dataset[i, :] = img
            if show:
                # Use imsize rather than a hard-coded 84.
                cv2.imshow('img', img.reshape(3, imsize, imsize).transpose())
                cv2.waitKey(1)
            print(i)

        env._wrapped_env.min_y_pos = .6
        policy = RandomPolicy(env.action_space)
        es = OUStrategy(action_space=env.action_space, theta=0)
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

        print('Random Sampling')
        for i in range(oracle_sampled_data, N):
            # Restart the episode (and the noise process) every 20 samples.
            if i % 20 == 0:
                env.reset()
                exploration_policy.reset()
            # Ten random steps between consecutive recorded frames.
            for _ in range(10):
                action = exploration_policy.get_action()[0]
                env.wrapped_env.step(action)
            img = env._get_flat_img()
            dataset[i, :] = img
            if show:
                cv2.imshow('img', img.reshape(3, imsize, imsize).transpose())
                cv2.waitKey(1)
            print(i)
    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info
def generate_goal_data_set(env=None, num_goals=1000, use_cached_dataset=False,
                           action_scale=1 / 10):
    """Collect (or load) a dictionary of goals reached by random actions.

    When ``use_cached_dataset`` is true and ``/tmp/goals<num_goals>.npy``
    exists, the dictionary stored there is returned. Otherwise ``env`` is
    stepped ``num_goals`` times with scaled random actions (resetting every
    50 steps), the resulting observations are recorded per goal key, and the
    dictionary is saved to that same file.

    :param env: environment with a dict observation space; only needed when
        goals are generated.
    :param num_goals: number of goals to record (also part of the cache name).
    :param use_cached_dataset: reuse the cache file when it exists.
    :param action_scale: multiplier applied to every sampled action.
    :return: dict mapping each goal key to an array with ``num_goals`` rows.
    """
    cache_path = '/tmp/goals' + str(num_goals) + '.npy'
    if use_cached_dataset and osp.isfile(cache_path):
        # The file holds a pickled dict (written by np.save below), which
        # NumPy refuses to load unless allow_pickle=True is passed.
        # NOTE: unpickling runs arbitrary code - only use a cache file that
        # this function itself produced.
        goal_dict = np.load(cache_path, allow_pickle=True).item()
        print("loaded data from saved file")
        return goal_dict
    cached_goal_keys = [
        'latent_desired_goal',
        'image_desired_goal',
        'state_desired_goal',
        'joint_desired_goal',
    ]
    goal_sizes = [
        env.observation_space.spaces['latent_desired_goal'].low.size,
        env.observation_space.spaces['image_desired_goal'].low.size,
        env.observation_space.spaces['state_desired_goal'].low.size,
        7,  # NOTE(review): presumably the joint count - confirm
    ]
    observation_keys = [
        'latent_observation',
        'image_observation',
        'state_observation',
        'state_observation',
    ]
    # goal key -> [size of one goal, observation key it is read from]
    goal_generation_dict = dict()
    for goal_key, goal_size, obs_key in zip(
            cached_goal_keys,
            goal_sizes,
            observation_keys,
    ):
        goal_generation_dict[goal_key] = [goal_size, obs_key]
    goal_dict = dict()
    policy = RandomPolicy(env.action_space)
    es = OUStrategy(action_space=env.action_space, theta=0)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    for goal_key, (goal_size, _) in goal_generation_dict.items():
        goal_dict[goal_key] = np.zeros((num_goals, goal_size))
    print('Generating Random Goals')
    for i in range(num_goals):
        # Restart the episode (and the noise process) every 50 goals.
        if i % 50 == 0:
            print('Reset')
            env.reset_model()
            exploration_policy.reset()
        action = exploration_policy.get_action()[0] * action_scale
        obs, _, _, _ = env.step(action)
        print(i)
        for goal_key, (_, obs_key) in goal_generation_dict.items():
            goal_dict[goal_key][i, :] = obs[obs_key]
    np.save(cache_path, goal_dict)
    return goal_dict
def experiment(variant):
    """Build and train a TDM-DDPG agent from the settings in ``variant``.

    Reads ``env_class``, ``env_kwargs``, ``qf_kwargs``, ``policy_kwargs``,
    ``es_kwargs``, ``her_replay_buffer_kwargs``, ``qf_criterion_class`` and
    ``algo_kwargs`` from ``variant``. ``variant['algo_kwargs']`` is modified
    in place: the criterion and the normalizer are written into its
    ``ddpg_kwargs`` and ``tdm_kwargs`` sub-dicts.
    """
    env = variant['env_class'](**variant['env_kwargs'])
    # No TDM normalizer is used in this configuration.
    normalizer = None

    q_function = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=normalizer,
        **variant['qf_kwargs']
    )
    actor = TdmPolicy(
        env=env,
        tdm_normalizer=normalizer,
        **variant['policy_kwargs']
    )
    ou_noise = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=ou_noise,
        policy=actor,
    )
    her_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )

    # Inject the run-time objects into the (shared) algorithm kwargs.
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['ddpg_kwargs']['qf_criterion'] = variant['qf_criterion_class']()
    algo_kwargs['tdm_kwargs']['tdm_normalizer'] = normalizer

    algorithm = TdmDdpg(
        env,
        qf=q_function,
        replay_buffer=her_buffer,
        policy=actor,
        exploration_policy=noisy_actor,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train DDPG on a normalized ``HalfCheetah-v1`` gym environment.

    Only ``variant['algo_params']`` is read; it is forwarded to ``DDPG``.
    The algorithm is moved to the GPU when ``ptu.gpu_enabled()`` is true.
    """
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    ou_noise = OUStrategy(action_space=env.action_space)

    n_obs = env.observation_space.low.size
    n_act = env.action_space.low.size

    critic = FlattenMlp(
        input_size=n_obs + n_act,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    actor = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        hidden_sizes=[400, 300],
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=ou_noise,
        policy=actor,
    )

    algorithm = DDPG(
        env,
        qf=critic,
        policy=actor,
        exploration_policy=noisy_actor,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    """Train DDPG with Gaussian exploration on the ``replab-v0`` environment.

    The environment is started through ``_start_rospy`` with
    ``goal_oriented=False`` and then wrapped in ``NormalizedBoxEnv``. Only
    ``variant['algo_params']`` is read; it is forwarded to ``DDPG``.
    """
    # The simulated alternative would be:
    #   gym.make('replab-v0')._start_sim(goal_oriented=False, render=False)
    raw_env = gym.make('replab-v0')._start_rospy(goal_oriented=False)
    env = NormalizedBoxEnv(raw_env)
    gaussian_noise = GaussianStrategy(action_space=env.action_space)

    n_obs = env.observation_space.low.size
    n_act = env.action_space.low.size

    critic = FlattenMlp(
        input_size=n_obs + n_act,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    actor = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        hidden_sizes=[400, 300],
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=gaussian_noise,
        policy=actor,
    )

    algorithm = DDPG(
        env,
        qf=critic,
        policy=actor,
        exploration_policy=noisy_actor,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    """Train DDPG with OU exploration on a normalized ``CartpoleEnv``.

    ``variant['qf_params']`` is forwarded to the Q-function constructor and
    ``variant['algo_params']`` to ``DDPG``.
    """
    env = NormalizedBoxEnv(CartpoleEnv())
    ou_noise = OUStrategy(action_space=env.action_space)

    critic = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        **variant['qf_params'],
    )
    # Policy network with hidden layers of 400 and 300 units.
    actor = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=ou_noise,
        policy=actor,
    )

    algorithm = DDPG(
        env,
        critic,
        actor,
        noisy_actor,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    """Train DDPG with OU exploration on ``HalfCheetahEnv``.

    The environment is passed through ``normalize`` when
    ``variant['normalize']`` is truthy. ``variant['algo_params']`` is
    forwarded to ``DDPG``.
    """
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    ou_noise = OUStrategy(action_space=env.action_space)

    # Both networks use two hidden layers of 32 units.
    critic = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    actor = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=ou_noise,
        policy=actor,
    )

    algorithm = DDPG(
        env,
        qf=critic,
        policy=actor,
        exploration_policy=noisy_actor,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def get_td3pg(evaluation_environment, parameters):
    """Assemble the networks, exploration policy and trainer for TD3.

    :param evaluation_environment: environment whose observation and action
        spaces determine the network input/output sizes.
    :param parameters: dict providing ``hidden_sizes_qf``,
        ``hidden_sizes_policy`` and ``trainer_params``.
    :return: ``(exploration_policy, policy, trainer)``.
    """
    n_obs = evaluation_environment.observation_space.low.size
    n_act = evaluation_environment.action_space.low.size
    critic_layers = parameters['hidden_sizes_qf']
    actor_layers = parameters['hidden_sizes_policy']

    def build_critic():
        return FlattenMlp(
            input_size=n_obs + n_act,
            output_size=1,
            hidden_sizes=critic_layers,
        )

    def build_actor():
        return TanhMlpPolicy(
            input_size=n_obs,
            output_size=n_act,
            hidden_sizes=actor_layers,
        )

    # Construction order is kept: critics, target critics, actor, target actor.
    qf1 = build_critic()
    qf2 = build_critic()
    target_qf1 = build_critic()
    target_qf2 = build_critic()
    policy = build_actor()
    target_policy = build_actor()

    # max_sigma == min_sigma, i.e. a constant noise scale.
    gaussian_noise = GaussianStrategy(
        action_space=evaluation_environment.action_space,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=gaussian_noise,
        policy=policy,
    )
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **parameters['trainer_params']
    )
    return exploration_policy, policy, trainer
def get_ddpg(evaluation_environment, parameters):
    """Assemble the networks, exploration policy and trainer for DDPG.

    :param evaluation_environment: environment whose observation and action
        spaces determine the network input/output sizes.
    :param parameters: dict providing ``hidden_sizes_qf``,
        ``hidden_sizes_policy`` and ``trainer_params``.
    :return: ``(exploration_policy, policy, trainer)``.
    """
    env = evaluation_environment
    n_obs = env.observation_space.low.size
    n_act = env.action_space.low.size

    critic = FlattenMlp(
        input_size=n_obs + n_act,
        output_size=1,
        hidden_sizes=parameters['hidden_sizes_qf'],
    )
    actor = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        hidden_sizes=parameters['hidden_sizes_policy'],
    )
    # Target networks start as exact copies of the online networks.
    critic_target = copy.deepcopy(critic)
    actor_target = copy.deepcopy(actor)

    ou_noise = OUStrategy(action_space=env.action_space)
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=ou_noise,
        policy=actor,
    )
    trainer = DDPGTrainer(
        qf=critic,
        target_qf=critic_target,
        policy=actor,
        target_policy=actor_target,
        **parameters['trainer_params']
    )
    return noisy_actor, actor, trainer
def experiment(variant):
    """Train a DQN agent on gym's ``CartPole-v0``.

    Reads ``trainer_kwargs``, ``replay_buffer_size`` and ``algorithm_kwargs``
    from ``variant``. Separate environment instances are used for exploration
    and evaluation.
    """
    expl_env = gym.make("CartPole-v0")
    eval_env = gym.make("CartPole-v0")
    n_obs = expl_env.observation_space.low.size
    n_actions = eval_env.action_space.n

    def build_q_network():
        return Mlp(
            hidden_sizes=[32, 32],
            input_size=n_obs,
            output_size=n_actions,
        )

    q_net = build_q_network()
    q_target = build_q_network()
    loss_fn = nn.MSELoss()

    # Greedy policy for evaluation, epsilon-greedy wrapper for exploration.
    greedy_policy = ArgmaxDiscretePolicy(q_net)
    eps_greedy_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        greedy_policy,
    )
    eval_collector = MdpPathCollector(eval_env, greedy_policy)
    expl_collector = MdpPathCollector(expl_env, eps_greedy_policy)

    trainer = DQNTrainer(
        qf=q_net,
        target_qf=q_target,
        qf_criterion=loss_fn,
        **variant["trainer_kwargs"]
    )
    buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=buffer,
        **variant["algorithm_kwargs"]
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train a multi-agent discrete SAC setup on ``CartPoleEnv(mode=4)``.

    For each of ``variant['num_agent']`` agents this builds a softmax policy,
    two Q-networks and their target copies, plus greedy (evaluation) and
    epsilon-greedy (exploration) policies. Also reads ``policy_kwargs``,
    ``qf_kwargs``, ``replay_buffer_size``, ``trainer_kwargs`` and
    ``algorithm_kwargs`` from ``variant``.
    """
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=4)
    eval_env = CartPoleEnv(mode=4)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    # Each critic sees every agent's observation plus the other agents'
    # actions, and outputs one value per own action.
    qf_input_size = obs_dim * num_agent + action_dim * (num_agent - 1)

    policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = [], [], [], [], []
    eval_policy_n, expl_policy_n = [], []
    for _ in range(num_agent):
        policy = SoftmaxMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        qf1 = FlattenMlp(
            input_size=qf_input_size,
            output_size=action_dim,
            **variant['qf_kwargs']
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=qf_input_size,
            output_size=action_dim,
            **variant['qf_kwargs']
        )
        # Fixed: the second target must start as a copy of qf2 (it used to
        # be copied from qf1, so target_qf2 did not match its own critic).
        target_qf2 = copy.deepcopy(qf2)
        eval_policy = ArgmaxDiscretePolicy(policy)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        policy_n.append(policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        num_agent=num_agent,
    )
    trainer = MASACDiscreteTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train DDPG with OU exploration on a remotely acquired environment.

    The environment is obtained with ``acq_remote_env`` from a ``Farmer``
    built on a hard-coded farm list, then wrapped in ``NormalizedBoxEnv``.
    ``variant['algo_params']`` is forwarded to ``DDPG``; the algorithm is
    moved to the GPU when ``ptu.gpu_enabled()`` is true.
    """
    farm_addresses = [('123.123.123.123', 4)]
    remote_env = acq_remote_env(Farmer(farm_addresses))
    env = NormalizedBoxEnv(remote_env)
    ou_noise = OUStrategy(action_space=env.action_space)

    n_obs = env.observation_space.low.size
    n_act = env.action_space.low.size

    critic = FlattenMlp(
        input_size=n_obs + n_act,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    actor = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        hidden_sizes=[400, 300],
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=ou_noise,
        policy=actor,
    )

    algorithm = DDPG(
        env,
        qf=critic,
        policy=actor,
        exploration_policy=noisy_actor,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def example(variant):
    """Train N3DPG on ``variant['env_class']`` with OU exploration.

    Reads ``env_class``, ``normalize``, ``es_kwargs``, ``vf_params``,
    ``policy_params`` and ``algo_kwargs`` from ``variant``. Both the
    Q-function and the value function are built with ``vf_params``.
    """
    env = variant['env_class']()
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    ou_noise = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])

    # Flattened sizes of the (possibly multi-dimensional) spaces.
    n_obs = int(np.prod(env.observation_space.low.shape))
    n_act = int(np.prod(env.action_space.low.shape))

    q_function = FlattenMlp(
        input_size=n_obs + n_act,
        output_size=1,
        **variant['vf_params']
    )
    value_function = FlattenMlp(
        input_size=n_obs,
        output_size=1,
        **variant['vf_params']
    )
    actor = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        **variant['policy_params']
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=ou_noise,
        policy=actor,
    )

    algorithm = N3DPG(
        env,
        qf=q_function,
        vf=value_function,
        policy=actor,
        exploration_policy=noisy_actor,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TD3 with Gaussian exploration on a normalized environment.

    Reads ``env_class``, ``es_kwargs``, ``qf_kwargs``, ``policy_kwargs`` and
    ``algo_kwargs`` from ``variant``.
    """
    env = NormalizedBoxEnv(variant['env_class']())
    gaussian_noise = GaussianStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    n_obs = env.observation_space.low.size
    n_act = env.action_space.low.size

    def build_critic():
        return ConcatMlp(
            input_size=n_obs + n_act,
            output_size=1,
            **variant['qf_kwargs']
        )

    critic_a = build_critic()
    critic_b = build_critic()
    actor = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        **variant['policy_kwargs']
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=gaussian_noise,
        policy=actor,
    )

    algorithm = TD3(
        env,
        qf1=critic_a,
        qf2=critic_b,
        policy=actor,
        exploration_policy=noisy_actor,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train DDPG with OU exploration on the ``create_swingup()`` environment.

    The environment is wrapped in ``NormalizedBoxEnv``. Only
    ``variant['algo_params']`` is read; it is forwarded to ``DDPG``.
    """
    # Other environments can be substituted here, e.g.
    # NormalizedBoxEnv(HalfCheetahEnv()) or
    # NormalizedBoxEnv(gym.make('HalfCheetah-v1')).
    env = NormalizedBoxEnv(create_swingup())
    ou_noise = OUStrategy(action_space=env.action_space)

    n_obs = env.observation_space.low.size
    n_act = env.action_space.low.size

    critic = FlattenMlp(
        input_size=n_obs + n_act,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    actor = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        hidden_sizes=[400, 300],
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=ou_noise,
        policy=actor,
    )

    algorithm = DDPG(
        env,
        qf=critic,
        policy=actor,
        exploration_policy=noisy_actor,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train a Double-DQN agent on ``ToolsEnv`` with human-input lifetime RL.

    Reads ``env_kwargs`` and ``algo_kwargs`` (``layer_size``,
    ``trainer_kwargs``, ``replay_buffer_size``, ``algorithm_kwargs``) from
    ``variant``. The run counts as a "lifetime" run when ``env_kwargs`` has
    no ``time_horizon`` or sets it to 0; in that case the exploration policy
    is also used for evaluation.
    """
    # NOTE(review): imported but not referenced below - presumably kept for
    # an import side effect; confirm before removing.
    from rlkit.envs.gym_minigrid.gym_minigrid import envs

    env_kwargs = variant['env_kwargs']
    algo_cfg = variant['algo_kwargs']

    expl_env = ToolsEnv(**env_kwargs)
    eval_env = ToolsEnv(**env_kwargs)
    rollout_env = ToolsEnv(**env_kwargs)

    obs_dim = expl_env.observation_space.low.size
    n_actions = eval_env.action_space.n
    layer_size = algo_cfg['layer_size']

    lifetime = env_kwargs.get('time_horizon', 0) == 0
    if lifetime:
        assert eval_env.time_horizon == 0, 'cannot have time horizon for lifetime env'

    q_net = gen_network(algo_cfg, n_actions, layer_size)
    q_target = gen_network(algo_cfg, n_actions, layer_size)
    loss_fn = nn.MSELoss()

    eval_policy = ArgmaxDiscretePolicy(q_net)
    # Exploration wraps the greedy policy; the wrapper is built before
    # eval_policy may be rebound below.
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedyDecay(expl_env.action_space, 1e-5, 1, 0.1),
        eval_policy,
    )
    if lifetime:
        eval_policy = expl_policy

    # Pick the path-collector class from the env horizon and run mode.
    if eval_env.time_horizon != 0:
        collector_class = MdpPathCollectorConfig
    elif lifetime:
        collector_class = LifetimeMdpPathCollector
    else:
        collector_class = MdpPathCollector

    eval_collector = collector_class(eval_env, eval_policy)
    expl_collector = collector_class(expl_env, expl_policy)

    trainer = DoubleDQNTrainer(
        qf=q_net,
        target_qf=q_target,
        qf_criterion=loss_fn,
        **algo_cfg['trainer_kwargs']
    )
    buffer = EnvReplayBuffer(algo_cfg['replay_buffer_size'], expl_env)

    algorithm = TorchHumanInputLifetimeRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        rollout_env=rollout_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=buffer,
        **algo_cfg['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def tdm_td3_experiment(variant):
    """Build and train a TDM-TD3 agent from the settings in ``variant``.

    Reads ``env_class``, ``env_kwargs``, ``qf_kwargs``, ``policy_kwargs``,
    ``exploration_type`` (``'ou'``, ``'gaussian'`` or ``'epsilon'``),
    ``replay_buffer_class``, ``replay_buffer_kwargs``, ``qf_criterion_class``
    and ``algo_kwargs``. ``variant['algo_kwargs']`` is modified in place.

    :raises Exception: if ``exploration_type`` is not one of the three names.
    """
    env = variant['env_class'](**variant['env_kwargs'])
    # No TDM normalizer is used in this configuration.
    normalizer = None

    def build_critic():
        return TdmQf(
            env=env,
            vectorized=True,
            tdm_normalizer=normalizer,
            **variant['qf_kwargs']
        )

    critic_a = build_critic()
    critic_b = build_critic()
    actor = TdmPolicy(
        env=env,
        tdm_normalizer=normalizer,
        **variant['policy_kwargs']
    )

    # Lazy builders so that only the selected strategy is constructed.
    strategy_builders = {
        'ou': lambda: OUStrategy(action_space=env.action_space),
        # max_sigma == min_sigma, i.e. a constant noise scale.
        'gaussian': lambda: GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,
        ),
        'epsilon': lambda: EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        ),
    }
    exploration_type = variant['exploration_type']
    if exploration_type not in strategy_builders:
        raise Exception("Invalid type: " + exploration_type)
    noise = strategy_builders[exploration_type]()

    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=noise,
        policy=actor,
    )
    buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )

    # Inject the run-time objects into the (shared) algorithm kwargs.
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['td3_kwargs']['qf_criterion'] = variant['qf_criterion_class']()
    algo_kwargs['tdm_kwargs']['tdm_normalizer'] = normalizer

    algorithm = TdmTd3(
        env,
        qf1=critic_a,
        qf2=critic_b,
        replay_buffer=buffer,
        policy=actor,
        exploration_policy=noisy_actor,
        **algo_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()
def create_exploration_policy(
        env,
        policy,
        exploration_version='identity',
        repeat_prob=0.,
        prob_random_action=0.,
        exploration_noise=0.,
):
    """Wrap ``policy`` according to the named exploration scheme.

    Supported values of ``exploration_version``:

    * ``'identity'``: return ``policy`` unchanged;
    * ``'occasionally_repeat'``: ``ActionRepeatPolicy`` around ``policy``;
    * ``'epsilon_greedy'``: epsilon-greedy wrapper around ``policy``;
    * ``'epsilon_greedy_and_occasionally_repeat'``: ``ActionRepeatPolicy``
      around the epsilon-greedy wrapper;
    * ``'ou'``: OU-noise wrapper with ``max_sigma == min_sigma ==
      exploration_noise``.

    :param env: environment providing ``action_space`` (not accessed for
        ``'identity'`` and ``'occasionally_repeat'``).
    :param policy: the policy to wrap.
    :param exploration_version: one of the names listed above.
    :param repeat_prob: forwarded to ``ActionRepeatPolicy``.
    :param prob_random_action: forwarded to ``EpsilonGreedy``.
    :param exploration_noise: sigma used for the OU strategy.
    :return: the wrapped (or original) policy.
    :raises ValueError: if ``exploration_version`` is not recognized.
    """
    # TODO: merge with get_exploration_strategy
    if exploration_version == 'identity':
        return policy
    if exploration_version == 'occasionally_repeat':
        return ActionRepeatPolicy(policy, repeat_prob=repeat_prob)
    if exploration_version in (
            'epsilon_greedy',
            'epsilon_greedy_and_occasionally_repeat',
    ):
        # Both variants share the epsilon-greedy wrapper; the former
        # duplicate (unreachable) 'epsilon_greedy' branch has been removed.
        greedy_wrapped = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=EpsilonGreedy(
                action_space=env.action_space,
                prob_random_action=prob_random_action,
            ),
            policy=policy,
        )
        if exploration_version == 'epsilon_greedy':
            return greedy_wrapped
        return ActionRepeatPolicy(greedy_wrapped, repeat_prob=repeat_prob)
    if exploration_version == 'ou':
        return PolicyWrappedWithExplorationStrategy(
            exploration_strategy=OUStrategy(
                action_space=env.action_space,
                max_sigma=exploration_noise,
                min_sigma=exploration_noise,
            ),
            policy=policy,
        )
    raise ValueError(exploration_version)
def experiment(variant):
    """Train a DQN agent with a graph-network Q-function on a traffic env.

    Reads ``qf_kwargs``, ``epsilon``, ``replay_buffer_size``,
    ``trainer_kwargs`` and ``algorithm_kwargs`` from ``variant``. The
    environment name comes from ``args.exp_name``; ``args`` is not defined in
    this function and must exist in the enclosing module scope.
    """
    import sys
    from traffic.make_env import make_env

    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    n_actions = eval_env.action_space.n

    graph_builder = TrafficGraphBuilder(
        input_dim=4,
        ego_init=torch.tensor([0., 1.]),
        other_init=torch.tensor([1., 0.]),
        edge_index=torch.tensor([[0, 0, 1, 2],
                                 [1, 2, 0, 0]]),
    )
    q_net = GNNNet(
        pre_graph_builder=graph_builder,
        node_dim=16,
        output_dim=n_actions,
        post_mlp_kwargs=variant['qf_kwargs'],
        num_conv_layers=3,
    )
    # The target network starts as an exact copy of the online network.
    q_target = copy.deepcopy(q_net)

    greedy_policy = ArgmaxDiscretePolicy(q_net)
    eps_greedy_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space, variant['epsilon']),
        greedy_policy,
    )
    eval_collector = MdpPathCollector(eval_env, greedy_policy)
    expl_collector = MdpPathCollector(expl_env, eps_greedy_policy)

    buffer = PrioritizedReplayBuffer(variant['replay_buffer_size'], expl_env)
    loss_fn = nn.MSELoss()
    # The trainer also receives the replay buffer.
    trainer = DQNTrainer(
        qf=q_net,
        target_qf=q_target,
        qf_criterion=loss_fn,
        replay_buffer=buffer,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train DDPG on normalized ``HalfCheetahEnv`` environments.

    Steps:

    1. Create the evaluation and exploration environments.
    2. Determine the input/output sizes and build the Q-function and policy.
    3. Copy them to obtain the target Q-function and target policy.
    4. Build the path collector used for evaluation.
    5. Build the exploration policy, its path collector and the replay
       buffer used for training.
    6. Build the ``DDPGTrainer`` (Q-function, policy).
    7. Build the algorithm (trainer, environments, replay buffer, path
       collectors, evaluation parts).
    8. Start training.

    :param variant: configuration dict; ``qf_kwargs``, ``policy_kwargs``,
        ``replay_buffer_size``, ``trainer_kwargs`` and ``algorithm_kwargs``
        are read.
    :return: None
    """
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # A specific gym version can be used instead, e.g.
    # NormalizedBoxEnv(gym.make('HalfCheetah-v1')).
    n_obs = eval_env.observation_space.low.size
    n_act = eval_env.action_space.low.size

    critic = FlattenMlp(
        input_size=n_obs + n_act,
        output_size=1,
        **variant['qf_kwargs']
    )
    actor = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        **variant['policy_kwargs']
    )
    # Target networks are deep copies of the online networks.
    critic_target = copy.deepcopy(critic)
    actor_target = copy.deepcopy(actor)

    # Evaluation uses the plain policy.
    eval_collector = MdpPathCollector(eval_env, actor)

    # Training: exploration policy, path collection, replay buffer.
    ou_noise = OUStrategy(action_space=expl_env.action_space)
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=ou_noise,
        policy=actor,
    )
    expl_collector = MdpPathCollector(expl_env, noisy_actor)
    buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = DDPGTrainer(
        qf=critic,
        target_qf=critic_target,
        policy=actor,
        target_policy=actor_target,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=buffer,
        **variant['algorithm_kwargs']
    )
    # Move the algorithm to the configured device.
    algorithm.to(ptu.device)
    algorithm.train()
def get_validation_returns(self, snapshot):
    """Roll out the snapshot's evaluation policy on the validation envs.

    The policy stored under ``snapshot['evaluation/policy']`` is wrapped in
    an epsilon-greedy strategy (second argument 0.1) and rolled out once per
    environment found under the ``'envs'`` key of the pickle file
    ``self.validation_envs_pkl``, each for ``self.validation_rollout_length``.

    :param snapshot: dict containing the ``'evaluation/policy'`` entry.
    :return: ``{'returns': mean of the summed rewards over all envs}``.
    """
    policy = snapshot['evaluation/policy']
    policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(self.eval_env.action_space, 0.1),
        policy,
    )
    # NOTE: unpickling executes arbitrary code - the validation file must
    # come from a trusted source.
    # The context manager closes the file handle, which used to be leaked.
    with open(self.validation_envs_pkl, 'rb') as env_file:
        validation_envs = pickle.load(env_file)
    returns = np.zeros(len(validation_envs['envs']))
    for env_idx, env in enumerate(validation_envs['envs']):
        path = rollout(env, policy, self.validation_rollout_length)
        returns[env_idx] = path['rewards'].sum()
    return {'returns': returns.mean()}
def experiment(variant):
    """Train TD3 on a 2D pusher environment.

    With ``variant['multitask']`` truthy the environment is a
    ``CylinderXYPusher2DEnv`` wrapped in ``MultitaskToFlatEnv``, otherwise a
    ``Pusher2DEnv``. Also reads ``env_kwargs``, ``normalize``,
    ``exploration_type`` (``'ou'``, ``'gaussian'`` or ``'epsilon'``) and
    ``algo_kwargs``.

    :raises Exception: if ``exploration_type`` is not one of the three names.
    """
    if not variant['multitask']:
        env = Pusher2DEnv(**variant['env_kwargs'])
    else:
        env = MultitaskToFlatEnv(
            CylinderXYPusher2DEnv(**variant['env_kwargs'])
        )
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    # Lazy builders so that only the selected strategy is constructed.
    strategy_builders = {
        'ou': lambda: OUStrategy(action_space=env.action_space),
        # max_sigma == min_sigma, i.e. a constant noise scale.
        'gaussian': lambda: GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,
        ),
        'epsilon': lambda: EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        ),
    }
    exploration_type = variant['exploration_type']
    if exploration_type not in strategy_builders:
        raise Exception("Invalid type: " + exploration_type)
    noise = strategy_builders[exploration_type]()

    n_obs = env.observation_space.low.size
    n_act = env.action_space.low.size

    def build_critic():
        return ConcatMlp(
            input_size=n_obs + n_act,
            output_size=1,
            hidden_sizes=[400, 300],
        )

    critic_a = build_critic()
    critic_b = build_critic()
    actor = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        hidden_sizes=[400, 300],
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=noise,
        policy=actor,
    )

    algorithm = TD3(
        env,
        qf1=critic_a,
        qf2=critic_b,
        policy=actor,
        exploration_policy=noisy_actor,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train a CNN-based DQN learner used inside a ``LearnPlanPolicy``.

    Reads ``anneal_rate``, ``trainer_kwargs``, ``replay_buffer_size`` and
    ``algorithm_kwargs`` from ``variant``.

    NOTE(review): ``obs_dim``, ``channels``, ``action_dim``,
    ``symbolic_action_space``, ``eval_env``, ``expl_env`` and ``symb_env``
    are not defined in this function - they are presumably module-level
    names; confirm against the rest of the file.
    """
    def build_q_network():
        return CNN(
            input_width=obs_dim,
            input_height=obs_dim,
            input_channels=channels,
            output_size=action_dim,
            kernel_sizes=[8, 4],
            n_channels=[16, 32],
            strides=[4, 2],
            paddings=[0, 0],
            hidden_sizes=[256],
        )

    q_net = build_q_network()
    q_target = build_q_network()
    loss_fn = nn.MSELoss()

    # Learner policies: greedy for evaluation, annealed epsilon-greedy for
    # exploration.
    eval_learner = ArgmaxDiscretePolicy(q_net)
    annealed_eps = AnnealedEpsilonGreedy(
        symbolic_action_space,
        anneal_rate=variant["anneal_rate"],
    )
    expl_learner = PolicyWrappedWithExplorationStrategy(
        annealed_eps,
        eval_learner,
    )

    # Both learners are wrapped in a LearnPlanPolicy before collection.
    eval_policy = LearnPlanPolicy(eval_learner)
    expl_policy = LearnPlanPolicy(expl_learner)
    eval_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        rollout=hierarchical_rollout,
    )
    expl_collector = MdpPathCollector(
        expl_env,
        expl_policy,
        rollout=hierarchical_rollout,
    )

    trainer = DQNTrainer(
        qf=q_net,
        target_qf=q_target,
        qf_criterion=loss_fn,
        **variant["trainer_kwargs"]
    )
    # The replay buffer is sized from symb_env, not from expl_env.
    buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=buffer,
        **variant["algorithm_kwargs"]
    )
    algorithm.to(ptu.device)
    algorithm.train()
def td3_experiment(variant):
    """Train TD3 on a multitask environment flattened to a regular one.

    Reads ``env_class``, ``env_kwargs``, ``make_silent_env`` (optional,
    defaults to True), ``normalize``, ``exploration_type`` (``'ou'``,
    ``'gaussian'`` or ``'epsilon'``), ``qf_kwargs``, ``policy_kwargs`` and
    ``algo_kwargs`` from ``variant``.

    :raises Exception: if ``exploration_type`` is not one of the three names.
    """
    env = MultitaskToFlatEnv(variant['env_class'](**variant['env_kwargs']))
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    exploration_type = variant['exploration_type']
    if exploration_type == 'epsilon':
        noise = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    elif exploration_type == 'gaussian':
        # max_sigma == min_sigma, i.e. a constant noise scale.
        noise = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,
        )
    elif exploration_type == 'ou':
        noise = OUStrategy(action_space=env.action_space)
    else:
        raise Exception("Invalid type: " + exploration_type)

    n_obs = env.observation_space.low.size
    n_act = env.action_space.low.size

    def build_critic():
        return ConcatMlp(
            input_size=n_obs + n_act,
            output_size=1,
            **variant['qf_kwargs']
        )

    critic_a = build_critic()
    critic_b = build_critic()
    actor = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        **variant['policy_kwargs']
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=noise,
        policy=actor,
    )

    algorithm = TD3(
        env,
        qf1=critic_a,
        qf2=critic_b,
        policy=actor,
        exploration_policy=noisy_actor,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train a goal-conditioned DQN with HER on ``GoalGridworld-v0``.

    Reads ``replay_buffer_kwargs``, ``trainer_kwargs`` and ``algo_kwargs``
    from ``variant``. The Q-network takes observation and desired goal
    together and outputs one value per discrete action.
    """
    expl_env = gym.make('GoalGridworld-v0')
    eval_env = gym.make('GoalGridworld-v0')

    obs_spaces = expl_env.observation_space.spaces
    n_obs = obs_spaces['observation'].low.size
    n_goal = obs_spaces['desired_goal'].low.size
    n_actions = expl_env.action_space.n

    def build_q_network():
        return FlattenMlp(
            input_size=n_obs + n_goal,
            output_size=n_actions,
            hidden_sizes=[400, 300],
        )

    q_net = build_q_network()
    q_target = build_q_network()

    greedy_policy = ArgmaxDiscretePolicy(q_net)
    eps_greedy_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=EpsilonGreedy(action_space=expl_env.action_space),
        policy=greedy_policy,
    )
    # The relabeling buffer is built from the evaluation environment.
    buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        **variant['replay_buffer_kwargs']
    )

    # Keys under which the collectors read observation and goal.
    collector_keys = dict(
        observation_key='observation',
        desired_goal_key='desired_goal',
    )
    eval_collector = GoalConditionedPathCollector(
        eval_env,
        greedy_policy,
        **collector_keys
    )
    expl_collector = GoalConditionedPathCollector(
        expl_env,
        eps_greedy_policy,
        **collector_keys
    )

    dqn_trainer = DQNTrainer(
        qf=q_net,
        target_qf=q_target,
        **variant['trainer_kwargs']
    )
    trainer = HERTrainer(dqn_trainer)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train HER-TD3 on gym's ``SawyerReachXYZEnv-v0``.

    Reads ``replay_buffer_kwargs`` and ``algo_kwargs`` from ``variant``.
    Exploration combines Gaussian noise (sigma fixed at 0.2) with
    ``epsilon=.3``.
    """
    env = gym.make('SawyerReachXYZEnv-v0')
    # max_sigma == min_sigma, i.e. a constant noise scale.
    noise = GaussianAndEpislonStrategy(
        action_space=env.action_space,
        max_sigma=.2,
        min_sigma=.2,
        epsilon=.3,
    )

    obs_spaces = env.observation_space.spaces
    n_obs = obs_spaces['observation'].low.size
    n_goal = obs_spaces['desired_goal'].low.size
    n_act = env.action_space.low.size

    def build_critic():
        return FlattenMlp(
            input_size=n_obs + n_goal + n_act,
            output_size=1,
            hidden_sizes=[400, 300],
        )

    critic_a = build_critic()
    critic_b = build_critic()
    actor = TanhMlpPolicy(
        input_size=n_obs + n_goal,
        output_size=n_act,
        hidden_sizes=[400, 300],
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=noise,
        policy=actor,
    )
    # NOTE(review): the buffer uses the 'state_*' goal keys while her_kwargs
    # below uses 'observation' / 'desired_goal' - confirm this is intended.
    buffer = ObsDictRelabelingBuffer(
        env=env,
        achieved_goal_key='state_achieved_goal',
        desired_goal_key='state_desired_goal',
        **variant['replay_buffer_kwargs']
    )

    her_settings = dict(
        observation_key='observation',
        desired_goal_key='desired_goal'
    )
    td3_settings = dict(
        env=env,
        qf1=critic_a,
        qf2=critic_b,
        policy=actor,
        exploration_policy=noisy_actor
    )
    algorithm = HerTd3(
        replay_buffer=buffer,
        her_kwargs=her_settings,
        td3_kwargs=td3_settings,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train a DQN agent on ``GymCraftingEnv`` with the eat-bread task.

    Reads ``trainer_kwargs``, ``replay_buffer_size`` and ``algorithm_kwargs``
    from ``variant``. Swap ``success_function`` to target a different task.
    """
    env_settings = dict(
        state_obs=True,
        few_obj=True,
        success_function=eval_eatbread,
    )
    expl_env = GymCraftingEnv(**env_settings)
    eval_env = GymCraftingEnv(**env_settings)
    n_obs = expl_env.observation_space.low.size
    n_actions = eval_env.action_space.n

    def build_q_network():
        return Mlp(
            hidden_sizes=[32, 32],
            input_size=n_obs,
            output_size=n_actions,
        )

    q_net = build_q_network()
    q_target = build_q_network()
    loss_fn = nn.MSELoss()

    # Greedy policy for evaluation, epsilon-greedy wrapper for exploration.
    greedy_policy = ArgmaxDiscretePolicy(q_net)
    eps_greedy_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        greedy_policy,
    )
    eval_collector = MdpPathCollector(eval_env, greedy_policy)
    expl_collector = MdpPathCollector(expl_env, eps_greedy_policy)

    trainer = DQNTrainer(
        qf=q_net,
        target_qf=q_target,
        qf_criterion=loss_fn,
        **variant['trainer_kwargs']
    )
    buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train HER-TD3 on ``variant['env_class']``.

    Reads ``env_class``, ``env_kwargs``, ``normalize``, ``exploration_type``
    (``'ou'``, ``'gaussian'`` or ``'epsilon'``), ``replay_buffer_class``,
    ``replay_buffer_kwargs`` and ``algo_kwargs`` from ``variant``. Network
    inputs additionally include a goal of size ``env.goal_dim``.

    :raises Exception: if ``exploration_type`` is not one of the three names.
    """
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    exploration_type = variant['exploration_type']
    if exploration_type == 'epsilon':
        noise = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    elif exploration_type == 'gaussian':
        # max_sigma == min_sigma, i.e. a constant noise scale.
        noise = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,
        )
    elif exploration_type == 'ou':
        noise = OUStrategy(action_space=env.action_space)
    else:
        raise Exception("Invalid type: " + exploration_type)

    n_obs = env.observation_space.low.size
    n_act = env.action_space.low.size
    n_goal = env.goal_dim

    def build_critic():
        return ConcatMlp(
            input_size=n_obs + n_act + n_goal,
            output_size=1,
            hidden_sizes=[400, 300],
        )

    critic_a = build_critic()
    critic_b = build_critic()
    actor = TanhMlpPolicy(
        input_size=n_obs + n_goal,
        output_size=n_act,
        hidden_sizes=[400, 300],
    )
    noisy_actor = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=noise,
        policy=actor,
    )
    buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )

    algorithm = HerTd3(
        env,
        qf1=critic_a,
        qf2=critic_b,
        policy=actor,
        exploration_policy=noisy_actor,
        replay_buffer=buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train a DQN agent on the environment built by ``environment(args, 'dqn')``.

    ``args`` comes from ``getArgs()``. Reads ``trainer_kwargs``,
    ``replay_buffer_size`` and ``algorithm_kwargs`` from ``variant``. The
    observation size is taken from the environment's ``get_obsdim()``.
    """
    args = getArgs()
    expl_env = environment(args, 'dqn')
    eval_env = environment(args, 'dqn')
    n_obs = expl_env.get_obsdim()
    n_actions = expl_env.action_space.n

    def build_q_network():
        return Mlp(
            hidden_sizes=[32, 32],
            input_size=n_obs,
            output_size=n_actions,
        )

    q_net = build_q_network()
    q_target = build_q_network()
    loss_fn = nn.MSELoss()

    # Greedy policy for evaluation, epsilon-greedy wrapper for exploration.
    greedy_policy = ArgmaxDiscretePolicy(q_net)
    eps_greedy_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        greedy_policy,
    )
    eval_collector = MdpPathCollector(eval_env, greedy_policy)
    expl_collector = MdpPathCollector(expl_env, eps_greedy_policy)

    trainer = DQNTrainer(
        qf=q_net,
        target_qf=q_target,
        qf_criterion=loss_fn,
        **variant['trainer_kwargs']
    )
    buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()