def test_agent(self):
    # No optimizer is needed here: the test agent only runs the copied policy
    # for evaluation, so None is passed in the optimizer slot.
    policy = DeterministicPolicy(
        copy.deepcopy(self.policy_model),
        None,
        self.action_space,
    )
    return TimeFeature(DDPGTestAgent(policy))

def main():
    parser = argparse.ArgumentParser(description="Watch a continuous agent.")
    parser.add_argument("env", help="ID of the environment")
    parser.add_argument("dir", help="Directory where the agent's model was saved.")
    parser.add_argument(
        "--device",
        default="cpu",
        help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)",
    )
    parser.add_argument(
        "--fps",
        default=120,
        type=int,
        help="Playback speed in frames per second.",
    )
    args = parser.parse_args()
    if args.env in ENVS:
        env_id = ENVS[args.env]
    else:
        env_id = args.env
    env = GymEnvironment(env_id, device=args.device)
    agent = TimeFeature(GreedyAgent.load(args.dir, env))
    watch(agent, env, fps=args.fps)

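# Illustrative sketch only: main() above assumes an ENVS mapping from short
# names to Gym environment IDs, defined elsewhere in the script. It might look
# roughly like the following (hypothetical entries, not the actual table):
EXAMPLE_ENVS = {
    "ant": "AntBulletEnv-v0",
    "half_cheetah": "HalfCheetahBulletEnv-v0",
    "walker": "Walker2DBulletEnv-v0",
}
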
def _ppo(envs, writer=DummyWriter()):
    # One scheduler step is taken per minibatch per epoch for every rollout of
    # n_steps * n_envs frames, so this is the total number of updates.
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs)
    env = envs[0]
    feature_model, value_model, policy_model = fc_actor_critic(env)
    feature_model.to(device)
    value_model.to(device)
    policy_model.to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer,
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = GaussianPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )
    return TimeFeature(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                clip_initial,
                clip_final,
                0,
                final_anneal_step,
                name='clip',
                writer=writer,
            ),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
    )

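# Worked example of the anneal-step arithmetic in _ppo above, using illustrative
# numbers (not necessarily this preset's defaults): each rollout gathers
# n_steps * n_envs frames and triggers epochs * minibatches optimizer updates,
# so the schedules run out after roughly last_frame environment frames.
_demo_last_frame, _demo_epochs, _demo_minibatches = 10_000_000, 20, 4
_demo_n_steps, _demo_n_envs = 128, 32
_demo_anneal_step = (_demo_last_frame * _demo_epochs * _demo_minibatches
                     / (_demo_n_steps * _demo_n_envs))
assert int(_demo_anneal_step) == 195_312
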
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (
        train_steps
        * self.hyperparameters['epochs']
        * self.hyperparameters['minibatches']
        / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
    )
    value_optimizer = Adam(
        self.value_model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps'],
    )
    policy_optimizer = Adam(
        self.policy_model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps'],
    )
    features = Identity(self.device)
    v = VNetwork(
        self.value_model,
        value_optimizer,
        loss_scaling=self.hyperparameters['value_loss_scaling'],
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
    )
    policy = GaussianPolicy(
        self.policy_model,
        policy_optimizer,
        self.action_space,
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
    )
    return TimeFeature(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                self.hyperparameters['clip_initial'],
                self.hyperparameters['clip_final'],
                0,
                n_updates,
                name='clip',
                writer=writer,
            ),
            epochs=self.hyperparameters['epochs'],
            minibatches=self.hyperparameters['minibatches'],
            n_envs=self.hyperparameters['n_envs'],
            n_steps=self.hyperparameters['n_steps'],
            discount_factor=self.hyperparameters['discount_factor'],
            lam=self.hyperparameters['lam'],
            entropy_loss_scaling=self.hyperparameters['entropy_loss_scaling'],
            writer=writer,
        )
    )

def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (
        train_steps - self.hyperparameters["replay_start_size"]
    ) / self.hyperparameters["update_frequency"]
    q_optimizer = Adam(self.q_model.parameters(), lr=self.hyperparameters["lr_q"])
    q = QContinuous(
        self.q_model,
        q_optimizer,
        target=PolyakTarget(self.hyperparameters["polyak_rate"]),
        scheduler=CosineAnnealingLR(q_optimizer, n_updates),
        writer=writer,
    )
    policy_optimizer = Adam(
        self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"]
    )
    policy = DeterministicPolicy(
        self.policy_model,
        policy_optimizer,
        self.action_space,
        target=PolyakTarget(self.hyperparameters["polyak_rate"]),
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters["replay_buffer_size"], device=self.device
    )
    return TimeFeature(
        DDPG(
            q,
            policy,
            replay_buffer,
            self.action_space,
            noise=self.hyperparameters["noise"],
            replay_start_size=self.hyperparameters["replay_start_size"],
            discount_factor=self.hyperparameters["discount_factor"],
            update_frequency=self.hyperparameters["update_frequency"],
            minibatch_size=self.hyperparameters["minibatch_size"],
        )
    )

def _ddpg(env, writer=DummyWriter()):
    # Anneal the schedulers over the number of updates performed after the
    # replay buffer has been filled to replay_start_size.
    final_anneal_step = (last_frame - replay_start_size) // update_frequency
    q_model = fc_q(env).to(device)
    q_optimizer = Adam(q_model.parameters(), lr=lr_q)
    q = QContinuous(
        q_model,
        q_optimizer,
        target=PolyakTarget(polyak_rate),
        scheduler=CosineAnnealingLR(q_optimizer, final_anneal_step),
        writer=writer,
    )
    policy_model = fc_deterministic_policy(env).to(device)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
    policy = DeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        target=PolyakTarget(polyak_rate),
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return TimeFeature(
        DDPG(
            q,
            policy,
            replay_buffer,
            env.action_space,
            noise=noise,
            replay_start_size=replay_start_size,
            discount_factor=discount_factor,
            update_frequency=update_frequency,
            minibatch_size=minibatch_size,
        )
    )

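# Generic sketch of the soft ("Polyak") target averaging that PolyakTarget
# parameterizes above, as it is commonly implemented. This is not the library's
# actual internals; it assumes polyak_rate is the fraction of the online
# weights mixed into the target copy at each update step.
import torch

def _demo_polyak_update(target_params, online_params, polyak_rate):
    with torch.no_grad():
        for t, p in zip(target_params, online_params):
            # target <- (1 - rate) * target + rate * online
            t.mul_(1.0 - polyak_rate).add_(polyak_rate * p)
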
def setUp(self):
    torch.manual_seed(2)
    self.test_agent = TestAgent()
    self.agent = TimeFeature(self.test_agent)

def _sac(env, writer=DummyWriter()):
    final_anneal_step = (last_frame - replay_start_size) // update_frequency
    q_1_model = fc_q(env).to(device)
    q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q)
    q_1 = QContinuous(
        q_1_model,
        q_1_optimizer,
        scheduler=CosineAnnealingLR(q_1_optimizer, final_anneal_step),
        writer=writer,
        name='q_1',
    )
    q_2_model = fc_q(env).to(device)
    q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q)
    q_2 = QContinuous(
        q_2_model,
        q_2_optimizer,
        scheduler=CosineAnnealingLR(q_2_optimizer, final_anneal_step),
        writer=writer,
        name='q_2',
    )
    v_model = fc_v(env).to(device)
    v_optimizer = Adam(v_model.parameters(), lr=lr_v)
    v = VNetwork(
        v_model,
        v_optimizer,
        scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
        target=PolyakTarget(polyak_rate),
        writer=writer,
        name='v',
    )
    policy_model = fc_soft_policy(env).to(device)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
    policy = SoftDeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return TimeFeature(
        SAC(
            policy,
            q_1,
            q_2,
            v,
            replay_buffer,
            temperature_initial=temperature_initial,
            entropy_target=(-env.action_space.shape[0] * entropy_target_scaling),
            lr_temperature=lr_temperature,
            replay_start_size=replay_start_size,
            discount_factor=discount_factor,
            update_frequency=update_frequency,
            minibatch_size=minibatch_size,
            writer=writer,
        )
    )

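# The entropy_target passed to SAC above follows the common heuristic of
# targeting the negative action dimensionality, scaled by
# entropy_target_scaling. Illustrative check with hypothetical numbers:
# a 6-dimensional action space and a scaling of 1.0 give a target of -6.0.
_demo_action_dims, _demo_scaling = 6, 1.0
assert -_demo_action_dims * _demo_scaling == -6.0
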
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (
        train_steps - self.hyperparameters["replay_start_size"]
    ) / self.hyperparameters["update_frequency"]
    q_1_optimizer = Adam(self.q_1_model.parameters(), lr=self.hyperparameters["lr_q"])
    q_1 = QContinuous(
        self.q_1_model,
        q_1_optimizer,
        scheduler=CosineAnnealingLR(q_1_optimizer, n_updates),
        writer=writer,
        name='q_1',
    )
    q_2_optimizer = Adam(self.q_2_model.parameters(), lr=self.hyperparameters["lr_q"])
    q_2 = QContinuous(
        self.q_2_model,
        q_2_optimizer,
        scheduler=CosineAnnealingLR(q_2_optimizer, n_updates),
        writer=writer,
        name='q_2',
    )
    v_optimizer = Adam(self.v_model.parameters(), lr=self.hyperparameters["lr_v"])
    v = VNetwork(
        self.v_model,
        v_optimizer,
        scheduler=CosineAnnealingLR(v_optimizer, n_updates),
        target=PolyakTarget(self.hyperparameters["polyak_rate"]),
        writer=writer,
        name='v',
    )
    policy_optimizer = Adam(
        self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"]
    )
    policy = SoftDeterministicPolicy(
        self.policy_model,
        policy_optimizer,
        self.action_space,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters["replay_buffer_size"], device=self.device
    )
    return TimeFeature(
        SAC(
            policy,
            q_1,
            q_2,
            v,
            replay_buffer,
            temperature_initial=self.hyperparameters["temperature_initial"],
            entropy_target=(
                -self.action_space.shape[0]
                * self.hyperparameters["entropy_target_scaling"]
            ),
            lr_temperature=self.hyperparameters["lr_temperature"],
            replay_start_size=self.hyperparameters["replay_start_size"],
            discount_factor=self.hyperparameters["discount_factor"],
            update_frequency=self.hyperparameters["update_frequency"],
            minibatch_size=self.hyperparameters["minibatch_size"],
            writer=writer,
        )
    )

def test_agent(self):
    policy = SoftDeterministicPolicy(
        copy.deepcopy(self.policy_model), space=self.action_space
    )
    return TimeFeature(SACTestAgent(policy))

def test_agent(self):
    policy = GaussianPolicy(
        copy.deepcopy(self.policy_model), space=self.action_space
    )
    return TimeFeature(PPOTestAgent(Identity(self.device), policy))

def _sac(env, writer=DummyWriter()):
    final_anneal_step = (last_frame - replay_start_size) // update_frequency
    v_model = v_model_constructor(env).to(device)
    q_1_model = q1_model_constructor(env).to(device)
    q_2_model = q2_model_constructor(env).to(device)
    # Quick-and-dirty un/freeze of the parallel branch: train_parallel controls
    # which parameters of the policy model remain trainable.
    policy_model = policy_model_constructor(
        env=env, train_parallel=train_parallel).to(device)
    if pretrained_models is not None:
        q_1_model = pretrained_models.q_1.model.to(device)
        q_2_model = pretrained_models.q_2.model.to(device)
        v_model = pretrained_models.v.model.to(device)
        policy_model = pretrained_models.policy.model.to(device)
    q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q)
    q_1 = QContinuousCtrlRep(
        q_1_model,
        q_1_optimizer,
        scheduler=CosineAnnealingLR(q_1_optimizer, final_anneal_step),
        target=FixedTarget(1000),
        writer=writer,
        name='q_1',
    )
    q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q)
    q_2 = QContinuousCtrlRep(
        q_2_model,
        q_2_optimizer,
        scheduler=CosineAnnealingLR(q_2_optimizer, final_anneal_step),
        target=FixedTarget(1000),
        writer=writer,
        name='q_2',
    )
    v_optimizer = Adam(v_model.parameters(), lr=lr_v)
    v = VNetworkCtrlRep(
        v_model,
        v_optimizer,
        scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
        target=PolyakTarget(polyak_rate),
        writer=writer,
        name='v',
    )
    # Only optimize the parameters that remain trainable after the freeze.
    policy_optimizer = Adam(
        filter(lambda p: p.requires_grad, policy_model.parameters()), lr=lr_pi
    )
    policy = SoftDeterministicPolicyCtrlRep(
        policy_model,
        policy_optimizer,
        env.action_space,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        target=FixedTarget(1000),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return TimeFeature(
        SACCtrlRep(
            policy=policy,
            q_1=q_1,
            q_2=q_2,
            v=v,
            replay_buffer=replay_buffer,
            temperature_initial=temperature_initial,
            entropy_target=(-env.action_space.shape[0] * entropy_target_scaling),
            lr_temperature=lr_temperature,
            replay_start_size=replay_start_size,
            discount_factor=discount_factor,
            update_frequency=update_frequency,
            minibatch_size=minibatch_size,
            writer=writer,
        )
    )

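# Minimal, self-contained sketch of the un/freeze pattern used above:
# parameters with requires_grad=False are excluded from the optimizer, so only
# the unfrozen branch of the policy network is trained. The module below is
# purely illustrative; it is not the actual policy_model_constructor.
import torch.nn as nn
from torch.optim import Adam

_demo_policy = nn.Sequential(nn.Linear(8, 64), nn.ReLU(), nn.Linear(64, 4))
for _p in _demo_policy[0].parameters():  # freeze the first layer as an example
    _p.requires_grad = False
_demo_optimizer = Adam(
    filter(lambda p: p.requires_grad, _demo_policy.parameters()), lr=1e-3
)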