def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2):
    self.gamma = gamma
    self.n_actions = n_actions
    self.action = None
    self.action_space = [i for i in range(self.n_actions)]
    self.actor_critic = ActorCriticNetwork(n_actions=n_actions)
    self.actor_critic.compile(optimizer=Adam(learning_rate=alpha))
import torch
import torch.nn.functional as F


class Agent(object):
    def __init__(self, in_dims, out_dim, lr=0.000005, gamma=0.99):
        self.gamma = gamma
        self.lr = lr
        # ActorCriticNetwork is assumed to be defined/imported elsewhere.
        self.net = ActorCriticNetwork(in_dims, out_dim, self.lr)
        self.log_prob = None

    def get_action(self, observation):
        tensor = torch.tensor([observation], dtype=torch.float)
        state = tensor.to(self.net.device)
        policy, _ = self.net.forward(state)
        policy = F.softmax(policy, dim=1)
        probabilities = torch.distributions.Categorical(policy)
        action = probabilities.sample()
        # Cache the log-probability of the sampled action for the next learn() call.
        self.log_prob = probabilities.log_prob(action)
        return action.item()

    def learn(self, state, reward, next_state, done):
        self.net.optimizer.zero_grad()
        tensor = torch.tensor([state], dtype=torch.float)
        state = tensor.to(self.net.device)
        tensor = torch.tensor([next_state], dtype=torch.float)
        next_state = tensor.to(self.net.device)
        tensor = torch.tensor([reward], dtype=torch.float)
        reward = tensor.to(self.net.device)

        _, value = self.net.forward(state)
        _, next_value = self.net.forward(next_state)

        # One-step TD error; the bootstrap term is masked out on terminal states.
        delta = reward + self.gamma * next_value * (1 - int(done)) - value
        actor_loss = -self.log_prob * delta
        critic_loss = delta**2

        (actor_loss + critic_loss).backward()
        self.net.optimizer.step()
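# A minimal usage sketch for the PyTorch agent above, assuming the classic
# Gym API (env.reset() returns only the observation, env.step() returns a
# 4-tuple) and CartPole purely as an illustration; the environment name,
# episode count, and the way in_dims / out_dim are derived are assumptions,
# not part of the original snippet.
import gym

env = gym.make('CartPole-v1')
agent = Agent(in_dims=env.observation_space.shape[0], out_dim=env.action_space.n)

for episode in range(500):
    state = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        # Online one-step update after every transition.
        agent.learn(state, reward, next_state, done)
        state = next_state
        score += reward
    print('episode', episode, 'score', score)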
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam


class Agent:
    def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2):
        self.gamma = gamma
        self.n_actions = n_actions
        self.action = None
        self.action_space = [i for i in range(self.n_actions)]
        # ActorCriticNetwork is assumed to be defined/imported elsewhere.
        self.actor_critic = ActorCriticNetwork(n_actions=n_actions)
        self.actor_critic.compile(optimizer=Adam(learning_rate=alpha))

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])
        _, probs = self.actor_critic(state)
        action_probabilities = tfp.distributions.Categorical(probs=probs)
        action = action_probabilities.sample()
        self.action = action
        return action.numpy()[0]

    def save_models(self):
        print('... saving models ...')
        self.actor_critic.save_weights(self.actor_critic.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor_critic.load_weights(self.actor_critic.checkpoint_file)

    def learn(self, state, reward, state_, done):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        state_ = tf.convert_to_tensor([state_], dtype=tf.float32)
        reward = tf.convert_to_tensor(reward, dtype=tf.float32)  # not fed to the network
        with tf.GradientTape(persistent=True) as tape:
            state_value, probs = self.actor_critic(state)
            state_value_, _ = self.actor_critic(state_)
            # Remove the size-1 batch dimension from the value estimates.
            state_value = tf.squeeze(state_value)
            state_value_ = tf.squeeze(state_value_)

            action_probs = tfp.distributions.Categorical(probs=probs)
            log_prob = action_probs.log_prob(self.action)

            # One-step TD error; the bootstrap term is masked out on terminal states.
            delta = reward + self.gamma * state_value_ * (1 - int(done)) - state_value
            actor_loss = -log_prob * delta
            critic_loss = delta**2
            total_loss = actor_loss + critic_loss

        gradient = tape.gradient(total_loss,
                                 self.actor_critic.trainable_variables)
        self.actor_critic.optimizer.apply_gradients(
            zip(gradient, self.actor_critic.trainable_variables))
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam


class Agent:
    def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2):
        self.gamma = gamma
        self.n_actions = n_actions
        self.action = None
        self.action_space = [i for i in range(self.n_actions)]
        # ActorCriticNetwork is assumed to be defined/imported elsewhere.
        self.actor_critic = ActorCriticNetwork(n_actions=n_actions)
        self.actor_critic.compile(optimizer=Adam(learning_rate=alpha))

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])  # add batch dimension
        _, probs = self.actor_critic(state)
        action_probabilities = tfp.distributions.Categorical(probs=probs)
        action = action_probabilities.sample()
        self.action = action
        return action.numpy()[0]  # remove batch dimension

    def save_model(self):
        print('Saving model.')
        self.actor_critic.save_weights(self.actor_critic.checkpoint_file)

    def load_model(self):
        print('Loading model.')
        self.actor_critic.load_weights(self.actor_critic.checkpoint_file)

    def learn(self, state, reward, next_state, done):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        next_state = tf.convert_to_tensor([next_state], dtype=tf.float32)
        reward = tf.convert_to_tensor(reward, dtype=tf.float32)

        with tf.GradientTape() as tape:
            state_value, probs = self.actor_critic(state)
            next_state_value, _ = self.actor_critic(next_state)
            # Remove the size-1 batch dimension from the value estimates.
            state_value = tf.squeeze(state_value)
            next_state_value = tf.squeeze(next_state_value)

            action_probs = tfp.distributions.Categorical(probs=probs)
            log_prob = action_probs.log_prob(self.action)

            # One-step TD error; the bootstrap term is masked out on terminal states.
            delta = reward + self.gamma * next_state_value * (1 - int(done)) - state_value
            actor_loss = -log_prob * delta
            critic_loss = delta**2
            total_loss = actor_loss + critic_loss

        gradient = tape.gradient(total_loss,
                                 self.actor_critic.trainable_variables)
        self.actor_critic.optimizer.apply_gradients(
            zip(gradient, self.actor_critic.trainable_variables))
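# A minimal usage sketch for the TensorFlow agent above, again assuming the
# classic Gym API and CartPole purely as an illustration; nothing here comes
# from the original snippet except the Agent interface itself.
import gym

env = gym.make('CartPole-v1')
agent = Agent(alpha=0.0003, gamma=0.99, n_actions=env.action_space.n)

for episode in range(500):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(observation)
        next_observation, reward, done, info = env.step(action)
        # Online one-step actor-critic update after every transition.
        agent.learn(observation, reward, next_observation, done)
        observation = next_observation
        score += reward
    print('episode', episode, 'score', score)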
# Trainer __init__ fragment; torch, os, glob, collections.deque, SummaryWriter
# and the project-specific helpers (make_vec_envs, ActorCriticNetwork, A2C, PPO,
# RolloutStorage) are assumed to be imported elsewhere in the module.
def __init__(self, **args):
    cuda = not args['no_cuda'] and torch.cuda.is_available()
    self.device = torch.device("cuda:0" if cuda else "cpu")
    print("Model running on device: {}".format(self.device))
    torch.set_num_threads(1)

    # Experiment / environment configuration.
    self.env_name = args['env_name']
    self.epochs = args['epochs']
    self.num_processes = args['num_processes']
    self.num_steps = args['num_steps']
    self.num_test_episodes = args['num_test_episodes']
    self.test_every_n_epochs = args['test_every_n_epochs']
    self.use_deterministic_policy_while_testing = args['use_deterministic_policy_while_testing']
    self.grayscale = args['grayscale']
    self.skip_frame = args['skip_frame']
    self.num_frame_stack = args['num_frame_stack']
    self.num_updates_per_epoch = args['num_updates_per_epoch']
    self.use_gae = args['use_gae']
    self.gamma = args['gamma']
    self.tau = args['tau']
    self.reward_scaling = args['reward_scaling']
    self.seed = args['seed']
    self.log_dir = args['log_dir']
    self.save_dir = args['save_dir']

    # Create (or clean) the training and evaluation log directories.
    try:
        os.makedirs(args['log_dir'])
        files = glob.glob(os.path.join(args['log_dir'], '*.manifest.json'))
        for f in files:
            os.remove(f)
    except OSError:
        files = glob.glob(os.path.join(args['log_dir'], '*.monitor.csv'))
        for f in files:
            os.remove(f)

    self.eval_log_dir = args['log_dir'] + "_eval"
    try:
        os.makedirs(self.eval_log_dir)
    except OSError:
        files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    self.envs = make_vec_envs(self.env_name, self.seed, self.num_processes,
                              self.gamma, self.log_dir, self.device, False,
                              self.grayscale, self.skip_frame, self.reward_scaling,
                              num_frame_stack=self.num_frame_stack)

    self.algorithm = args['algorithm']

    # Decreasing LR scheduler (only set for PPO with decreasing_lr enabled).
    self.scheduler = None

    if self.algorithm == 'A2C':
        actor_critic = ActorCriticNetwork(self.envs.observation_space.shape,
                                          self.envs.action_space,
                                          base_kwargs=args['policy_parameters'])
        actor_critic.to(self.device)
        self.policy = actor_critic
        self.agent = A2C(actor_critic, **args['algorithm_parameters'])
    elif self.algorithm == 'PPO':
        if args['decreasing_lr']:
            def lambdalr(epoch):
                return ((float(self.epochs - epoch)) / float(self.epochs)
                        * args['algorithm_parameters']['lr'])
            actor_critic = ActorCriticNetwork(self.envs.observation_space.shape,
                                              self.envs.action_space,
                                              base_kwargs=args['policy_parameters'])
            actor_critic.to(self.device)
            self.policy = actor_critic
            self.agent = PPO(actor_critic, lambdalr, **args['algorithm_parameters'])
            self.scheduler = self.agent.scheduler
        else:
            actor_critic = ActorCriticNetwork(self.envs.observation_space.shape,
                                              self.envs.action_space,
                                              base_kwargs=args['policy_parameters'])
            actor_critic.to(self.device)
            self.policy = actor_critic
            self.agent = PPO(actor_critic, None, **args['algorithm_parameters'])

    self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                   self.envs.observation_space.shape,
                                   self.envs.action_space,
                                   actor_critic.recurrent_hidden_state_size)

    obs = self.envs.reset()
    self.rollouts.obs[0].copy_(obs)
    self.rollouts.to(self.device)

    self.episode_rewards = deque(maxlen=50)

    self.writer = SummaryWriter(
        comment="{}-{}".format(self.env_name, self.algorithm))
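# A hedged sketch of how this __init__ might be driven: the class name Trainer
# and every value below are illustrative placeholders, but the keys mirror what
# the constructor above reads from args.
config = {
    'no_cuda': False,
    'env_name': 'PongNoFrameskip-v4',
    'epochs': 100,
    'num_processes': 8,
    'num_steps': 128,
    'num_test_episodes': 10,
    'test_every_n_epochs': 5,
    'use_deterministic_policy_while_testing': True,
    'grayscale': True,
    'skip_frame': 4,
    'num_frame_stack': 4,
    'num_updates_per_epoch': 100,
    'use_gae': True,
    'gamma': 0.99,
    'tau': 0.95,
    'reward_scaling': 1.0,
    'seed': 1,
    'log_dir': './logs',
    'save_dir': './models',
    'algorithm': 'PPO',
    'decreasing_lr': True,
    'policy_parameters': {},
    'algorithm_parameters': {'lr': 0.00025},
}
trainer = Trainer(**config)  # Trainer is a hypothetical class wrapping the __init__ above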
def __init__(self, in_dims, out_dim, lr=0.000005, gamma=0.99):
    self.gamma = gamma
    self.lr = lr
    self.net = ActorCriticNetwork(in_dims, out_dim, self.lr)
    self.log_prob = None