Example #1
class Agent:
    def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2):
        self.gamma = gamma
        self.n_actions = n_actions
        self.action = None
        self.action_space = [i for i in range(self.n_actions)]

        self.actor_critic = ActorCriticNetwork(n_actions=n_actions)
        self.actor_critic.compile(optimizer=Adam(learning_rate=alpha))
Example #2
class Agent(object):
    def __init__(self, in_dims, out_dim, lr=0.000005, gamma=0.99):
        self.gamma = gamma
        self.lr = lr
        self.net = ActorCriticNetwork(in_dims, out_dim, self.lr)
        self.log_prob = None

    def get_action(self, observation):
        tensor = torch.tensor([observation], dtype=torch.float)
        state = tensor.to(self.net.device)
        policy, _ = self.net.forward(state)
        policy = fuc.softmax(policy, dim=1)  # fuc: the torch.nn.functional alias used in this example
        probabilities = torch.distributions.Categorical(policy)
        action = probabilities.sample()
        # keep the log-probability so learn() can form the actor loss
        self.log_prob = probabilities.log_prob(action)
        return action.item()

    def learn(self, state, reward, next_state, done):
        self.net.optimizer.zero_grad()

        tensor = torch.tensor([state], dtype=torch.float)
        state = tensor.to(self.net.device)
        tensor = torch.tensor([next_state], dtype=torch.float)
        next_state = tensor.to(self.net.device)
        tensor = torch.tensor([reward], dtype=torch.float)
        reward = tensor.to(self.net.device)

        _, value = self.net.forward(state)
        _, next_value = self.net.forward(next_state)

        # one-step TD error, used as the advantage estimate
        delta = reward + self.gamma * next_value * (1 - int(done)) - value

        actor_loss = -self.log_prob * delta   # policy-gradient term
        critic_loss = delta**2                # value-regression term

        (actor_loss + critic_loss).backward()
        self.net.optimizer.step()
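
A minimal training-loop sketch for the PyTorch agent above. It assumes the classic Gym reset()/step() API, an environment chosen purely for illustration, and that ActorCriticNetwork (not shown in this listing) takes the flat observation size as in_dims:

import gym

env = gym.make('CartPole-v1')  # illustrative environment
agent = Agent(in_dims=env.observation_space.shape[0],
              out_dim=env.action_space.n)

for episode in range(500):
    observation = env.reset()
    done, score = False, 0
    while not done:
        action = agent.get_action(observation)
        next_observation, reward, done, info = env.step(action)
        agent.learn(observation, reward, next_observation, done)
        observation = next_observation
        score += reward
    print('episode', episode, 'score', score)
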
Example #3
class Agent:
    def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2):
        self.gamma = gamma
        self.n_actions = n_actions
        self.action = None
        self.action_space = [i for i in range(self.n_actions)]

        self.actor_critic = ActorCriticNetwork(n_actions=n_actions)

        self.actor_critic.compile(optimizer=Adam(learning_rate=alpha))

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])
        _, probs = self.actor_critic(state)

        action_probabilities = tfp.distributions.Categorical(probs=probs)
        action = action_probabilities.sample()
        # note: this log_prob is unused; learn() recomputes it inside the GradientTape
        log_prob = action_probabilities.log_prob(action)
        self.action = action

        return action.numpy()[0]

    def save_models(self):
        print('... saving models ...')
        self.actor_critic.save_weights(self.actor_critic.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor_critic.load_weights(self.actor_critic.checkpoint_file)

    def learn(self, state, reward, state_, done):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        state_ = tf.convert_to_tensor([state_], dtype=tf.float32)
        reward = tf.convert_to_tensor(reward,
                                      dtype=tf.float32)  # not fed to NN
        with tf.GradientTape(persistent=True) as tape:
            state_value, probs = self.actor_critic(state)
            state_value_, _ = self.actor_critic(state_)
            # squeeze removes the batch dimension of size 1
            state_value = tf.squeeze(state_value)
            state_value_ = tf.squeeze(state_value_)

            action_probs = tfp.distributions.Categorical(probs=probs)
            log_prob = action_probs.log_prob(self.action)

            delta = reward + self.gamma * state_value_ * (
                1 - int(done)) - state_value
            actor_loss = -log_prob * delta
            critic_loss = delta**2
            total_loss = actor_loss + critic_loss

        gradient = tape.gradient(total_loss,
                                 self.actor_critic.trainable_variables)
        self.actor_critic.optimizer.apply_gradients(
            zip(gradient, self.actor_critic.trainable_variables))
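
Every agent in this listing minimises the same one-step actor-critic objective: the TD error serves both as the advantage weighting the log-probability of the chosen action and as the error the value head regresses on. Written out with descriptive names:

delta       = reward + gamma * next_state_value * (1 - done) - state_value
actor_loss  = -log_prob * delta
critic_loss = delta ** 2
total_loss  = actor_loss + critic_loss
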
Example #4
class Agent:
    def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2):
        self.gamma = gamma
        self.n_actions = n_actions
        self.action = None  # set in choose_action(), read back in learn()
        self.action_space = [i for i in range(self.n_actions)]

        self.actor_critic = ActorCriticNetwork(n_actions=n_actions)
        self.actor_critic.compile(optimizer=Adam(learning_rate=alpha))

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])  # add batch dimension
        _, probs = self.actor_critic(state)

        action_probabilities = tfp.distributions.Categorical(probs=probs)
        action = action_probabilities.sample()
        self.action = action

        return action.numpy()[0]  # remove batch dimension

    def save_model(self):
        print('Saving model.')
        self.actor_critic.save_weights(self.actor_critic.checkpoint_file)

    def load_model(self):
        print('Loading model')
        self.actor_critic.load_weights(self.actor_critic.checkpoint_file)

    def learn(self, state, reward, next_state, done):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        next_state = tf.convert_to_tensor([next_state], dtype=tf.float32)
        reward = tf.convert_to_tensor(reward, dtype=tf.float32)

        with tf.GradientTape() as tape:
            state_value, probs = self.actor_critic(state)
            next_state_value, _ = self.actor_critic(next_state)

            state_value = tf.squeeze(state_value)
            next_state_value = tf.squeeze(next_state_value)

            action_probs = tfp.distributions.Categorical(probs=probs)
            log_prob = action_probs.log_prob(self.action)

            delta = reward + self.gamma * next_state_value * (
                1 - int(done)) - state_value
            actor_loss = -log_prob * delta
            critic_loss = delta**2

            total_loss = actor_loss + critic_loss

        gradient = tape.gradient(total_loss,
                                 self.actor_critic.trainable_variables)
        self.actor_critic.optimizer.apply_gradients(
            zip(gradient, self.actor_critic.trainable_variables))
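
Examples #3 and #4 are near-identical: #3 keeps a persistent GradientTape and exposes save_models/load_models, while #4 uses a plain tape and save_model/load_model. Either drops into the same kind of loop as the PyTorch sketch above. A brief sketch, again with an illustrative CartPole environment and assuming ActorCriticNetwork defines the checkpoint_file attribute the save/load helpers expect:

import gym

env = gym.make('CartPole-v1')
agent = Agent(alpha=0.0003, gamma=0.99, n_actions=env.action_space.n)

best_score = float('-inf')
for episode in range(500):
    observation, done, score = env.reset(), False, 0
    while not done:
        action = agent.choose_action(observation)
        next_observation, reward, done, info = env.step(action)
        agent.learn(observation, reward, next_observation, done)
        observation = next_observation
        score += reward
    if score > best_score:  # keep the best-performing weights
        best_score = score
        agent.save_model()
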
Example #5
    def __init__(self, **args):
        cuda = not args['no_cuda'] and torch.cuda.is_available()
        self.device = torch.device("cuda:0" if cuda else "cpu")
        print("Model running on device: {}".format(self.device))
        torch.set_num_threads(1)

        self.env_name = args['env_name']
        self.epochs = args['epochs']
        self.num_processes = args['num_processes']
        self.num_steps = args['num_steps']
        self.num_test_episodes = args['num_test_episodes']
        self.test_every_n_epochs = args['test_every_n_epochs']
        self.use_deterministic_policy_while_testing = (
            args['use_deterministic_policy_while_testing'])

        self.grayscale = args['grayscale']
        self.skip_frame = args['skip_frame']
        self.num_frame_stack = args['num_frame_stack']

        self.num_updates_per_epoch = args['num_updates_per_epoch']

        self.use_gae = args['use_gae']
        self.gamma = args['gamma']
        self.tau = args['tau']

        self.reward_scaling = args['reward_scaling']

        self.seed = args['seed']
        self.log_dir = args['log_dir']
        self.save_dir = args['save_dir']

        try:
            os.makedirs(args['log_dir'])
            files = glob.glob(os.path.join(args['log_dir'], '*.manifest.json'))
            for f in files:
                os.remove(f)
        except OSError:
            files = glob.glob(os.path.join(args['log_dir'], '*.monitor.csv'))
            for f in files:
                os.remove(f)

        self.eval_log_dir = args['log_dir'] + "_eval"

        try:
            os.makedirs(self.eval_log_dir)
        except OSError:
            files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv'))
            for f in files:
                os.remove(f)

        self.envs = make_vec_envs(self.env_name, self.seed, self.num_processes,
                                  self.gamma, self.log_dir, self.device, False,
                                  self.grayscale, self.skip_frame,
                                  self.reward_scaling,
                                  num_frame_stack=self.num_frame_stack)

        self.algorithm = args['algorithm']
        # Decreasing LR scheduler
        self.scheduler = None

        if self.algorithm == 'A2C':
            actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space,
                                              base_kwargs=args['policy_parameters'])
            actor_critic.to(self.device)
            self.policy = actor_critic
            self.agent = A2C(actor_critic, **args['algorithm_parameters'])

        elif self.algorithm == 'PPO':
            if args['decreasing_lr']:
                # decays linearly from the configured lr to 0 over self.epochs
                def lambdalr(epoch):
                    return (float(self.epochs - epoch) / float(self.epochs)
                            * args['algorithm_parameters']['lr'])
                actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space,
                                                  base_kwargs=args['policy_parameters'])
                actor_critic.to(self.device)
                self.policy = actor_critic
                self.agent = PPO(actor_critic, lambdalr,
                                 **args['algorithm_parameters'])
                self.scheduler = self.agent.scheduler
            else:
                actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space,
                                                  base_kwargs=args['policy_parameters'])
                actor_critic.to(self.device)
                self.policy = actor_critic
                self.agent = PPO(actor_critic, None,
                                 **args['algorithm_parameters'])

        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.envs.observation_space.shape, self.envs.action_space,
                                       actor_critic.recurrent_hidden_state_size)
        obs = self.envs.reset()
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)
        self.episode_rewards = deque(maxlen=50)
        self.writer = SummaryWriter(
            comment="{}-{}".format(self.env_name, self.algorithm))