class A3CAgent(object):
    def __init__(self):
        self.num_state = OBS_SPACE  # observation size
        self.num_actions = NUM_ACTIONS  # number of actions
        self.lr = tf.Variable(3e-4)  # variable used for decaying learning rate
        self.starter_lr = 3e-4  # start value of learning rate

        # optimizer that trains the global network with the gradients of the locals
        # use locking because multiple threads
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.starter_lr,
                                                use_locking=True)

        # the global Actor-Critic network
        self.global_network = Actor_Critic(self.num_actions)
        # prepare the global network - used to construct the network on eager execution
        self.global_network(
            tf.convert_to_tensor(np.random.random((1, 84, 84, 4)),
                                 dtype=tf.float32))

        self.discount_rate = 0.99

    def start_threads(self):
        # max number of episodes
        max_eps = 1e6
        envs = []
        # create one local environment for each thread
        for _ in range(NUM_THREADS):
            _env = gym_super_mario_bros.make(env_name)
            _env = JoypadSpace(_env, SIMPLE_MOVEMENT)
            env = atari_wrapper.wrap_dqn(_env)
            envs.append(env)
        # create the threads and assign each one its environment and exploration rate
        threads = []
        for i in range(NUM_THREADS):
            thread = threading.Thread(
                target=train_thread,
                daemon=True,
                args=(self, max_eps, envs[i],
                      self.discount_rate, self.optimizer, stats,
                      AnnealingVariable(.7, 1e-20, 10000), i))
            threads.append(thread)

        # start the threads
        for t in threads:
            print("STARTING")
            t.start()
            time.sleep(0.5)
        try:
            for t in threads:
                t.join()  # wait for the threads to finish
        except KeyboardInterrupt:
            print("Exiting threads!")

    def save_weights(self):
        print("Saving Weights")
        self.global_network.save_weights("A3CMarioWeights.h5")

    def restore_weights(self):
        print("Restoring Weights!")
        self.global_network.load_weights("A3CMarioWeights.h5")
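Each worker thread above receives an `AnnealingVariable(.7, 1e-20, 10000)` as its exploration rate, and the training loop further down calls `.step()` on it and reads `.value`. The class itself is not shown on this page; the following is a minimal sketch of such a helper, assuming an exponential decay from an initial to a final value over a fixed number of steps (the decay schedule is an assumption, not the original implementation).

class AnnealingVariable(object):
    """Hypothetical sketch: a value annealed from `initial` to `final` over `steps` calls to step()."""

    def __init__(self, initial, final, steps):
        self.value = initial
        self.final = final
        # multiplicative factor chosen so that initial * factor**steps == final (assumed exponential schedule)
        self.factor = (final / initial) ** (1.0 / steps)

    def step(self):
        # decay once, never dropping below the final value
        self.value = max(self.final, self.value * self.factor)
        return self.value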
Example #2
    def __init__(self):
        self.num_actions = NUM_ACTIONS  # number of actions
        self.starter_lr = 1e-4  # start value of learning rate

        # optimizer that trains the global network with the gradients of the locals
        # use locking because multiple threads
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.starter_lr, use_locking=True)
        # the global Actor-Critic network
        self.global_network = Actor_Critic(self.num_actions)
        # prepare the global network - used to construct the network on eager execution
        self.global_network(tf.convert_to_tensor(np.random.random((1, 84, 84, 4)), dtype=tf.float32))
        self.restore_weights()
Example #3
class A3CAgent(object):
    def __init__(self):
        self.num_actions = NUM_ACTIONS  # number of actions
        self.starter_lr = 1e-4  # start value of learning rate

        # optimizer that trains the global network with the gradients of the locals
        # use locking because multiple threads
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.starter_lr,
                                                use_locking=True)
        # the global Actor-Critic network
        self.global_network = Actor_Critic(self.num_actions)
        # prepare the global network - used to construct the network on eager execution
        self.global_network(
            tf.convert_to_tensor(np.random.random((1, 84, 84, 4)),
                                 dtype=tf.float32))
        self.restore_weights()

    def pick_action(self, state, exploration_rate=0.0):
        if np.random.random() < exploration_rate:
            return test_env.action_space.sample()  # pick randomly

        state = np.expand_dims(state, axis=0)
        logits, _ = self.global_network(state)
        probs = tf.nn.softmax(logits)
        action = np.random.choice(self.num_actions, 1, p=probs.numpy()[0])
        return action[0]

    def play(self, env, stats, episodes: int = 100, exploration_rate=0.0):
        rewards_arr = np.zeros(episodes)
        for episode in range(episodes):
            episode_reward = 0
            done = False
            state = env.reset()
            while not done:
                env.render()
                # time.sleep(0.05)
                action = self.pick_action(state, exploration_rate)
                next_state, reward, done, _ = env.step(action)
                episode_reward += reward
                state = next_state
            if callable(stats):
                stats(self, episode_reward)
            rewards_arr[episode] = episode_reward
            print(episode_reward)
        stats.save_stats()
        return rewards_arr

    def restore_weights(self):
        self.global_network.load_weights('A3CPong.h5')
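The `stats` object used in `play` (and the `statistics: Stats` parameter of the training function further down) is expected to be callable with `(agent, episode_reward)` and to expose `save_stats()`. The `Stats` class is not reproduced on this page; below is a minimal sketch of that interface, where the storage format and file name are assumptions for illustration only.

class Stats(object):
    """Hypothetical sketch of the stats helper: callable once per episode, with save_stats()."""

    def __init__(self, path="episode_rewards.csv"):
        self.path = path
        self.episode_rewards = []

    def __call__(self, agent, episode_reward):
        # record the reward of the episode that just finished
        self.episode_rewards.append(episode_reward)

    def save_stats(self):
        # persist the collected rewards to disk
        with open(self.path, "w") as f:
            f.write("\n".join(str(r) for r in self.episode_rewards))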
Example #4
    def __init__(self):
        self.num_state = OBS_SPACE # observation size
        self.num_actions = NUM_ACTIONS # number of actions
        self.lr = tf.Variable(3e-4) # variable used for decaying learning rate
        self.starter_lr = 1e-4 # start value of learning rate

        # optimizer that trains the global network with the gradients of the locals
        # use locking because multiple threads
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.starter_lr, use_locking=True)

        # the global Actor-Critic network
        self.global_network = Actor_Critic(self.num_actions)
        # prepare the global network - used to construct the network on eager execution
        self.global_network(tf.convert_to_tensor(np.random.random((1, 84, 84, 4)), dtype=tf.float32))

        self.discount_rate = 0.99
Example #5
def train_thread(agent, max_eps, env, discount_rate, optimizer,
                 statistics: Stats, exploration_rate: AnnealingVariable,
                 number):

    # create local network and init its weights equal to the global
    local_network = Actor_Critic(env.action_space.n)
    # prepare it (must do this when eager execution is enabled)
    local_network(
        tf.convert_to_tensor(np.random.random((1, 84, 84, 4)),
                             dtype=tf.float32))
    local_network.set_weights(agent.global_network.get_weights())
    # lr_decay_anneal = AnnealingVariable(1e-4, 1e-24, 10e6)
    global episodes  # number of total episodes done for all threads

    # local lists for states, rewards and actions
    states, rewards, actions = [], [], []
    while episodes < max_eps:
        r_per_episode = 0.0
        done = False
        step = 0
        state = env.reset()
        # still training
        while not done and episodes < max_eps:
            exploration_rate.step()  # decay the exploration rate

            # add the observation/state to the state list
            states.append(state)
            # pick an action according to the policy network's probabilities and the exploration rate
            action = pick_action(env, local_network, state,
                                 exploration_rate.value)
            # do the action and observe the next state, reward and if the episode is over
            next_state, reward, done, _ = env.step(action)
            # lr_decay_anneal.step()

            # append the reward experienced in the reward list
            rewards.append(reward)
            # append action taken
            actions.append(action)
            r_per_episode += reward

            step += 1

            # if gathered enough experience or the episode is over -> train on experience gathered
            if step % train_frequency == 0 or done:
                # Gradient tape records the gradient during the evaluation of the loss function
                # -> eager execution MUST be enabled to work
                with tf.GradientTape() as tape:
                    # compute loss for each batch of experience
                    loss = compute_loss_from_batch(local_network, states,
                                                   rewards, actions, done,
                                                   next_state, discount_rate)
                # rewind the tape and get the gradients of the loss
                # for the weights of the local network (Actor-Critic)
                gradients = tape.gradient(loss,
                                          local_network.trainable_weights)
                # used because multiple threads
                lock.acquire()
                # agent.lr.assign(lr_decay_anneal.value)
                # apply the gradients computed from the local network to the global network's weights
                optimizer.apply_gradients(
                    zip(gradients, agent.global_network.trainable_weights))
                # update local network with weights of global
                local_network.set_weights(agent.global_network.get_weights())
                lock.release()
                # empty state, reward, action list
                states, rewards, actions = [], [], []

            state = next_state
        with lock:
            # save stats
            if episodes < max_eps:
                episodes += 1
                statistics(agent, r_per_episode)
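`compute_loss_from_batch` is evaluated inside the gradient tape but its body is not part of this snippet. The sketch below shows what a standard A3C loss over the gathered batch could look like, assuming n-step returns bootstrapped from the critic's value of `next_state`, an advantage-weighted policy-gradient term, a value-regression term, and an entropy bonus; the coefficients and exact tensor handling are assumptions, not the original code (it reuses the same `tf`/`np` imports the snippets already rely on).

def compute_loss_from_batch(network, states, rewards, actions, done,
                            next_state, discount_rate,
                            value_coef=0.5, entropy_coef=0.01):
    # hypothetical sketch of a standard A3C loss, not the original implementation
    # bootstrap the return from the critic unless the episode is over
    if done:
        discounted_return = 0.0
    else:
        _, next_value = network(
            tf.convert_to_tensor(np.expand_dims(next_state, axis=0),
                                 dtype=tf.float32))
        discounted_return = float(next_value.numpy()[0][0])

    # n-step discounted returns, computed backwards through the batch
    returns = []
    for reward in reversed(rewards):
        discounted_return = reward + discount_rate * discounted_return
        returns.append(discounted_return)
    returns = tf.convert_to_tensor(returns[::-1], dtype=tf.float32)

    logits, values = network(
        tf.convert_to_tensor(np.array(states, dtype=np.float32)))
    values = tf.squeeze(values, axis=1)
    advantages = returns - values

    # actor: policy gradient weighted by the advantages (no gradient through them)
    neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.convert_to_tensor(actions, dtype=tf.int32), logits=logits)
    policy_loss = tf.reduce_mean(neg_log_prob * tf.stop_gradient(advantages))

    # critic: regression of the value estimates toward the n-step returns
    value_loss = tf.reduce_mean(tf.square(advantages))

    # entropy bonus to discourage premature convergence of the policy
    probs = tf.nn.softmax(logits)
    entropy = -tf.reduce_mean(
        tf.reduce_sum(probs * tf.math.log(probs + 1e-10), axis=1))

    return policy_loss + value_coef * value_loss - entropy_coef * entropy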
Example #6
              actor_lr=3e-4,
              critic_lr=1e-3,
              train_actor_iters=80,
              train_critic_iters=80,
              lam=0.97,
              max_ep_len=1000,
              target_kl=0.01,
              seed=0)

np.random.seed(config['seed'])
env = gym.make(config['env'])
obs_space = env.observation_space
act_space = env.action_space
obs_size = obs_space.shape
act_size = act_space.shape
ac = Actor_Critic(obs_space, act_space)

local_steps_per_epoch = config['steps_per_epoch']
buf = PPO_Buffer(obs_size, act_size, local_steps_per_epoch, config['gamma'],
                 config['lam'])


def compute_loss_actor(data):
    obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
        'logp']
    pi, logp = ac.actor(obs, act)
    # probability ratio between the new and old policies, and the PPO clipped surrogate objective
    ratio = torch.exp(logp - logp_old)
    clip_adv = torch.clamp(ratio, 1 - config['clip_ratio'],
                           1 + config['clip_ratio']) * adv
    loss_actor = -(torch.min(ratio * adv, clip_adv)).mean()
    return loss_actor
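The companion critic update is not included in this snippet; below is a minimal sketch of the usual PPO value loss under the same `ac`/`data` naming, assuming the buffer also stores the computed returns under `data['ret']` (that key, like the sketch itself, is an assumption).

def compute_loss_critic(data):
    # hypothetical sketch: mean-squared error between the value estimate and the stored return
    obs, ret = data['obs'], data['ret']
    return ((ac.critic(obs) - ret) ** 2).mean()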
Example #7
def main(args):

    # create env
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # follow different logic depending on action space of env
    hidden_size = args.hidden_size

    if args.action_space == "continuous":
        # get env info
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space
        max_action = (env.action_space.high)
        min_action = (env.action_space.low)

        print("number of actions:{0}, dim of states: {1},\
          max_action: {2}, min_action: {3}"                                           .format(action_dim,\
                                                  state_dim,max_action,min_action))

        # create policy
        policy = Actor_Critic(state_dim, hidden_size, action_dim,
                              baseline=args.baseline)

    elif args.action_space == "discrete":
        # get env info
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        print("number of actions: {0}, dim of states: {1},\
        ".format(action_dim, state_dim))

        # create policy
        policy = Actor_Critic_discrete(state_dim, hidden_size, action_dim,
                                       baseline=args.baseline)

    else:
        raise NotImplementedError

    # setup comet_ml to track experiments
    if os.path.isfile("settings.json"):
        with open('settings.json') as f:
            data = json.load(f)
        args.comet_apikey = data["apikey"]
        args.comet_username = data["username"]
    else:
        raise NotImplementedError
    experiment = Experiment(api_key=args.comet_apikey,
                            project_name="simple_policy_gradient",
                            auto_output_logging="None",
                            workspace=args.comet_username,
                            auto_metric_logging=False,
                            auto_param_logging=False)
    experiment.set_name(args.namestr)
    args.experiment = experiment

    # start of the experiment: keep looping until the desired number of episodes is reached
    max_episodes = args.num_episodes
    total_episodes = 0  # number of episodes completed so far

    while total_episodes < max_episodes:

        obs = env.reset()
        done = False
        trajectory = []  # trajectory info for reinforce update
        episode_reward = 0  # keep track of rewards per episode

        while not done:
            action, ln_prob = policy.select_action(np.array(obs))
            next_state, reward, done, _ = env.step(action)
            trajectory.append(
                [np.array(obs), action, ln_prob, reward, next_state, done])

            obs = next_state
            episode_reward += reward

        total_episodes += 1

        # update actor/policy and critic/value_network
        policy_loss, value_loss = policy.train(trajectory)
        experiment.log_metric("value function loss",
                              value_loss,
                              step=total_episodes)

        experiment.log_metric("policy loss", policy_loss, step=total_episodes)
        experiment.log_metric("episode reward",
                              episode_reward,
                              step=total_episodes)

        if total_episodes % 10 == 0:
            evaluate_policy(policy, env)

        env.close()
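`evaluate_policy` is invoked every 10 episodes but is not defined in this snippet; below is a minimal sketch of a typical evaluation loop under the same `policy.select_action` interface (the number of evaluation episodes and the plain `print` reporting are assumptions).

def evaluate_policy(policy, env, eval_episodes=10):
    # hypothetical sketch: roll out a few episodes and report the average return
    returns = []
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        episode_return = 0.0
        while not done:
            action, _ = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            episode_return += reward
        returns.append(episode_return)
    print("Evaluation over {} episodes, average return: {:.2f}".format(
        eval_episodes, np.mean(returns)))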
Example #8
env = gym.envs.make(env_name)
MAX_ACTION = env.action_space.high
MIN_ACTION = env.action_space.low

ob_dim = env.observation_space.sample().shape[0]
ac_dim = env.action_space.sample().shape[0]

# MLP function approximators
pnet = MLP(2 * ac_dim, pnet_hparams)
vnet = MLP(1, vnet_hparams)

# actor and critic networks/training graphs in TF
actor = TF_CPolicy(pnet, ob_dim, ac_dim, hparams=actor_hparams,
                   min_val=MIN_ACTION, max_val=MAX_ACTION)
critic = TF_Value(vnet, ob_dim, hparams=critic_hparams)

# change structure of reward fn for car env
if env_name == "MountainCarContinuous-v0":
    def distance_reward(env, reward):
        return reward - np.abs(env.goal_position - env.state[0])
    reward_fn = distance_reward
else:
    reward_fn = None

# train and run actor critic
ac = Actor_Critic(env, actor, critic, hparams=ac_hparams,
                  reward_fn=reward_fn)
ac.train(video=False)
for _ in range(5):
    ac.do_episode(video=True)