Example #1
    def init(self):
        import tensorflow as tf
        self.env = self.env_producer.get_new_environment()
        self.s0 = self.env.reset()
        self.session = utils.create_session(self.env_opts, False)
        with tf.device("/cpu:0"):
            with tf.variable_scope("gather-%s" % self.idx):
                pol = get_policy(self.env_opts, self.session)
                self.agent = PPOAgent(pol, self.session,
                                      "gather-%s" % self.idx, self.env_opts)
                self.trainable_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, "gather-%s" % self.idx)
                self.accum_vars = [
                    tf.Variable(tf.zeros_like(tv.initialized_value()),
                                trainable=False) for tv in self.trainable_vars
                ]
                assign_ops = [
                    self.trainable_vars[i].assign(self.accum_vars[i])
                    for i in range(len(self.trainable_vars))
                ]
                self.assign_op = tf.group(assign_ops)
            self.session.run(tf.global_variables_initializer())
            self.cur_hidden_state = self.agent.get_init_hidden_state()
            self.episode = [self.s0], [], [], [], [], [self.cur_hidden_state], []
Example #2
def evaluate(args):
    env = gym.make(args.env)
    env_params = get_env_params(env, args)
    env.close()

    agent = PPOAgent(args, env_params)
    agent.load_model(load_model_remark=args.load_model_remark)

    parent_conn, child_conn = Pipe()
    worker = AtariEnvironment(args.env,
                              1,
                              child_conn,
                              is_render=True,
                              max_episode_step=args.max_episode_step)
    worker.start()

    for i_episode in range(100):
        obs = worker.reset()
        while True:
            obs = np.expand_dims(obs, axis=0)
            action = agent.choose_action(obs / 255)

            parent_conn.send(action[0])
            obs_, r, done, info = parent_conn.recv()

            obs = obs_

            if done:
                break
Example #3
def main():

    device = torch.device("cpu")
    env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = PPOAgent(state_size=state_size,
                     action_size=action_size,
                     hidden_size=256,
                     num_agents=num_agents,
                     random_seed=0,
                     ppo_epochs=4,
                     mini_batch_size=128,
                     normalize_advantages=True,
                     learning_rate=3e-4,
                     clip_gradients=True,
                     gamma=0.99,
                     tau=0.95,
                     device=device)
    agent.load_model('assets/ppo_checkpoint_37.10.pth')
    test_agent(env, brain_name, agent, device, real_time=True)
Example #4
    def load_policy(self, file_path):
        tf.reset_default_graph()
        with tf.Session() as session:
            with tf.variable_scope(MASTER_NAME) as scope:
                policy = get_policy(env_opts, session)
                master_agent = PPOAgent(policy, session, 'master-0', env_opts)
            saver = tf.train.Saver(max_to_keep=1)
            saver.restore(session, tf.train.latest_checkpoint(file_path))
Example #5
    def init_agent(self):
        import tensorflow as tf
        env_opts = environments.get_env_options(
            self.env_name, self.env_producer.get_use_gpu())
        self.session = utils.create_session(env_opts, True)
        with tf.variable_scope("worker-%s" % self.idx):
            pol = get_policy(env_opts, self.session)
            self.agent = PPOAgent(pol, self.session, "worker-%s" % self.idx,
                                  env_opts)
            self.trainable_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "worker-%s" % self.idx)
            self.accum_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in self.trainable_vars
            ]
            p_vars = self.agent.p_opt.variables()
            v_vars = self.agent.v_opt.variables()
            self.p_opt_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in p_vars
            ]
            self.v_opt_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in v_vars
            ]
            p_assign_ops = [
                p_vars[i].assign(self.p_opt_vars[i])
                for i in range(len(p_vars))
            ]
            v_assign_ops = [
                v_vars[i].assign(self.v_opt_vars[i])
                for i in range(len(v_vars))
            ]

            assign_ops = [
                self.trainable_vars[i].assign(self.accum_vars[i])
                for i in range(len(self.trainable_vars))
            ]
            self.assign_op = tf.group(assign_ops + p_assign_ops + v_assign_ops)

        self.session.run(tf.global_variables_initializer())
        self.run()
Example #6
def start(env):
    env = gym.make(env)
    frames = []
    MASTER_NAME = "master-0"
    IMAGE_PATH = "images/%s.gif" % env.spec.id
    tf.reset_default_graph()

    with tf.Session() as session:
        with tf.variable_scope(MASTER_NAME) as scope:
            env_opts = environments.get_env_options(env, False)
            policy = get_policy(env_opts, session)
            master_agent = PPOAgent(policy, session, MASTER_NAME, env_opts)

        saver = tf.train.Saver(max_to_keep=1)
        try:
            saver = tf.train.import_meta_graph(
                tf.train.latest_checkpoint("models/%s/" % env.spec.id) + ".meta")
            saver.restore(session,
                          tf.train.latest_checkpoint("models/%s/" % env.spec.id))
        except Exception:
            print("Failed to restore model, starting from scratch")
            session.run(tf.global_variables_initializer())

        global_step = 0
        while global_step < 1000:
            terminal = False
            s0 = env.reset()
            cum_rew = 0
            cur_hidden_state = master_agent.get_init_hidden_state()
            episode_count = 0
            while not terminal:
                episode_count += 1
                frames.append(env.render(mode='rgb_array'))
                action, h_out = master_agent.get_strict_sample(
                    s0, cur_hidden_state)
                cur_hidden_state = h_out
                s0, r, terminal, _ = env.step(action)
                cum_rew += r
                global_step += 1
            print(episode_count, cum_rew)
        imageio.mimsave(IMAGE_PATH, frames, duration=1.0 / 60.0)
Example #7
def ppo(env, brain_name, policy, config, train):
    if train:
        optimizer = optim.Adam(
            policy.parameters(),
            config['hyperparameters']['adam_learning_rate'],
            eps=config['hyperparameters']['adam_epsilon'])
        agent = PPOAgent(env, brain_name, policy, optimizer, config)
        all_scores = []
        averages = []
        last_max = 30.0

        for i in tqdm.tqdm(range(config['hyperparameters']['episode_count'])):
            agent.step()
            last_mean_reward = play_round(env, brain_name, policy, config)
            if i == 0:
                last_average = last_mean_reward
            else:
                last_average = np.mean(np.array(
                    all_scores[-100:])) if len(all_scores) > 100 else np.mean(
                        np.array(all_scores))

            all_scores.append(last_mean_reward)
            averages.append(last_average)
            if last_average > last_max:
                torch.save(
                    policy.state_dict(),
                    f"reacher-ppo/models/ppo-max-hiddensize-{config['hyperparameters']['hidden_size']}.pth"
                )
                last_max = last_average
            clear_output(True)
            print(
                'Episode: {} Total score this episode: {} Last {} average: {}'.
                format(i + 1, last_mean_reward, min(i + 1, 100), last_average))
        return all_scores, averages
    else:
        all_scores = []
        for i in range(20):
            score = play_round(env, brain_name, policy, config, train)
            all_scores.append(score)
        return all_scores, [np.mean(all_scores)]
Example #8
def start(env):
    env = gym.make(env)

    MASTER_NAME = "master-0"

    tf.reset_default_graph()

    with tf.Session() as session:
        with tf.variable_scope(MASTER_NAME) as scope:
            env_opts = environments.get_env_options(env, False)
            policy = get_policy(env_opts, session)
            master_agent = PPOAgent(policy, session, MASTER_NAME, env_opts)

        saver = tf.train.Saver(max_to_keep=1)
        try:
            saver = tf.train.import_meta_graph(tf.train.latest_checkpoint("models/%s/" % env.spec.id) + ".meta")
            saver.restore(session, tf.train.latest_checkpoint("models/%s/" % env.spec.id))
        except Exception:
            print("Failed to restore model, starting from scratch")
            session.run(tf.global_variables_initializer())


        while True:
            terminal = False
            s0 = env.reset()
            cum_rew = 0
            cur_hidden_state = master_agent.get_init_hidden_state()
            episode_count = 0
            while not terminal:
                episode_count += 1
                env.render()
                action, h_out = master_agent.get_strict_sample(s0, cur_hidden_state)
                cur_hidden_state = h_out
                s0, r, terminal, _ = env.step(action)
                cum_rew += r
            print(episode_count, cum_rew)
Example #9
def main(args):

    model_store_sprefix = "snapshot"

    # NormalizedEnv
    env = gym.make(args.env)

    env.seed(args.seed)
    torch.manual_seed(args.seed)

    env, generator, model, cont = get_functions(env, args)

    optimizer = optim.Adam(model.parameters(), lr=args.rllr)

    memory = Memory(args)

    agent = PPOAgent(args, model, optimizer, env, generator, memory, cont)
    if args.resume:
        agent.load_model(model_store_sprefix)

    agent.train(model_store_sprefix, args.save_interval)
Example #10
    state_size = list(states[0][0].transpose(2, 0, 1).shape)
    state_size[0] *= NUM_CONSEQ_FRAMES

    # create policy
    policy = ActorCritic(state_size, action_size,
                         model_path=ckpt_path).to(device)

    trajectory_collector = TrajectoryCollector(
        env,
        policy,
        num_agents,
        is_visual=True,
        visual_state_size=NUM_CONSEQ_FRAMES,
        is_training=False)

    agent = PPOAgent(policy)

    state = trajectory_collector.last_states
    is_random_run = [0, 1, 2]

    for is_random in is_random_run:
        print(f"Staring {'' if is_random else 'non' } random run...")
        total_rewards = []
        avg_episode_length = 0
        episode_lengths = []
        for i_run in range(NUM_RUNS):
            sum_reward = 0
            ep = 0
            while True:
                ep += 1
                if is_random == 1:
Example #11
class GatheringWorker:
    def __init__(self, idx, env_producer, env_opts, rollout_size, worker_queue,
                 weights_queue):
        self.session = None
        self.idx = idx
        self.env_producer = env_producer
        self.env = None
        self.s0 = None
        self.trainable_vars = None
        self.agent = None
        self.env_opts = env_opts
        self.cur_hidden_state = None
        self.episode = None
        self.episodes = []
        self.batch_size = env_opts["batch_size"]
        self.terminal = False
        self.recurrent_policy = env_opts["recurrent"]
        self.timestep_size = env_opts["timestep_size"]
        if not self.recurrent_policy:
            self.timestep_size = 1
        self.discount_factor = env_opts["discount_factor"]
        self.gae_factor = env_opts["gae_factor"]
        self.max_episode_steps = env_opts["max_episode_steps"]
        self.rollout_size = rollout_size
        self.discrete_env = env_opts["discrete"]
        self.ep_count = 0
        self.episode_step = 0
        self.cum_rew = 0
        self.global_step = 0
        self.sampled_action = None
        self.sampled_a_prob = None
        self.accum_vars = None
        self.assign_op = None
        self.worker_queue = worker_queue
        self.weights_queue = weights_queue
        self.stats = []
        self.get_experience()

    def get_experience(self):
        self.init()
        action, a_prob, h_out, v_out = self.agent.get_sample(
            self.s0, self.cur_hidden_state)
        self.sampled_action = action
        self.sampled_a_prob = a_prob
        while True:
            self.stats = []
            self.apply_weights()
            self.episodes = []
            for i in range(self.rollout_size):
                if self.terminal:
                    if self.episode_step == self.max_episode_steps and len(
                            self.episode[1]) > 0:
                        self.episode[4][-1] = False
                    self.episode_step = 0
                    self.s0 = self.env.reset()
                    self.episodes.append(self.episode)
                    self.cur_hidden_state = self.agent.get_init_hidden_state()
                    self.episode = [self.s0], [], [], [], [], [self.cur_hidden_state], []
                    self.stats.append({
                        "reward": self.cum_rew,
                        "step": self.ep_count,
                        "a_probs": self.sampled_a_prob,
                        "picked_a": self.sampled_action,
                        "a_dim": self.env_opts["action_dim"],
                        "discrete": self.env_opts["discrete"]
                    })
                    self.terminal = False
                    self.ep_count += 1
                    self.cum_rew = 0

                action, a_prob, h_out, v_out = self.agent.get_sample(
                    self.s0, self.cur_hidden_state)
                self.episode_step += 1
                self.global_step += 1
                if np.random.random() > 0.99:
                    self.sampled_action = action
                    self.sampled_a_prob = a_prob
                self.cur_hidden_state = h_out
                self.s0, r, self.terminal, _ = self.env.step(action)
                self.cum_rew += r
                self.episode[0].append(self.s0)
                self.episode[1].append(self.agent.transform_reward(r))
                self.episode[2].append(action)
                self.episode[3].append(a_prob)
                self.episode[4].append(self.terminal)
                self.episode[5].append(h_out)
                self.episode[6].append(v_out)
            self.episodes.append(self.episode)
            self.episode = [self.s0], [], [], [], [], [self.cur_hidden_state], []
            result = self.process_episodes(self.episodes)
            self.worker_queue.put(result)

    def apply_weights(self):
        weights = self.weights_queue.get()
        feed_dict = {}
        for i, t in enumerate(self.accum_vars):
            feed_dict[t] = weights[i]
        self.session.run(self.assign_op, feed_dict=feed_dict)

    def init(self):
        import tensorflow as tf
        self.env = self.env_producer.get_new_environment()
        self.s0 = self.env.reset()
        self.session = utils.create_session(self.env_opts, False)
        with tf.device("/cpu:0"):
            with tf.variable_scope("gather-%s" % self.idx):
                pol = get_policy(self.env_opts, self.session)
                self.agent = PPOAgent(pol, self.session,
                                      "gather-%s" % self.idx, self.env_opts)
                self.trainable_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, "gather-%s" % self.idx)
                self.accum_vars = [
                    tf.Variable(tf.zeros_like(tv.initialized_value()),
                                trainable=False) for tv in self.trainable_vars
                ]
                assign_ops = [
                    self.trainable_vars[i].assign(self.accum_vars[i])
                    for i in range(len(self.trainable_vars))
                ]
                self.assign_op = tf.group(assign_ops)
            self.session.run(tf.global_variables_initializer())
            self.cur_hidden_state = self.agent.get_init_hidden_state()
            self.episode = [self.s0], [], [], [], [], [self.cur_hidden_state], []

    def process_episodes(self, episodes):
        all_states = []
        all_advantages = []
        all_returns = []
        all_picked_actions = []
        all_old_actions_probs = []
        all_pred_values = []
        all_hidden_states = []

        for episode in episodes:
            st, rewards, picked_actions, old_action_probs, terminals, hidden_states, values = episode
            if len(rewards) == 0:
                continue
            states = np.asarray(st)
            pred_values = np.zeros(len(values) + 1)
            pred_values[:-1] = np.array(values)
            episode_len = len(rewards)
            advantages = np.zeros((episode_len, ))
            returns = np.zeros((episode_len + 1, ))
            if terminals[-1]:
                pred_values[-1] = 0
            else:
                _, _, _, v_out = self.agent.get_sample(states[-1],
                                                       hidden_states[-1])
                pred_values[-1] = v_out
            returns[-1] = pred_values[-1]
            for i in reversed(range(episode_len)):
                r = rewards[i]
                next_v = pred_values[i + 1]
                cur_v = pred_values[i]
                diff = r + self.discount_factor * next_v - cur_v
                if i == episode_len - 1:
                    advantages[i] = diff
                else:
                    advantages[i] = (diff + self.discount_factor *
                                     self.gae_factor * advantages[i + 1])
                returns[i] = r + self.discount_factor * returns[i + 1]
            returns = returns[:-1]

            ep_states = states[:-1]
            ep_advantages = advantages
            ep_returns = returns
            ep_picked_actions = np.array(picked_actions)
            ep_old_action_probs = np.array(old_action_probs)
            ep_all_pred_values = pred_values
            ep_hidden_state = np.array(hidden_states[:-1])
            splitted = utils.split_episode(ep_states, ep_advantages,
                                           ep_returns, ep_picked_actions,
                                           ep_old_action_probs,
                                           ep_all_pred_values, ep_hidden_state,
                                           self.timestep_size)
            for b_states, b_hidden_state, b_advantages, b_returns, b_picked_actions, b_old_action_probs, b_all_pred_values in splitted:
                all_states.append(b_states)
                all_advantages.append(b_advantages)
                all_returns.append(b_returns)
                all_picked_actions.append(b_picked_actions)
                all_old_actions_probs.append(b_old_action_probs)
                all_pred_values.append(b_all_pred_values)
                all_hidden_states.append(b_hidden_state)

        all_states = np.array(all_states)
        all_advantages = np.array(all_advantages)
        all_picked_actions = np.array(all_picked_actions)
        all_returns = np.array(all_returns)
        all_old_actions_probs = np.array(all_old_actions_probs)
        all_pred_values = np.array(all_pred_values)
        all_hidden_states = np.array(all_hidden_states)

        return [
            all_states, all_advantages, all_picked_actions, all_returns,
            all_old_actions_probs, all_pred_values, all_hidden_states,
            self.ep_count, self.stats, self.idx
        ]
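
Note: the backward pass in process_episodes above interleaves generalized advantage estimation (GAE) with the discounted-return computation. The same recursion can be written as a small standalone helper; this is a minimal sketch with illustrative names, not code from the project:

import numpy as np

def gae_advantages_and_returns(rewards, values, bootstrap_value,
                               discount_factor=0.99, gae_factor=0.95):
    # values[i] is the critic's estimate for the i-th visited state;
    # bootstrap_value estimates the state after the last step (0 if terminal).
    episode_len = len(rewards)
    pred_values = np.append(np.asarray(values, dtype=np.float64), bootstrap_value)
    advantages = np.zeros(episode_len)
    returns = np.zeros(episode_len + 1)
    returns[-1] = bootstrap_value
    for i in reversed(range(episode_len)):
        # One-step TD error: r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[i] + discount_factor * pred_values[i + 1] - pred_values[i]
        if i == episode_len - 1:
            advantages[i] = delta
        else:
            advantages[i] = delta + discount_factor * gae_factor * advantages[i + 1]
        returns[i] = rewards[i] + discount_factor * returns[i + 1]
    return advantages, returns[:-1]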
Example #12
    state_size = list(states[0][0].transpose(2, 0, 1).shape)
    state_size[0] *= NUM_CONSEQ_FRAMES
    
    # torch.manual_seed(SEED)
    # np.random.seed(SEED)

    # create policy to be trained & optimizer
    policy = ActorCritic(state_size, action_size).to(device)

    writer = tensorboardX.SummaryWriter(comment=f"-ejik")
    
    trajectory_collector = TrajectoryCollector(env, policy, num_agents, tmax=TMAX, gamma=GAMMA, gae_lambda=GAE_LAMBDA, debug=debug, is_visual=True, visual_state_size=NUM_CONSEQ_FRAMES)

    tb_tracker = TBMeanTracker(writer, EPOCHS)

    agent = PPOAgent(policy, tb_tracker, LR, EPSILON, BETA)
    
    #scheduler = lr_scheduler.LambdaLR(agent.optimizer, lambda ep: 0.1 if ep == STEP_DECAY else 1)
    scheduler = lr_scheduler.StepLR(agent.optimizer, step_size=STEP_DECAY, gamma=GAMMA)
    n_episodes = 0
    max_score = - np.Inf

    traj_attributes = ["states", "actions", "log_probs", "advantages", "returns"]
    solved = False
    start = None
    step = 0

    with RewardTracker(writer, mean_window=AVG_WIN, print_every=AVG_WIN // 2) as reward_tracker:
        d = datetime.datetime.today()

        print(f"Started training run: at {d.strftime('%d-%m-%Y %H:%M:%S')}")
Example #13
class Worker:
    def __init__(self, env_producer, idx, env_opts, num_gather_workers,
                 master_weights_in_queue, master_weights_out_queue):
        self.env_opts = env_opts
        self.num_gather_workers = num_gather_workers
        self.env_producer = env_producer
        self.batch_size = env_opts["batch_size"]
        self.clip_eps = env_opts["clip_eps"]
        self.grad_step = env_opts["grad_step"]
        self.epochs = env_opts["epochs"]
        self.entropy_coef = env_opts["entropy_coef"]
        self.state_dim = env_opts["state_dim"]
        self.idx = idx
        self.session = None
        self.episode_step = 0
        self.initialized = False
        self.beta = self.env_opts["init_beta"]
        self.eta = self.env_opts["eta"]
        self.kl_target = self.env_opts["kl_target"]
        self.use_kl_loss = self.env_opts["use_kl_loss"]
        self.lr_multiplier = 1.0
        self.prev_batch = None
        self.variables_file_path = "models/%s/variables.txt" % env_opts[
            "env_name"]
        self.worker_queue = Queue()
        self.weights_queues = [Queue() for _ in range(self.num_gather_workers)]
        self.master_weights_in_queue = master_weights_in_queue
        self.master_weights_out_queue = master_weights_out_queue
        self.init_workers()
        self.agent = None
        self.trainable_vars = None
        self.accum_vars = None
        self.assign_op = None
        self.p_opt_vars = None
        self.v_opt_vars = None
        self.init_agent()

    def init_agent(self):
        import tensorflow as tf
        self.session = utils.create_session(self.env_opts, True)
        with tf.variable_scope("worker-%s" % self.idx):
            pol = get_policy(self.env_opts, self.session)
            self.agent = PPOAgent(pol, self.session, "worker-%s" % self.idx,
                                  self.env_opts)
            self.trainable_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "worker-%s" % self.idx)
            self.accum_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in self.trainable_vars
            ]
            p_vars = self.agent.p_opt.variables()
            v_vars = self.agent.v_opt.variables()
            self.p_opt_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in p_vars
            ]
            self.v_opt_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in v_vars
            ]
            p_assign_ops = [
                p_vars[i].assign(self.p_opt_vars[i])
                for i in range(len(p_vars))
            ]
            v_assign_ops = [
                v_vars[i].assign(self.v_opt_vars[i])
                for i in range(len(v_vars))
            ]

            assign_ops = [
                self.trainable_vars[i].assign(self.accum_vars[i])
                for i in range(len(self.trainable_vars))
            ]
            self.assign_op = tf.group(assign_ops + p_assign_ops + v_assign_ops)

        self.session.run(tf.global_variables_initializer())
        self.run()

    def init_workers(self):
        for i in range(self.num_gather_workers):
            rollout_size = self.env_opts[
                "rollout_size"] // self.num_gather_workers
            t = Process(target=make_worker,
                        args=(i, self.env_producer, self.env_opts,
                              self.worker_queue, self.weights_queues[i],
                              rollout_size))
            t.start()

    def run(self):
        while True:
            self.apply_shared_variables()
            self.apply_weights_to_gather_workers()
            stats = self.compute_grads_and_stats()
            self.send_to_master(stats)

    def send_to_master(self, stats):
        weights, p_opt_weights, v_opt_weights = self.session.run([
            self.trainable_vars,
            self.agent.p_opt.variables(),
            self.agent.v_opt.variables()
        ])
        arr = [
            self.beta, self.lr_multiplier, p_opt_weights, v_opt_weights,
            weights, stats
        ]
        self.master_weights_out_queue.put(arr)

    def apply_weights_to_gather_workers(self):
        weights = self.session.run(self.trainable_vars)
        for q in self.weights_queues:
            q.put(weights)

    def apply_shared_variables(self):
        beta, lr_multiplier, p_opt_weights, v_opt_weights, weights = \
            self.master_weights_in_queue.get()
        self.beta = beta
        self.lr_multiplier = lr_multiplier
        fd = {}
        for i, t in enumerate(self.accum_vars):
            fd[t] = weights[i]
        for i, t in enumerate(self.p_opt_vars):
            fd[t] = p_opt_weights[i]
        for i, t in enumerate(self.v_opt_vars):
            fd[t] = v_opt_weights[i]
        self.session.run(self.assign_op, feed_dict=fd)

    def compute_grads_and_stats(self):
        results = []
        for i in range(self.num_gather_workers):
            results.append(self.worker_queue.get())
        w_idx = list(range(self.num_gather_workers))
        cur_all_states = np.concatenate([results[i][0] for i in w_idx], axis=0)
        cur_all_advantages = np.concatenate([results[i][1] for i in w_idx],
                                            axis=0)
        cur_all_picked_actions = np.concatenate([results[i][2] for i in w_idx],
                                                axis=0)
        cur_all_returns = np.concatenate([results[i][3] for i in w_idx],
                                         axis=0)
        cur_all_old_actions_probs = np.concatenate(
            [results[i][4] for i in w_idx], axis=0)
        cur_all_pred_values = np.concatenate([results[i][5] for i in w_idx],
                                             axis=0)
        cur_all_hidden_states = np.concatenate([results[i][6] for i in w_idx],
                                               axis=0)

        if self.prev_batch is not None:
            prev_all_states, prev_all_advantages, prev_all_picked_actions, prev_all_returns, \
                prev_all_old_actions_probs, prev_all_pred_values, prev_all_hidden_states = self.prev_batch
            all_states = np.concatenate([cur_all_states, prev_all_states],
                                        axis=0)
            all_advantages = np.concatenate(
                [cur_all_advantages, prev_all_advantages], axis=0)
            all_picked_actions = np.concatenate(
                [cur_all_picked_actions, prev_all_picked_actions], axis=0)
            all_returns = np.concatenate([cur_all_returns, prev_all_returns],
                                         axis=0)
            all_old_actions_probs = np.concatenate(
                [cur_all_old_actions_probs, prev_all_old_actions_probs],
                axis=0)
            all_pred_values = np.concatenate(
                [cur_all_pred_values, prev_all_pred_values], axis=0)
            all_hidden_states = np.concatenate(
                [cur_all_hidden_states, prev_all_hidden_states], axis=0)
        else:
            all_states = cur_all_states
            all_advantages = cur_all_advantages
            all_picked_actions = cur_all_picked_actions
            all_returns = cur_all_returns
            all_old_actions_probs = cur_all_old_actions_probs
            all_pred_values = cur_all_pred_values
            all_hidden_states = cur_all_hidden_states

        self.prev_batch = [
            cur_all_states, cur_all_advantages, cur_all_picked_actions,
            cur_all_returns, cur_all_old_actions_probs, cur_all_pred_values,
            cur_all_hidden_states
        ]

        all_advantages = (all_advantages - all_advantages.mean()) / (max(
            all_advantages.std(), 1e-4))

        first_gather = [x for x in results if x[9] == 0][0]

        self.episode_step = first_gather[7]
        stats = first_gather[8]

        sz = len(all_states)
        n_batches = (sz - 1) // self.batch_size + 1
        steps = 0
        cur_kl = 0
        entropy = 0
        hinge = 0
        src_policy_loss = 0
        vloss = 0
        ploss = 0
        for cur_epoch in range(self.epochs):
            idx = np.arange(len(all_states))
            np.random.shuffle(idx)
            all_states = all_states[idx]
            all_returns = all_returns[idx]
            all_picked_actions = all_picked_actions[idx]
            all_old_actions_probs = all_old_actions_probs[idx]
            all_advantages = all_advantages[idx]
            all_pred_values = all_pred_values[idx]
            all_hidden_states = all_hidden_states[idx]
            for b in range(n_batches):
                start = b * self.batch_size
                end = min(sz, (b + 1) * self.batch_size)
                states_b = all_states[start:end]
                returns_b = all_returns[start:end]
                picked_actions_b = all_picked_actions[start:end]
                old_action_probs_b = all_old_actions_probs[start:end]
                advantages_b = all_advantages[start:end]
                hidden_states_b = all_hidden_states[start:end]
                old_values_b = all_pred_values[start:end]
                cur_kl, entropy, hinge, src_policy_loss, vloss, ploss = \
                    self.agent.train(states_b,
                                     advantages_b,
                                     returns_b,
                                     picked_actions_b,
                                     old_action_probs_b,
                                     hidden_states_b,
                                     old_values_b,
                                     self.clip_eps,
                                     self.beta,
                                     self.eta,
                                     self.grad_step * self.lr_multiplier)
                steps += 1
            if cur_kl > self.kl_target * 4 and self.use_kl_loss:
                break

        if self.use_kl_loss:
            if cur_kl > self.kl_target * 2:
                self.beta = np.minimum(35, 1.5 * self.beta)
                if self.beta > 30.0:
                    self.lr_multiplier /= 1.5
            elif cur_kl < self.kl_target / 2:
                self.beta = np.maximum(1 / 35, self.beta / 1.5)
                if self.beta <= 1 / 30.0:
                    self.lr_multiplier *= 1.5
            self.lr_multiplier = max(min(self.lr_multiplier, 3.0), 0.1)

        train_stats = {
            "stats": stats,
            "kl": cur_kl,
            "entropy": entropy,
            "hinge": hinge,
            "src_policy_loss": src_policy_loss,
            "vloss": vloss,
            "ploss": ploss,
            "lr_multiplier": self.lr_multiplier,
            "beta": self.beta,
            "step": self.episode_step,
            "idx": self.idx
        }
        return train_stats
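
Note: the adaptive KL-penalty schedule at the end of compute_grads_and_stats can be isolated for clarity. This is a minimal sketch that mirrors the constants used above; the function name is illustrative and not part of the project:

def adapt_kl_penalty(cur_kl, kl_target, beta, lr_multiplier):
    # Grow the KL penalty (and eventually shrink the learning-rate multiplier)
    # when the new policy drifted too far; relax both when it barely moved.
    if cur_kl > kl_target * 2:
        beta = min(35.0, 1.5 * beta)
        if beta > 30.0:
            lr_multiplier /= 1.5
    elif cur_kl < kl_target / 2:
        beta = max(1.0 / 35.0, beta / 1.5)
        if beta <= 1.0 / 30.0:
            lr_multiplier *= 1.5
    lr_multiplier = max(min(lr_multiplier, 3.0), 0.1)
    return beta, lr_multiplier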
Example #14
config.gradient_clip = 5
config.rollout_length = 20 * 512
config.optimization_epochs = 10
config.num_mini_batches = 512
config.ppo_ratio_clip = 0.2
config.log_interval = 3 * 200 * 512
config.max_steps = 2e7
config.eval_episodes = 10
# config.logger = get_logger()

select_device(0)

print("GPU available: {}".format(torch.cuda.is_available()))
print("GPU tensor test: {}".format(torch.rand(3, 3).cuda()))

agent = PPOAgent(config)

random_seed()
config = agent.config

agent.actor_critic.load_state_dict(
    torch.load('../checkpoints/ppo_checkpoint.pth'))

score = 0  # initialize the score

for i in range(3):
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    for j in range(2000):
        action = agent.act(state)
        env_info = env.step(action.cpu().detach().numpy())[brain_name]
Example #15
def train(args):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    envs = MultiprocessEnvironment.create_mario_env(num_envs=args.jobs,
                                                    world=args.world,
                                                    stage=args.stage)
    actor_critic = RecurrentPolicy(state_frame_channels=envs.observation_shape[0],
                                   action_space_size=envs.action_space_size,
                                   hidden_layer_size=args.hidden_size,
                                   prev_actions_out_size=args.prev_actions_hidden_size,
                                   recurrent_hidden_size=args.recurrent_hidden_size,
                                   device=device)
    experience = ExperienceStorage(num_steps=args.steps_per_update,
                                   num_envs=args.jobs,
                                   observation_shape=envs.observation_shape,
                                   recurrent_hidden_size=args.recurrent_hidden_size,
                                   device=device)

    initial_observations = envs.reset()
    experience.insert_initial_observations(initial_observations)

    tb_writer = SummaryWriter()

    num_updates = args.steps // (args.jobs * args.steps_per_update)
    agent = PPOAgent(actor_critic,
                     lr=args.lr,
                     lr_lambda=lambda step: 1 - (step / float(num_updates)),
                     policy_loss_coef=args.policy_loss_coef,
                     value_loss_coef=args.value_loss_coef,
                     entropy_loss_coef=args.entropy_loss_coef,
                     max_grad_norm=args.max_grad_norm,
                     clip_threshold=args.ppo_clip_threshold,
                     epochs=args.ppo_epochs,
                     minibatches=args.ppo_minibatches)

    for update_step in tqdm(range(num_updates)):
        episode_rewards = []
        for step in range(args.steps_per_update):
            with torch.no_grad():
                actor_input = experience.get_actor_input(step)
                (values,
                 actions,
                 action_log_probs,
                 _,  # Action distribution entropy is not needed.
                 recurrent_hidden_states) = actor_critic.act(*actor_input)

            observations, rewards, done_values, info_dicts = envs.step(actions)
            masks = 1 - done_values
            experience.insert(observations,
                              actions,
                              action_log_probs,
                              rewards,
                              values,
                              masks,
                              recurrent_hidden_states)

            for done, info in zip(done_values, info_dicts):
                if done:
                    level_completed_percentage = info['x_pos'] / MAX_X
                    episode_rewards.append(level_completed_percentage)

        with torch.no_grad():
            critic_input = experience.get_critic_input()
            next_value = actor_critic.value(*critic_input)

        experience.compute_gae_returns(next_value,
                                       gamma=args.discount,
                                       gae_lambda=args.gae_lambda)

        losses = agent.update(experience)

        if episode_rewards:
            with torch.no_grad():
                cumulative_reward = experience.rewards.sum((0, 2))
                mean_reward = cumulative_reward.mean()
                std_reward = cumulative_reward.std()

            tb_writer.add_scalar('mario/lr', agent.current_lr(), update_step)
            tb_writer.add_scalars('mario/level_progress', {
                'min': np.min(episode_rewards),
                'max': np.max(episode_rewards),
                'mean': np.mean(episode_rewards),
                'median': np.median(episode_rewards),
            }, update_step)

            tb_writer.add_scalars('mario/reward', {'mean': mean_reward,
                                                   'std': std_reward}, update_step)
            tb_writer.add_scalars('mario/loss', {
                'policy': losses['policy_loss'],
                'value': losses['value_loss'],
            }, update_step)
            tb_writer.add_scalar('mario/action_dist_entropy',
                                 losses['action_dist_entropy'],
                                 update_step)

            if np.min(episode_rewards) == 1.0:
                model_path = 'models/super_model_{}.bin'.format(update_step + 1)
                torch.save(actor_critic.state_dict(), model_path)

        save_model = (update_step % args.save_interval) == (args.save_interval - 1)
        if save_model:
            model_path = 'models/model_{}.bin'.format(update_step + 1)
            torch.save(actor_critic.state_dict(), model_path)

    tb_writer.close()
Example #16
    def start(self):
        import tensorflow as tf
        self.summary_writer = tf.summary.FileWriter("logs/%s" %
                                                    self.env_opts["env_name"])
        self.session = utils.create_session(self.env_opts, True)
        with tf.variable_scope("master-0"):
            pol = get_policy(self.env_opts, self.session)
            self.agent = PPOAgent(pol, self.session, "master-0", self.env_opts)
            self.trainable_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "master-0")
            self.accum_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in self.trainable_vars
            ]
            p_vars = self.agent.p_opt.variables()
            v_vars = self.agent.v_opt.variables()
            self.p_opt_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in p_vars
            ]
            self.v_opt_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in v_vars
            ]
            p_assign_ops = [
                p_vars[i].assign(self.p_opt_vars[i])
                for i in range(len(p_vars))
            ]
            v_assign_ops = [
                v_vars[i].assign(self.v_opt_vars[i])
                for i in range(len(v_vars))
            ]

            assign_ops = [
                self.trainable_vars[i].assign(self.accum_vars[i])
                for i in range(len(self.trainable_vars))
            ]
            self.assign_op = tf.group(assign_ops + p_assign_ops + v_assign_ops)

        self.restore_variables()
        self.saver = tf.train.Saver(max_to_keep=1)
        self.session.run(tf.global_variables_initializer())
        try:
            self.saver = tf.train.import_meta_graph(
                tf.train.latest_checkpoint("models/%s/" %
                                           self.env_opts["env_name"]) +
                ".meta")
            self.saver.restore(
                self.session,
                tf.train.latest_checkpoint("models/%s/" %
                                           self.env_opts["env_name"]))
        except:
            print("failed to restore model")

        while True:
            if self.iter_count % 10 == 0:
                print("Saving model...")
                self.save_variables()
                self.saver.save(self.session, self.model_path, self.iter_count)
                print("Model saved")
            self.broadcast_weights()
            self.merge_weights()
            self.iter_count += 1
Example #17
def ppo():
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    env = UnityEnvironment(file_name="../Reacher_Linux/Reacher.x86_64",
                           no_graphics=True)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    print('States look like:', state)
    state_size = len(state)
    print('States have length:', state_size)

    config = Config()
    config.env = env

    config.actor_critic_fn = lambda: ActorCritic(
        actor=Actor(state_size, action_size), critic=Critic(state_size))

    config.discount = 0.99
    config.use_gae = True
    config.gae_tau = 0.95
    config.gradient_clip = 5
    config.rollout_length = 2048
    config.optimization_epochs = 5
    config.num_mini_batches = 512
    config.ppo_ratio_clip = 0.2
    config.log_interval = 10 * 2048
    config.max_steps = 2e7
    config.eval_episodes = 10
    # config.logger = get_logger()

    print("GPU available: {}".format(torch.cuda.is_available()))
    print("GPU tensor test: {}".format(torch.rand(3, 3).cuda()))

    agent = PPOAgent(config)

    random_seed()
    config = agent.config
    t0 = time.time()
    scores = []
    scores_window = deque(maxlen=100)  # last 100 scores

    while True:
        if config.log_interval and not agent.total_steps % config.log_interval and len(
                agent.episode_rewards):
            rewards = agent.episode_rewards
            for reward in rewards:
                scores.append(reward)
                scores_window.append(reward)
            agent.episode_rewards = []

            print('\r===> Average Score: {:d} episodes {:.2f}'.format(
                len(scores), np.mean(scores_window)))
            if np.mean(scores_window) >= 1.0:
                print(
                    '\nEnvironment solved in {:d}  episodes!\tAverage Score: {:.2f}'
                    .format(len(scores_window), np.mean(scores_window)))
                torch.save(agent.actor_critic.state_dict(),
                           '../checkpoints/ppo_checkpoint.pth')
                break

            print(
                'Total steps %d, returns %d/%.2f/%.2f/%.2f/%.2f (count/mean/median/min/max), %.2f steps/s'
                % (agent.total_steps, len(rewards), np.mean(rewards),
                   np.median(rewards), np.min(rewards), np.max(rewards),
                   config.log_interval / (time.time() - t0)))

            t0 = time.time()

        agent.step()

    return scores
Example #18
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 13 16:49:00 2022

@author: nbrow
"""
from agent import PPOAgent
import gym
import numpy as np

import os
from UC_Env import UC_Env
#tf.set_random_seed(0)
if __name__ == "__main__":
    # newest gym fixed bugs in 'BipedalWalker-v2' and now it's called 'BipedalWalker-v3'
    env = UC_Env()
    agent = PPOAgent(env)
    agent.run_batch()  # train as PPO
    #agent.run_multiprocesses(num_worker = 16)  # train PPO multiprocessed (fastest)
    #agent.test()
Example #19
                        default=False)
    parser.add_argument('-threshold_score', type=int, default=200)
    parser.add_argument('-best_avg_reward', type=int, default=-200)
    parser.add_argument('-test_env', type=bool, default=False)
    args = parser.parse_args()

    env = gym.make(args.env)
    envs = SubprocVecEnv([make_env(args.env) for i in range(args.n_envs)])

    n_inputs = envs.observation_space.shape[0]
    n_outs = envs.action_space.n

    agent = PPOAgent(lr=args.lr,
                     n_inputs=n_inputs,
                     n_hidden=args.n_hidden,
                     n_outs=n_outs,
                     td_n=args.td_n,
                     ppo_epochs=args.ppo_epochs,
                     mini_batch_size=args.mini_batch_size)
    if args.load_best_pretrained_model:
        agent.load_model('../models/ppo/model.pt')
        print('Loaded pretrained model')

    if args.test_env:
        state = env.reset()
        done = False
        score = 0
        while not done:
            env.render()
            dist, value = agent.step(state)
Example #20
def experiment(hidden_size=64,
               lr=3e-4,
               num_steps=2048,
               mini_batch_size=32,
               ppo_epochs=10,
               threshold_reward=10,
               max_episodes=15,
               nrmlz_adv=True,
               gamma=0.99,
               tau=0.95,
               clip_gradients=True):
    '''

    :param hidden_size: number of neurons for the layers of the model
    :param lr: learning rate
    :param num_steps: maximum duration of one epoch
    :param mini_batch_size: mini batch size for ppo
    :param ppo_epochs: number of epochs for ppo to learn
    :param threshold_reward: what is the goal of the training
    :param max_episodes: maximum duration of the training
    :param nrmlz_adv: True, if advantages should be normalized before PPO
    :param clip_gradients: True if gradients should be clipped after PPO
    :return: list of scores and list of test_rewards
    '''

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    scores_window = deque(maxlen=100)
    test_rewards = []
    moving_averages = []

    env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = PPOAgent(learning_rate=lr,
                     state_size=state_size,
                     action_size=action_size,
                     hidden_size=hidden_size,
                     num_agents=num_agents,
                     random_seed=0,
                     ppo_epochs=ppo_epochs,
                     mini_batch_size=mini_batch_size,
                     normalize_advantages=nrmlz_adv,
                     clip_gradients=clip_gradients,
                     gamma=gamma,
                     tau=tau,
                     device=device)

    #    while episode < max_episodes and not early_stop:
    for episode in tqdm(range(max_episodes)):
        log_probs = []
        values = []
        states_list = []
        actions_list = []
        rewards = []
        masks = []
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        for duration in range(num_steps):

            state = torch.FloatTensor(state).to(device)
            action, value, log_prob = agent.act(state)
            env_info = env.step(action.cpu().data.numpy())[
                brain_name]  # send all actions to the environment

            next_state = env_info.vector_observations  # get next state (for each agent)
            reward = env_info.rewards  # get reward (for each agent)
            dones = np.array(env_info.local_done)  # see if episode finished
            if reward is None:
                pass

            log_probs.append(log_prob)
            values.append(value)
            reward_t = torch.FloatTensor(reward).unsqueeze(1).to(device)
            masks_t = torch.FloatTensor(1 - dones)
            rewards.append(reward_t)
            masks.append(masks_t)
            states_list.append(state)
            actions_list.append(action)

            state = next_state

            if np.any(dones):
                break

        next_state = torch.FloatTensor(state).to(device)
        _, next_value, _ = agent.act(next_state)
        agent.step(states=states_list,
                   actions=actions_list,
                   values=values,
                   log_probs=log_probs,
                   rewards=rewards,
                   masks=masks,
                   next_value=next_value)

        test_mean_reward = test_agent(env, brain_name, agent, device)
        test_rewards.append(test_mean_reward)
        scores_window.append(test_mean_reward)
        moving_averages.append(np.mean(scores_window))
        print('Episode {}, Total score this episode: {}, Last {} average: {}'.
              format(episode, test_mean_reward, min(episode, 100),
                     np.mean(scores_window)))
        if np.mean(scores_window) > threshold_reward:
            agent.save_model(
                f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}_r{threshold_reward}_e{episode}_adv{nrmlz_adv}_{test_mean_reward}.pth"
            )
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(episode, test_mean_reward))
            break

        episode += 1
    env.close()
    return scores_window, test_rewards, moving_averages
Example #21
def main():
    mujoco = True
    render = True
    save_models = False # Save the models 
    training_mode = False # Train the agent or test a memory model
    reward_threshold = None 
    # reward_threshold = 290 

    update_threshold = 800 # Iterations before update the Policy
    plot_batch_threshold = 500 # Episodes included in the partial plot
    episode_max = 30000 

    # update_threshold = 1000 # Iterations before update the Policy
    # plot_batch_threshold = 100 # Episodes included in the partial plot
    # episode_max = 3000 

    if mujoco:
        env_name = 'Humanoid-v2'
        epsilon_discount = 5.0e-3
    else:
        env_name = 'MountainCarContinuous-v0'
        epsilon_discount = 4.0e-4

    env = gym.make(env_name)
    check_folder(env_name)
    env.seed(69)
    np.random.seed(69)
    tf.random.set_seed(69)
    
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    epsilon = 0.9
    agent = PPOAgent(state_dim, action_dim, env, epsilon, mujoco)  

    if not training_mode:
        path = 'test_models/'+env_name
        agent.load_models(path)
    
    rewards = []   
    rewards_means = []   
    batch_rewards = []
    batch_solved_reward = []
    
    times = []
    batch_times = []

    updates_counter = 0
    
    tb_writer = agent.get_summary_writer()
    rewards_metric = tf.keras.metrics.Mean(name='rewards_metric')

    for epis in range(1, episode_max + 1):
        try:
            total_reward, time, updates_counter = run_episode(env, agent, state_dim, render, training_mode, updates_counter, update_threshold)
            print('Episode {} Elapsed time: {} Total reward: {}  Epsilon: {}'.format(epis, time, int(total_reward), agent.get_epsilon()))
            batch_rewards.append(int(total_reward))
            batch_times.append(time)   
            epsilon -= epsilon_discount

            rewards_metric(total_reward)
            with tb_writer.as_default():
                tf.summary.scalar('rewards', rewards_metric.result(), step=epis)
            rewards_metric.reset_states()
            
            if epsilon >= 0.2 and training_mode:
                agent.set_epsilon(epsilon)
            if save_models:
                agent.save_models(epis,'')  
                
            if epis % plot_batch_threshold == 0:
                print('=====================')
                print('|-------Batch-------|')
                print('=====================')
                plot(env_name,batch_rewards,"+",'Rewards of batch until episode {}'.format(epis), 'Episodes','Rewards',str(epis)+'_Batch')
                plot(env_name,batch_times,".",'Times of batch until episode {}'.format(epis), 'Episodes','Times',str(epis)+'_Batch')
                
                rewards_mean = np.mean(batch_rewards)
                print('Max Reward:', np.max(batch_rewards))
                print('Min Reward:', np.min(batch_rewards))
                print('Avg Reward:', rewards_mean)
                print('')

                rewards = rewards + batch_rewards
                times = times + batch_times
                rewards_means.append(rewards_mean)
                    
                batch_rewards = []
                batch_times = []

                print('============================')
                print('|-------Accumulative-------|')
                print('============================')
                plot(env_name,rewards,"+",'Total rewards until episode {}'.format(epis), 'Episodes','Rewards',str(epis)+'_Total')
                plot(env_name,times,".",'Total times until episode {}'.format(epis), 'Episodes','Times',str(epis)+'_Total')

            if reward_threshold:
                if len(batch_solved_reward) == 100:            
                    if np.mean(batch_solved_reward) >= reward_threshold :              
                        rewards = rewards + batch_rewards
                        times = times + batch_times

                        print('============================')
                        print('Reward threshold reached after {} episodes'.format(epis))
                        print('============================')
                        agent.save_models(epis,'solved')  
                        break
                    else:
                        del batch_solved_reward[0]
                        batch_solved_reward.append(total_reward)
                else:
                    batch_solved_reward.append(total_reward)
        
        except KeyboardInterrupt:
            print('Training loop interrupted, saving last models . . .')
            agent.save_models(epis,'forced') 
            plot(env_name,rewards_means,"ro-",'Average reward per batch until episode {}'.format(epis), 'Batches','Rewards',str(epis)+'_BatchAverage')
                
            exit() 
    agent.save_models(episode_max,'finalized')
    plot(env_name,rewards_means,"ro-",'Average reward per batch until episode {}'.format(epis), 'Batches','Rewards',str(epis)+'_BatchAverage')
Example #22
args = get_args()
torch.set_default_tensor_type('torch.cuda.FloatTensor')

writer = SummaryWriter(log_dir='logs/circuit')

if os.name == 'nt':  # windows
    binary = os.path.join('cicuit2', 'circuit_2')
else:
    binary = 'circuit_linux/circuit_linux.x86_64'

env = UnityEnvironment(file_name=binary, worker_id=0)

print(str(env))
train_mode = True

agent = PPOAgent()
agent.model = agent.model.cuda()
load_weights = True
if (load_weights):
    agent.model.load_state_dict(
        torch.load("pretrained_weights/saved_model_ppo_epoch_23040"))

optimizer = torch.optim.Adam(agent.parameters(), lr=args.lr)

default_brain = env.brain_names[0]
brain = env.brains[default_brain]
config = {
    "WrongDirectionPenalty": 0.01,
    'PenaltyCarCollision': 1.0,
    'MaxAngleReward': 35,
    'TimePenalty': 0.015
Example #23
def paralle_train(args):
    logger = SummaryWriter(log_dir='results/{}_{}_{}'.format(
        args.env, args.seed,
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    env = gym.make(args.env)
    env_params = get_env_params(env, args)
    env.close()

    agent = PPOAgent(args, env_params)
    workers, parent_conns, children_conns = workers_initialize(args)

    obs = np.zeros(shape=[args.num_worker, 4, 84, 84], dtype=np.float32)

    # initialize the observation normalizer with random-action rollouts
    print('Start initializing obs normalizer....')
    next_obs_batch = []
    for step in range(args.initialize_episode * args.max_episode_step):
        actions = np.random.randint(0,
                                    env_params['a_dim'],
                                    size=(args.num_worker))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)
        for parent_conn in parent_conns:
            obs_, r, done, info = parent_conn.recv()
            next_obs_batch.append(obs_)

        if len(next_obs_batch) % (10 * args.num_worker) == 0:
            next_obs_batch = np.stack(next_obs_batch)
            agent.normalizer_obs.update(next_obs_batch)
            next_obs_batch = []
    print('Finished initializing obs normalizer....')

    log_reward_ex = 0
    log_reward_in = 0
    log_step = 0
    log_episode = 0
    for i_epoch in range(args.max_epoch):
        epoch_obs, epoch_action, epoch_ri, epoch_re, epoch_mask, epoch_next_obs, epoch_logprob = [], [], [], [], [], [], []
        for i_step in range(args.rollout_len):
            actions, log_probs = agent.choose_action(obs)

            for action, parent_conn in zip(actions, parent_conns):
                parent_conn.send(action)

            batch_re, batch_mask, batch_next_obs = [], [], []
            for parent_conn in parent_conns:
                obs_, r_e, done, info = parent_conn.recv()

                batch_next_obs.append(obs_)
                batch_re.append(r_e)
                batch_mask.append(0 if done else 1)

            batch_next_obs = np.stack(batch_next_obs)
            batch_re = np.stack(batch_re)
            batch_mask = np.stack(batch_mask)
            batch_ri = agent.compute_intrinsic_reward(batch_next_obs.copy())

            # logging bookkeeping for the tracked worker (args.log_env_idx)
            log_reward_ex += batch_re[args.log_env_idx]
            log_reward_in += batch_ri[args.log_env_idx]
            log_step += 1
            if batch_mask[args.log_env_idx] == 0:
                log_episode += 1
                logger.add_scalar('Indicator/Reward_ex', log_reward_ex,
                                  log_episode)
                logger.add_scalar('Indicator/Reward_in', log_reward_in,
                                  log_episode)
                log_reward_ex = 0
                log_reward_in = 0

            epoch_obs.append(obs)
            epoch_action.append(actions)
            epoch_next_obs.append(batch_next_obs)
            epoch_ri.append(batch_ri)
            epoch_re.append(batch_re)
            epoch_mask.append(batch_mask)
            epoch_logprob.append(log_probs)

            obs = batch_next_obs[:, :, :, :]

        epoch_obs = np.stack(epoch_obs)
        epoch_action = np.stack(epoch_action)
        epoch_ri = np.stack(epoch_ri)
        epoch_re = np.stack(epoch_re)
        epoch_mask = np.stack(epoch_mask)
        epoch_next_obs = np.stack(epoch_next_obs)
        epoch_logprob = np.stack(epoch_logprob)

        epoch_obs = np.transpose(epoch_obs, axes=[1, 0, 2, 3, 4])
        epoch_action = np.transpose(epoch_action, axes=[1, 0])
        epoch_ri = np.transpose(epoch_ri, axes=[1, 0])
        epoch_re = np.transpose(epoch_re, axes=[1, 0])
        epoch_mask = np.transpose(epoch_mask, axes=[1, 0])
        epoch_next_obs = np.transpose(epoch_next_obs, axes=[1, 0, 2, 3, 4])
        epoch_logprob = np.transpose(epoch_logprob, axes=[1, 0])

        loss_rnd, loss_a, loss_c = agent.update(epoch_obs, epoch_action,
                                                epoch_ri, epoch_re, epoch_mask,
                                                epoch_next_obs, epoch_logprob)

        used_sample_num = args.rollout_len * args.num_worker * i_epoch
        logger.add_scalar('Loss/loss_RND', loss_rnd, used_sample_num)
        logger.add_scalar('Loss/loss_a', loss_a, used_sample_num)
        logger.add_scalar('Loss/loss_c', loss_c, used_sample_num)

        if i_epoch % args.save_model_interval == 0:
            agent.save_model(remark='{}'.format(i_epoch))
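Example #23 logs a loss_RND term and calls agent.compute_intrinsic_reward, which points to Random Network Distillation: the intrinsic reward is the prediction error of a trained predictor network against a frozen, randomly initialized target network. A minimal PyTorch sketch of that computation (the network shape, the single-frame 84x84 input, and all names are assumptions; the agent's actual implementation is not shown in the excerpt):

import torch
import torch.nn as nn

# Hypothetical RND sketch: intrinsic reward = per-sample MSE between a trained
# predictor and a frozen random target network, evaluated on the next observation.
class RNDNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64 * 9 * 9, 512),  # 84x84 input -> 9x9 feature map
        )

    def forward(self, x):
        return self.body(x)


def compute_intrinsic_reward(predictor, target, next_obs):
    """next_obs: float tensor of shape [batch, 1, 84, 84] (normalized last frame)."""
    with torch.no_grad():
        target_feature = target(next_obs)
    predict_feature = predictor(next_obs)
    return (predict_feature - target_feature).pow(2).mean(dim=1)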
Example #24
0
def main():
    parser = argparse.ArgumentParser(description='Reinforce')
    parser.add_argument('--data',
                        type=str,
                        default=config.data_dir,
                        help='location of the data corpus')
    parser.add_argument('--unk_threshold',
                        type=int,
                        default=config.unk_threshold,
                        help='minimum word frequency to be in dictionary')
    parser.add_argument('--alice_model_file',
                        type=str,
                        help='Alice model file')
    parser.add_argument('--bob_model_file', type=str, help='Bob model file')
    parser.add_argument('--output_model_file',
                        type=str,
                        help='output model file')
    parser.add_argument('--context_file', type=str, help='context file')
    parser.add_argument('--temperature',
                        type=float,
                        default=config.rl_temperature,
                        help='temperature')
    parser.add_argument('--cuda',
                        action='store_true',
                        default=config.cuda,
                        help='use CUDA')
    parser.add_argument('--verbose',
                        action='store_true',
                        default=config.verbose,
                        help='print out conversations')
    parser.add_argument('--seed',
                        type=int,
                        default=config.seed,
                        help='random seed')
    parser.add_argument(
        '--score_threshold',
        type=int,
        default=config.rl_score_threshold,
        help='a dialog counts as successful only if its score exceeds score_threshold'
    )
    parser.add_argument('--log_file',
                        type=str,
                        default='',
                        help='log successful dialogs to file for training')
    parser.add_argument('--smart_bob',
                        action='store_true',
                        default=False,
                        help='make Bob smart again')
    parser.add_argument('--gamma',
                        type=float,
                        default=config.rl_gamma,
                        help='discount factor')
    parser.add_argument('--eps',
                        type=float,
                        default=config.rl_eps,
                        help='eps greedy')
    parser.add_argument('--nesterov',
                        action='store_true',
                        default=config.nesterov,
                        help='enable nesterov momentum')
    parser.add_argument('--momentum',
                        type=float,
                        default=config.rl_momentum,
                        help='momentum for sgd')
    parser.add_argument('--lr',
                        type=float,
                        default=config.rl_lr,
                        help='learning rate')
    parser.add_argument('--clip',
                        type=float,
                        default=config.rl_clip,
                        help='gradient clip')
    parser.add_argument('--rl_lr',
                        type=float,
                        default=config.rl_reinforcement_lr,
                        help='RL learning rate')
    parser.add_argument('--rl_clip',
                        type=float,
                        default=config.rl_reinforcement_clip,
                        help='RL gradient clip')
    parser.add_argument('--ref_text',
                        type=str,
                        help='file with the reference text')
    parser.add_argument('--bsz',
                        type=int,
                        default=config.rl_bsz,
                        help='batch size')
    parser.add_argument('--sv_train_freq',
                        type=int,
                        default=config.rl_sv_train_freq,
                        help='supervision train frequency')
    parser.add_argument('--nepoch',
                        type=int,
                        default=config.rl_nepoch,
                        help='number of epochs')
    parser.add_argument('--visual',
                        action='store_true',
                        default=config.plot_graphs,
                        help='plot graphs')
    parser.add_argument('--domain',
                        type=str,
                        default=config.domain,
                        help='domain for the dialogue')
    parser.add_argument('--eps_clip',
                        type=float,
                        default=0.2,
                        help='clipping threshold for PPO surrogate loss 2')
    parser.add_argument('--ppo_epochs',
                        type=int,
                        default=5,
                        help='Number of epochs to perform PPO policy update')
    # TODO: split policy update epochs from supervised model update
    args = parser.parse_args()

    device_id = utils.use_cuda(args.cuda)
    logging.info("Starting training using pytorch version:%s" %
                 (str(torch.__version__)))
    logging.info("CUDA is %s" % ("enabled. Using device_id:"+str(device_id) + " version:" \
        +str(torch.version.cuda) + " on gpu:" + torch.cuda.get_device_name(0) if args.cuda else "disabled"))

    alice_model = utils.load_model(args.alice_model_file)
    # we don't want to use Dropout during RL
    alice_model.eval()
    # Alice is an RL-based agent, meaning she keeps learning during self-play
    logging.info("Creating RlAgent from alice_model: %s" %
                 (args.alice_model_file))
    alice = PPOAgent(alice_model, args, name="Alice")

    # we keep Bob frozen, i.e. we don't update his parameters
    logging.info("Creating Bob's (--smart_bob) LstmRolloutAgent" if args.smart_bob \
        else "Creating Bob's (not --smart_bob) LstmAgent" )
    bob_ty = LstmRolloutAgent if args.smart_bob else LstmAgent
    bob_model = utils.load_model(args.bob_model_file)
    bob_model.eval()
    bob = bob_ty(bob_model, args, name='Bob')

    logging.info("Initializing communication dialogue between Alice and Bob")
    dialog = Dialog([alice, bob], args)
    logger = DialogLogger(verbose=args.verbose, log_file=args.log_file)
    ctx_gen = ContextGenerator(args.context_file)

    logging.info(
        "Building word corpus, requiring minimum word frequency of %d for dictionary"
        % (args.unk_threshold))
    corpus = data.WordCorpus(args.data, freq_cutoff=args.unk_threshold)
    engine = Engine(alice_model, args, device_id, verbose=False)

    logging.info("Starting Reinforcement Learning")
    reinforce = PPO(dialog, ctx_gen, args, engine, corpus, logger)
    reinforce.run()

    logging.info("Saving updated Alice model to %s" % (args.output_model_file))
    utils.save_model(alice.model, args.output_model_file)
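Example #24 exposes --eps_clip as the clipping threshold for the second PPO surrogate term. For reference, a minimal sketch of the clipped surrogate objective that this parameter controls (tensor names are illustrative assumptions; only the clipping formula itself is standard PPO, not code from this example):

import torch

# Hypothetical sketch of the PPO clipped surrogate loss parameterized by eps_clip.
def ppo_clipped_loss(new_logprobs, old_logprobs, advantages, eps_clip=0.2):
    ratio = torch.exp(new_logprobs - old_logprobs)            # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * advantages                                # unclipped surrogate
    surr2 = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip) * advantages
    return -torch.min(surr1, surr2).mean()                    # negate: maximize objective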