Example #1
    def __init__(self, model, buffer, log_interval):
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        # NOTE: `runner` is not a constructor argument; it must exist in the enclosing scope
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None
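
Every example on this page relies on an `EpisodeStats` helper constructed as `EpisodeStats(nsteps, nenvs)` and used through `feed(rewards, dones)`, `mean_length()` and `mean_reward()`, but none of them shows it. The sketch below is a minimal stand-in assuming that interface; the rolling-buffer length and the (nenvs, nsteps) reshape convention are assumptions, not the reference implementation.

from collections import deque

import numpy as np


class EpisodeStats:
    def __init__(self, nsteps, nenvs, maxlen=40):
        self.episode_rewards = [[] for _ in range(nenvs)]
        self.lenbuffer = deque(maxlen=maxlen)  # rolling buffer of episode lengths
        self.rewbuffer = deque(maxlen=maxlen)  # rolling buffer of episode returns
        self.nsteps = nsteps
        self.nenvs = nenvs

    def feed(self, rewards, masks):
        # one reward/done flag per env per step of the rollout
        rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
        masks = np.reshape(masks, [self.nenvs, self.nsteps])
        for i in range(self.nenvs):
            for j in range(self.nsteps):
                self.episode_rewards[i].append(rewards[i][j])
                if masks[i][j]:
                    # episode (or life) ended: record its length and return, then reset
                    self.lenbuffer.append(len(self.episode_rewards[i]))
                    self.rewbuffer.append(float(np.sum(self.episode_rewards[i])))
                    self.episode_rewards[i] = []

    def mean_length(self):
        return np.mean(self.lenbuffer) if self.lenbuffer else 0

    def mean_reward(self):
        return np.mean(self.rewbuffer) if self.rewbuffer else 0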
Example #2
    def __init__(self, runner, model, buffer, log_interval, evaluate_env,
                 evaluate_interval, evaluate_n, logdir):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None

        self.evaluate_env = evaluate_env
        self.evaluate_interval = evaluate_interval
        self.evaluate_n = evaluate_n

        if logdir:
            self.summary_writer = tf.summary.FileWriter(logdir=logdir)
            self.logdir = logdir
            self.best_mean_reward = 0

            self.evaluation_f = open(logdir + '/evaluation_monitor.csv', "wt")
            self.evaluation_logger = csv.DictWriter(self.evaluation_f,
                                                    fieldnames=('r', 'l'))
            self.evaluation_logger.writeheader()
        else:
            self.summary_writer = None
Example #3
    def __init__(self, runner, model, buffer, log_interval, stats_interval):
        """

        :param Runner runner:
        :param Model model:
        :param Buffer buffer:
        :param int log_interval:
        :param int stats_interval:
        """
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None

        file_formatter = logging.Formatter('%(asctime)s %(message)s')
        stats_logger = logging.getLogger('stats_logger')
        stats_logger.setLevel(logging.INFO)
        # logger handlers
        stats_fh = logging.FileHandler(os.path.join(logger.get_dir(), 'results.log'))
        stats_fh.setFormatter(file_formatter)
        stats_logger.addHandler(stats_fh)

        self.stats_logger = stats_logger
        self.stats_interval = stats_interval
Example #4
class Acer():
    def __init__(self, runner, model, buffer, log_interval, curiosity):
        self.runner = runner
        self.model = model
        self.curiosity = curiosity
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks, next_states, icm_actions = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()


        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        # print("obs shape {} , next obs shape {} ".format(np.shape(obs),np.shape(next_states)))
        # next_states = next_states.reshape(runner.batch_ob_shape)
        
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        if self.curiosity and on_policy:
            # next_states and icm_actions are only produced by on-policy rollouts
            icm_actions = icm_actions.reshape([runner.batch_ob_shape[0]])
            next_states = next_states.reshape(runner.batch_ob_shape)

            names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps,
                                                on_policy=on_policy, next_states=next_states, icm_actions=icm_actions)
        else:
            names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps,
                                                on_policy=on_policy, next_states=None, icm_actions=None)

        if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
            # Thus, this is mean until end of life, not end of episode.
            # For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()
Example #5
    def __init__(self, runner, model, buffer, log_interval, curiosity, icm):
        self.runner = runner
        self.curiosity = curiosity
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None
        self.icm = icm
Example #6
    def __init__(self, runner, model, buffer, log_interval):
        """
        Wrapper for the ACER model object

        :param runner: (AbstractEnvRunner) The runner to learn the policy of an environment for a model
        :param model: (Model) The model to learn
        :param buffer: (Buffer) The observation buffer
        :param log_interval: (int) The number of timesteps before logging.
        """
        super(Acer, self).__init__()
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.t_start = None
        self.episode_stats = EpisodeStats(runner.n_steps, runner.n_env)
        self.steps = None
Example #7
    def __init__(self, runner, model, buffer, log_interval):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None
Example #8
def learn(policy, env, seed, ob_space, ac_space, nsteps=5, total_timesteps=int(80e6),
          vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear',
          epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100, sil_update=4,
          sil_beta=0.0, save_dir=None):
    set_global_seeds(seed)
    #ob_space = env.observation_space
    #ac_space = env.action_space
    # print('ac_space:',ac_space)
    nenvs = env.num_envs
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, sil_update=sil_update, sil_beta=sil_beta)
    runner = Runner(env, model, ob_space=ob_space, nsteps=nsteps, gamma=gamma)

    episode_stats = EpisodeStats(nsteps, nenvs)
    nbatch = nenvs * nsteps
    tstart = time.time()
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, raw_rewards = runner.run()
        episode_stats.feed(raw_rewards, masks)
        policy_loss, value_loss, policy_entropy, v_avg = model.train(
            obs, states, rewards, masks, actions, values)
        sil_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train()
        model.save(save_dir)

        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular(
                "episode_reward", episode_stats.mean_reward())
            logger.record_tabular("best_episode_reward",
                                  float(model.sil.get_best_reward()))
            if sil_update > 0:
                logger.record_tabular("sil_num_episodes",
                                      float(model.sil.num_episodes()))
                logger.record_tabular("sil_valid_samples", float(sil_samples))
                logger.record_tabular(
                    "sil_steps", float(model.sil.num_steps()))
            logger.dump_tabular()

    env.close()
    return model
Example #9
class Acer():
    def __init__(self, runner, model, buffer, log_interval):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()


        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps)

        if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
            # Thus, this is mean until end of life, not end of episode.
            # For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()
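
The `call` method above performs a single update; the outer loop that drives it is not part of this example. Below is a minimal sketch of such a driver, in the style of the baselines ACER `learn` loop (one on-policy rollout per iteration plus a Poisson-distributed number of replayed off-policy updates). The function name and the `total_timesteps`, `replay_ratio` and `replay_start` parameters are illustrative assumptions.

import time

import numpy as np


def train_acer(acer, runner, buffer, total_timesteps, replay_ratio=4, replay_start=10000):
    acer.tstart = time.time()
    # one batch of runner.nbatch environment steps per iteration
    for acer.steps in range(0, total_timesteps, runner.nbatch):
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer is not None and buffer.has_atleast(replay_start):
            # replayed minibatches consume no additional simulation steps
            for _ in range(np.random.poisson(replay_ratio)):
                acer.call(on_policy=False)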
Example #10
class Acer():
    def __init__(self, runner, model, buffer, log_interval, stats_interval):
        """

        :param Runner runner:
        :param Model model:
        :param Buffer buffer:
        :param int log_interval:
        :param int stats_interval:
        """
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None

        file_formatter = logging.Formatter('%(asctime)s %(message)s')
        stats_logger = logging.getLogger('stats_logger')
        stats_logger.setLevel(logging.INFO)
        # logger handlers
        stats_fh = logging.FileHandler(os.path.join(logger.get_dir(), 'results.log'))
        stats_fh.setFormatter(file_formatter)
        stats_logger.addHandler(stats_fh)

        self.stats_logger = stats_logger
        self.stats_interval = stats_interval

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps)

        if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("time", time.strftime('%m-%d %H:%M'))
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
            logger.record_tabular("fph", '%.2fM' % ((steps/1e6)/((time.time() - self.tstart)/3600)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
            # Thus, this is mean until end of life, not end of episode.
            # For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()

        if on_policy and (int(steps/runner.nbatch) % self.stats_interval == 0):
            if hasattr(self.runner.env, 'stats'):
                envs_stats = self.runner.env.stats()
                avg_stats = {}

                # keys_of_lists: for each list-valued key, holds a list whose i-th element
                # counts how many envs have an i-th element in their list for that key
                keys_of_lists = {}  # type: dict[str, list[int]]

                # init average stats
                for stats in envs_stats:
                    for key, val in stats.items():
                        if key not in avg_stats:
                            if isinstance(val, list):
                                avg_stats[key] = []
                                keys_of_lists[key] = []
                            else:
                                avg_stats[key] = 0

                # collect stats for each environment
                for stats in envs_stats:
                    for key, val in stats.items():
                        if isinstance(val, list):
                            avg_list = avg_stats[key]
                            counts = keys_of_lists[key]
                            len_diff = len(val) - len(counts)
                            if len_diff > 0:
                                counts.extend([0]*len_diff)
                                avg_list.extend([0]*len_diff)
                            for i, v in enumerate(val):
                                counts[i] += 1
                                avg_list[i] += v
                        else:
                            avg_stats[key] += val

                # average stats across envs
                for key, val in avg_stats.items():
                    if isinstance(val, list):
                        counts = keys_of_lists[key]
                        for i, v in enumerate(val):
                            val[i] = v / counts[i]
                    else:
                        avg_stats[key] = val / len(envs_stats)

                avg_stats['global_t'] = steps
                self.stats_logger.info(' '.join('%s=%s' % (key, val) for key, val in avg_stats.items()))
Example #11
class Acer(object):
    def __init__(self, runner, model, buffer, log_interval):
        """
        Wrapper for the ACER model object

        :param runner: (AbstractEnvRunner) The runner to learn the policy of an environment for a model
        :param model: (Model) The model to learn
        :param buffer: (Buffer) The observation buffer
        :param log_interval: (int) The number of timesteps before logging.
        """
        super(Acer, self).__init__()
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.t_start = None
        self.episode_stats = EpisodeStats(runner.n_steps, runner.n_env)
        self.steps = None

    def call(self, on_policy):
        """
        Call a step with ACER

        :param on_policy: (bool) To step on policy and not on buffer
        """
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.n_batch])
        rewards = rewards.reshape([runner.n_batch])
        mus = mus.reshape([runner.n_batch, runner.n_act])
        dones = dones.reshape([runner.n_batch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                            model.initial_state, masks, steps)

        if on_policy and (int(steps / runner.n_batch) % self.log_interval
                          == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps",
                                  int(steps / (time.time() - self.t_start)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
            # not just at the terminal state. Thus, this is mean until end of life, not end of episode.
            # For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length",
                                  self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward",
                                  self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()
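
The docstring above describes `buffer` as the observation buffer; its `put`/`get`/`has_atleast` interface is what every `call` implementation on this page uses. The toy stand-in below assumes only that interface; the real ACER buffer also encodes/decodes stacked frames (`enc_obs`), which is deliberately omitted, so the class name and sizes here are illustrative only.

import random
from collections import deque


class MinimalReplayBuffer:
    def __init__(self, size=50000, nsteps=20):
        self.nsteps = nsteps
        # keep whole rollouts; capacity is expressed in environment frames
        self.rollouts = deque(maxlen=max(1, size // nsteps))

    def put(self, enc_obs, actions, rewards, mus, dones, masks):
        self.rollouts.append((enc_obs, actions, rewards, mus, dones, masks))

    def has_atleast(self, frames):
        return len(self.rollouts) * self.nsteps >= frames

    def get(self):
        # the real buffer returns decoded observations rather than enc_obs
        return random.choice(self.rollouts)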
Example #12
class Acer():
    def __init__(self,
                 runner,
                 model,
                 buffer,
                 log_interval,
                 expert_buffer=None):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None
        self.expert_buffer = []  # note: the expert_buffer argument is never used; a fresh list is always created

        #self.flag = 1

    def call(self, perform, save_networks, use_expert, expert, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        expert_buffer = self.expert_buffer

        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()

            runner.myrun()
            # if self.flag>0:
            # 	print(self.flag,'=================================')
            # 	print(enc_obs, obs, actions, rewards, mus, dones, masks)
            # 	self.flag = self.flag -1
            self.episode_stats.feed(rewards, dones)
            if buffer is not None and not perform:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()
            #enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()

        if not perform:
            # reshape stuff correctly
            obs = obs.reshape(runner.batch_ob_shape)
            actions = actions.reshape([runner.nbatch])
            rewards = rewards.reshape([runner.nbatch])
            mus = mus.reshape([runner.nbatch, runner.nact])
            dones = dones.reshape([runner.nbatch])
            masks = masks.reshape([runner.batch_ob_shape[0]])

            if not use_expert:
                names_ops, values_ops = model.train(obs, actions, rewards,
                                                    dones, mus,
                                                    model.initial_state, masks,
                                                    steps)
            else:
                expert_obs, expert_actions, expert_rewards, expert_mus, expert_dones, expert_masks = expert.get(
                )
                expert_obs = expert_obs.reshape(runner.batch_ob_shape)
                expert_actions = expert_actions.reshape([runner.nbatch])
                expert_rewards = expert_rewards.reshape([runner.nbatch])
                expert_mus = expert_mus.reshape([runner.nbatch, runner.nact])
                expert_dones = expert_dones.reshape([runner.nbatch])
                expert_masks = expert_masks.reshape([runner.batch_ob_shape[0]])
                names_ops, values_ops = model.expert_train(
                    obs, actions, rewards, dones, mus, model.initial_state,
                    masks, steps, expert_obs, expert_actions, expert_rewards,
                    expert_mus, expert_dones, expert_masks)

            if on_policy and (int(steps / runner.nbatch) % self.log_interval
                              == 0):
                logger.record_tabular("total_timesteps", steps)
                logger.record_tabular("fps",
                                      int(steps / (time.time() - self.tstart)))
                # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
                # Thus, this is mean until end of life, not end of episode.
                # For true episode rewards, see the monitor files in the log folder.
                logger.record_tabular("mean_episode_length",
                                      self.episode_stats.mean_length())
                logger.record_tabular("mean_episode_reward",
                                      self.episode_stats.mean_reward())
                for name, val in zip(names_ops, values_ops):
                    logger.record_tabular(name, float(val))
                logger.dump_tabular()

                if save_networks and (int(steps / runner.nbatch) %
                                      (self.log_interval * 10) == 0):
                    model.save(int(steps))

        else:  #if perform
            expert_buffer.append(
                [enc_obs, actions, rewards, mus, dones, masks])

            if len(expert_buffer) > 0 and len(expert_buffer) % 100 == 0:
                expert_dir = os.path.join('./expert') + '/'
                if not os.path.exists(expert_dir):
                    os.makedirs(expert_dir)
                pwritefile = open(os.path.join(expert_dir, 'expert_test.pkl'),
                                  'wb')
                pickle.dump(expert_buffer, pwritefile, -1)
                pwritefile.close()
                logger.info('Successfully Saved the Expert Data')

            obs = obs.reshape(runner.batch_ob_shape)
            actions = actions.reshape([runner.nbatch])
            rewards = rewards.reshape([runner.nbatch])
            mus = mus.reshape([runner.nbatch, runner.nact])
            dones = dones.reshape([runner.nbatch])
            masks = masks.reshape([runner.batch_ob_shape[0]])
            if on_policy and (int(steps / runner.nbatch) % self.log_interval
                              == 0):
                logger.record_tabular("total_timesteps", steps)
                logger.record_tabular("fps",
                                      int(steps / (time.time() - self.tstart)))
                # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
                # Thus, this is mean until end of life, not end of episode.
                # For true episode rewards, see the monitor files in the log folder.
                logger.record_tabular("mean_episode_length",
                                      self.episode_stats.mean_length())
                logger.record_tabular("mean_episode_reward",
                                      self.episode_stats.mean_reward())

                logger.dump_tabular()
Example #13
def learn(policy,
          env,
          seed,
          nsteps,
          nstack,
          total_timesteps,
          gamma,
          vf_coef,
          ent_coef,
          max_grad_norm,
          lr,
          lrschedule,
          rprop_epsilon=1e-5,
          rprop_alpha=0.99,
          log_interval=100):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK

    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nenvs=nenvs,
                  nsteps=nsteps,
                  nstack=nstack,
                  num_procs=num_procs,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  rprop_alpha=rprop_alpha,
                  rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule)
    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)

    episode_stats = EpisodeStats(nsteps, nenvs)

    nbatch = nenvs * nsteps
    tstart = time.time()

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, raw_rewards, returns, masks, actions, values = runner.run(
        )
        ravg_norm_obs, policy_loss, value_loss, policy_entropy = model.train(
            obs, states, returns, masks, actions, values)
        #policy_loss, value_loss, policy_entropy = model.train(obs, states, returns, masks, actions, values)

        episode_stats.feed(raw_rewards, masks)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("avg_norm_obs", float(ravg_norm_obs))
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss * vf_coef))
            logger.record_tabular("entropy_loss",
                                  float(-1 * policy_entropy * ent_coef))
            logger.record_tabular(
                "total_loss",
                float(policy_loss - policy_entropy * ent_coef +
                      value_loss * vf_coef))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("mean_episode_length",
                                  episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward",
                                  episode_stats.mean_reward())
            logger.dump_tabular()
    env.close()
Example #14
def learn(policy,
          env,
          seed,
          nsteps=5,
          total_timesteps=int(80e6),
          vf_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          lrschedule='linear',
          epsilon=1e-5,
          alpha=0.99,
          gamma=0.99,
          log_interval=100):
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nenvs=nenvs,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  alpha=alpha,
                  epsilon=epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule)
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    episode_stats = EpisodeStats(nsteps, nenvs)
    nbatch = nenvs * nsteps
    tstart = time.time()
    global ifnext, a, b, obsuse, w
    #    ifnext,a,b,obsuse=runner.runset(0)
    #    np.save('runner2obsaver.npy',obsuse)
    b = 5197

    ifnext = 0
    global ifnext1, a1, b1, obsuse1
    obsuse = np.load('runner2obsaver.npy')
    obsuse1 = np.load('runner2obsaver.npy')
    b1 = 5387
    k = 0
    w = 0
    mean_r = np.zeros(1100000)
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = runner.run()
        episode_stats.feed(rewards, masks)
        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if 2 * update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("episode_reward",
                                  episode_stats.mean_reward())
            logger.dump_tabular()
            k = k + 1
            mean_r[k] = episode_stats.mean_reward()
            np.save('mean_r.npy', mean_r)
            print(episode_stats.mean_reward())
    env.close()
    return model
Example #15
class Acer():
    def __init__(self, runner, model, buffer, log_interval, evaluate_env,
                 evaluate_interval, evaluate_n, logdir):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None

        self.evaluate_env = evaluate_env
        self.evaluate_interval = evaluate_interval
        self.evaluate_n = evaluate_n

        if logdir:
            self.summary_writer = tf.summary.FileWriter(logdir=logdir)
            self.logdir = logdir
            self.best_mean_reward = 0

            self.evaluation_f = open(logdir + '/evaluation_monitor.csv', "wt")
            self.evaluation_logger = csv.DictWriter(self.evaluation_f,
                                                    fieldnames=('r', 'l'))
            self.evaluation_logger.writeheader()
        else:
            self.summary_writer = None

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                            model.initial_state, masks, steps)

        if on_policy and (int(steps / runner.nbatch) % self.evaluate_interval
                          == 0) and self.summary_writer:
            rewards_mean, length_mean = self.evaluate(self.evaluate_env,
                                                      self.evaluate_n)
            # logger.record_tabular("mean_episode_length", rewards_mean)
            # logger.record_tabular("mean_episode_reward", length_mean)
            stats = tf.Summary(value=[
                tf.Summary.Value(tag="reward_mean", simple_value=rewards_mean),
                tf.Summary.Value(tag="length_mean", simple_value=length_mean),
            ], )
            self.summary_writer.add_summary(stats, steps)

            self.evaluation_logger.writerow({
                'r': rewards_mean,
                'l': length_mean
            })
            self.evaluation_f.flush()

            if rewards_mean > self.best_mean_reward:
                self.best_mean_reward = rewards_mean
                self.model.save(self.logdir + '/' + str(steps // 1e4) + '_' +
                                str(rewards_mean))

        if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps",
                                  int(steps / (time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
            # Thus, this is mean until end of life, not end of episode.
            # For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length",
                                  self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward",
                                  self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()

    def evaluate(self, env, n):
        reward_total = 0
        length_total = 0
        for i in range(n):
            reward_episode, length_episode = self.runner.evaluate(env)
            reward_total += reward_episode
            length_total += length_episode

        reward_mean = reward_total / n
        length_mean = length_total / n
        return reward_mean, length_mean
Example #16
class Acer():
    def __init__(self, model, buffer, log_interval):
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None

    def call(self, mini_batch, on_policy):
        model, buffer, steps = self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = mini_batch
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        # FIXME: Remove dependency on runner
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                            model.initial_state, masks, steps)

        if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps",
                                  int(steps / (time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
            # Thus, this is mean until end of life, not end of episode.
            # For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length",
                                  self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward",
                                  self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()

    def learn(self, mini_batch):
        # total_timesteps, nbatch, replay_ratio, replay_start and save_interval
        # are assumed to be defined in the enclosing scope
        if self.tstart is None:
            self.tstart = time.time()

        # nbatch samples, 1 on_policy call and multiple off-policy calls
        for self.steps in range(0, total_timesteps, nbatch):
            self.call(mini_batch, on_policy=True)
            if replay_ratio > 0 and self.buffer.has_atleast(replay_start):
                n = np.random.poisson(replay_ratio)
                for _ in range(n):
                    # no simulation steps in this
                    self.call(mini_batch, on_policy=False)
            if save_interval and (self.steps % save_interval == 0
                                  or self.steps == 1) and logger.get_dir():
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
                os.makedirs(checkdir, exist_ok=True)
                savepath = osp.join(checkdir, '%.8i' % self.steps)
                print('Saving to', savepath)
                self.model.save(savepath)
Example #17
def learn(policy,
          env,
          seed,
          nsteps=5,
          total_timesteps=int(80e6),
          vf_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          lrschedule='linear',
          epsilon=1e-5,
          alpha=0.99,
          gamma=0.99,
          log_interval=100,
          sil_update=4,
          sil_beta=0.0):
    set_global_seeds(seed)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nenvs=nenvs,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  alpha=alpha,
                  epsilon=epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule,
                  sil_update=sil_update,
                  sil_beta=sil_beta)
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    episode_stats = EpisodeStats(nsteps, nenvs)
    nbatch = nenvs * nsteps
    tstart = time.time()

    global ifnext, a, b, obsuse, w
    #    ifnext,a,b,obsuse=runner.runset(0)
    #    np.save('runner2obsaver.npy',obsuse)
    b = 5197
    a = 4925
    ifnext = 0
    global ifnext1, a1, b1, obsuse1
    obsuse = np.load('runner2obsaver.npy')
    obsuse1 = np.load('runner2obsaver.npy')
    b1 = 5387
    print(ifnext, a, b)
    k = 0
    w = 0
    mean_r = np.zeros(1100000)
    best_r = np.zeros(1100000)
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, raw_rewards = runner.run(
        )
        #        obs,raw_rewards,gelabel=obsaver(obs,raw_rewards,gelabel,masks)
        #        rewards = np.sign(raw_rewards)
        #        if sum(raw_rewards)>99:
        #            print(raw_rewards,masks)
        episode_stats.feed(raw_rewards, masks)
        policy_loss, value_loss, policy_entropy, v_avg = model.train(
            obs, states, rewards, masks, actions, values)
        sil_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train()
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if 2 * update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            #print(values,rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("episode_reward",
                                  episode_stats.mean_reward())

            logger.record_tabular("best_episode_reward",
                                  float(model.sil.get_best_reward()))
            mean_r[k] = episode_stats.mean_reward()
            best_r[k] = float(model.sil.get_best_reward())
            print(episode_stats.mean_reward(),
                  float(model.sil.get_best_reward()))
            k = k + 1
            np.save('mean_r.npy', mean_r)
            np.save('best_r.npy', best_r)
            if sil_update > 0:
                logger.record_tabular("sil_num_episodes",
                                      float(model.sil.num_episodes()))
                logger.record_tabular("sil_valid_samples", float(sil_samples))
                logger.record_tabular("sil_steps",
                                      float(model.sil.num_steps()))
            logger.dump_tabular()


#            if mean_r[k]>0.8 and k>4:  # mean reward is very high, essentially converged
#                global ifnext1,a1,b1,obsuse1
#                ifnext1,a1,b1,obsuse1=runner.runset(best_r[k])
#                print(best_r[k])
#                np.save('runner2obsaver1.npy',obsuse1)
#                print(ifnext1,a1,b1)
#                w=w+1

    env.close()
    return model
Example #18
class Acer():
    def __init__(self, runner, model, buffer, log_interval, curiosity, icm):
        self.runner = runner
        self.curiosity = curiosity
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None
        self.icm = icm

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            # on-policy rollouts also return the successor states used by the curiosity module
            # enc_obs, enc_next_obs, obs, actions, rewards, mus, dones, masks, next_states, icm_actions, icm_rewards = runner.run()
            enc_obs, enc_next_obs, obs, actions, rewards, mus, dones, masks, next_states, icm_actions = runner.run()

            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                # buffer.put(enc_obs, enc_next_obs, actions, rewards, mus, dones, masks, icm_actions, icm_rewards)
                buffer.put(enc_obs, enc_next_obs, actions, rewards, mus, dones, masks, icm_actions)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            # obs, next_obs, actions, rewards, mus, dones, masks, icm_actions, icm_rewards = buffer.get()
            obs, next_obs, actions, rewards, mus, dones, masks, icm_actions = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)

        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        if self.icm is not None:
            # icm_rewards = icm_rewards.reshape([runner.batch_ob_shape[0]])
            icm_actions = icm_actions.reshape([runner.batch_ob_shape[0]])

            if not on_policy:
                next_states = next_obs.reshape(runner.batch_ob_shape)
            else:
                next_states = next_states.reshape(runner.batch_ob_shape)

            names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps,
                                                next_states, icm_actions)
        else:
            names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps,
                                                next_states=None, icm_actions=None)

        if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
            # Thus, this is mean until end of life, not end of episode.
            # For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()
Example #19
class Acer():
    def __init__(self, runner, model, buffer, log_interval, eval_env):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None
        self.eval_env = eval_env

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                            model.initial_state, masks, steps)

        if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            epilen = 0
            epinfos = []
            if self.eval_env is not None:
                # only reset/step the evaluation env when one is configured
                eval_obs = self.eval_env.reset()
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(10000000):
                    eval_action, eval_q, _, _ = self.model.step(eval_obs)
                    eval_obs, eval_r, eval_done, eval_info = self.eval_env.step(
                        eval_action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_episode_reward += eval_r
                    for info in eval_info:
                        maybeepinfo = info.get('episode')
                        if maybeepinfo: epinfos.append(maybeepinfo)
                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            epilen += 1
                    if epilen >= 10:
                        break
            if self.eval_env is not None:
                logger.record_tabular(
                    'eval_eplenmean',
                    np.mean(self.safemean([epinfo['l']
                                           for epinfo in epinfos])))
                logger.record_tabular(
                    'eval_eprewmean',
                    np.mean(self.safemean([epinfo['r']
                                           for epinfo in epinfos])))
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps",
                                  int(steps / (time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
            # Thus, this is mean until end of life, not end of episode.
            # For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length",
                                  self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward",
                                  self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()

    def safemean(self, xs):
        return np.nan if len(xs) == 0 else np.mean(xs)