Example #1
    def __init__(self, *, task_name):
        logger.info(f"Executing training...")

        tmp_env = get_env(record=False)
        self.is_done = tmp_env.unwrapped.is_done
        self.eval_tasks = {task_name: tmp_env.tasks()[task_name]}
        self.exploitation_task = tmp_env.tasks()[task_name]
        del tmp_env

        # Constitute the state of Trainable
        ex.step_i = 0
        self.model = get_model()
        self.reward_model = get_reward_model()
        self.model_optimizer = get_model_optimizer(self.model.parameters())
        self.reward_model_optimizer = get_reward_model_optimizer(
            self.reward_model.parameters())
        self.buffer = get_buffer()
        self.agent = get_agent(mode='train')
        self.agent.setup_normalizer(self.buffer.normalizer)
        self.stats = EpisodeStats(self.eval_tasks)
        self.last_avg_eval_score = None
        self.neptune_ex = None
        ex.mlog = None

        # Not considered part of the state
        self.new_experiment = True  # I need to know if I had to create a new experiment (neptune) or continue an old one
        self.random_agent = get_random_agent()

        self._common_setup()
Example #2
def run_episode(env, agent, deterministic, do_training=True, rendering=False, max_timesteps=1000):
    """
    This methods runs one episode for a gym environment. 
    deterministic == True => agent executes only greedy actions according the Q function approximator (no random actions).
    do_training == True => train agent
    """
    
    stats = EpisodeStats()        # save statistics like episode reward or action usage
    state = env.reset()

    step = 0
    while True:
        
        action_id = agent.act(state=state, deterministic=deterministic)
        next_state, reward, terminal, info = env.step(action_id)

        if do_training:  
            agent.train(state, action_id, next_state, reward, terminal)

        stats.step(reward, action_id)

        state = next_state
        
        if rendering:
            env.render()

        if terminal or step > max_timesteps: 
            break

        step += 1

    return stats
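
The EpisodeStats used by Example 2 above (and Example 3 below) is not shown on this page; the snippets only need a step(reward, action_id) accumulator. A minimal sketch of that assumed interface, with illustrative field names:

import numpy as np

class EpisodeStats:
    # minimal accumulator matching the interface assumed above (a sketch, not the original class)
    def __init__(self):
        self.episode_reward = 0.0   # cumulative reward of the episode
        self.actions_ids = []       # every action id taken during the episode

    def step(self, reward, action_id):
        # called once per environment step
        self.episode_reward += reward
        self.actions_ids.append(action_id)

    def get_action_usage(self, action_id):
        # fraction of steps on which the given action was chosen ("action usage")
        ids = np.array(self.actions_ids)
        return float(np.mean(ids == action_id)) if len(ids) > 0 else 0.0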
Example #3
def run_episode(env, agent, deterministic, skip_frames=0,  do_training=True, rendering=False, max_timesteps=1000, history_length=0):
    """
    This methods runs one episode for a gym environment. 
    deterministic == True => agent executes only greedy actions according the Q function approximator (no random actions).
    do_training == True => train agent
    """

    stats = EpisodeStats()

    # Save history
    image_hist = []

    step = 0
    state = env.reset()

    # workaround: without this call, gym can return corrupted states when not rendering
    env.viewer.window.dispatch_events()

    # append image history to first state
    state = state_preprocessing(state)
    image_hist.extend([state] * (history_length + 1))
    state = np.array(image_hist).reshape(96, 96, history_length + 1)
    
    while True:

        # TODO: get action_id from agent
        # Hint: adapt the probabilities of the 5 actions for random sampling so that the agent explores properly. 
        # action_id = agent.act(...)
        # action = your_id_to_action_method(...)

        # Hint: frame skipping might help you to get better results.
        reward = 0
        for _ in range(skip_frames + 1):
            next_state, r, terminal, info = env.step(action)
            reward += r

            if rendering:
                env.render()

            if terminal:
                break

        next_state = state_preprocessing(next_state)
        image_hist.append(next_state)
        image_hist.pop(0)
        next_state = np.array(image_hist).reshape(96, 96, history_length + 1)

        if do_training:
            agent.train(state, action_id, next_state, reward, terminal)

        stats.step(reward, action_id)

        state = next_state
        
        if terminal or (step * (skip_frames + 1)) > max_timesteps:
            break

        step += 1

    return stats
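
The TODO block in the example above is left as an exercise. One possible way to fill it in is sketched below; the discrete action ids and the id_to_action mapping are assumptions about the CarRacing setup, not code from the original repository, and the exploration bias mentioned in the hint would live inside agent.act.

import numpy as np

STRAIGHT, LEFT, RIGHT, ACCELERATE, BRAKE = 0, 1, 2, 3, 4

def id_to_action(action_id):
    # map a discrete action id to the CarRacing control vector [steering, gas, brake]
    return {
        STRAIGHT:   np.array([ 0.0, 0.0, 0.0]),
        LEFT:       np.array([-1.0, 0.0, 0.0]),
        RIGHT:      np.array([ 1.0, 0.0, 0.0]),
        ACCELERATE: np.array([ 0.0, 1.0, 0.0]),
        BRAKE:      np.array([ 0.0, 0.0, 0.2]),
    }[action_id]

# inside the while loop, replacing the TODO:
#     action_id = agent.act(state=state, deterministic=deterministic)
#     action = id_to_action(action_id)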
Example #4
def test_agent(env,
               agent,
               run=0,
               episodes=5,
               time_steps=500,
               initial_state=None,
               initial_noise=None,
               render=True,
               deterministic=True):

    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes),
                         episode_loss=np.zeros(episodes))

    print_header(3, 'Testing')

    for e in range(episodes):

        s = env.reset(initial_state=initial_state,
                      noise_amplitude=initial_noise)

        for t in range(time_steps):

            if render:
                env.render()

            a = agent.get_action(s, deterministic=deterministic)
            s, r, d, _ = env.step(tn(a))

            stats.episode_rewards[e] += r
            stats.episode_lengths[e] = t

            if d:
                break

        pr_stats = {
            'run': run,
            'steps': int(stats.episode_lengths[e] + 1),
            'episode': e + 1,
            'episodes': episodes,
            'reward': stats.episode_rewards[e]
        }
        print_stats(pr_stats)

    if render:
        env.viewer.close()

    return stats
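
Here (and in several later examples) EpisodeStats is constructed with keyword arrays and then indexed directly, so a plain container is enough to reproduce that interface. A sketch under that assumption:

from collections import namedtuple
import numpy as np

# the fields hold mutable numpy arrays, so in-place updates such as
# stats.episode_rewards[e] += r work exactly as in the examples
EpisodeStats = namedtuple("EpisodeStats",
                          ["episode_lengths", "episode_rewards", "episode_loss"])

stats = EpisodeStats(episode_lengths=np.zeros(5),
                     episode_rewards=np.zeros(5),
                     episode_loss=np.zeros(5))
stats.episode_rewards[0] += 1.0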
Example #5
	def train(self, env, episodes, time_steps, initial_state=None, initial_noise=0.5):

		stats = EpisodeStats(episode_lengths=np.zeros(episodes), episode_rewards=np.zeros(episodes),
							 episode_loss=np.zeros(episodes))

		self._run += 1

		for e in range(episodes):

			s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)

			for t in range(time_steps):

				a = self._actor.get_action(s, deterministic=False)
				ns, r, d, _ = env.step(tn(a))

				stats.episode_rewards[e] += r
				stats.episode_lengths[e] = t

				self._steps += 1
				self._replay_buffer.add_transition(s, a, ns, r, d)

				# Sample replay buffer
				b_states, b_actions, b_nstates, b_rewards, b_terminal = self._replay_buffer.random_next_batch(self._batch_size)

				# Get action according to target actor policy
				b_nactions = self._actor_target.get_action(b_nstates, deterministic=False)

				# Compute the target Q value from target critic
				target_Q1, target_Q2 = self._critic_target(b_nstates, b_nactions)
				target_Q = torch.min(target_Q1, target_Q2).reshape((-1))
				target_Q = b_rewards + (1 - b_terminal) * self._gamma * target_Q
				target_Q = target_Q.reshape((-1, 1)).detach()

				# Get current Q estimates from critic
				current_Q1, current_Q2 = self._critic(b_states, b_actions)

				# Compute critic loss
				critic_loss = self._critic_loss(current_Q1, target_Q) + self._critic_loss(current_Q2, target_Q)

				stats.episode_loss[e] += critic_loss.item()

				# Optimize the critic
				self._critic_optimizer.zero_grad()
				critic_loss.backward()
				self._critic_optimizer.step()

				# Delayed policy updates
				if self._steps % self._policy_freq == 0:

					# Compute actor losses by the deterministic policy gradient
					actor_loss = -self._critic.Q1(b_states, self._actor.get_action(b_states, deterministic=True)).mean()

					# Optimize the actor
					self._actor_optimizer.zero_grad()
					actor_loss.backward()
					self._actor_optimizer.step()

					# Soft-Update the target models
					soft_update(self._critic_target, self._critic, self._tau)
					soft_update(self._actor_target, self._actor, self._tau)

				if d:
					break
				s = ns

			pr_stats = {'run': self._run, 'steps': int(stats.episode_lengths[e] + 1),
						'episode': e + 1, 'episodes': episodes,
						'reward': stats.episode_rewards[e], 'loss': stats.episode_loss[e]}
			print_stats(pr_stats)

		return stats
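
Example 5 relies on a soft_update helper to Polyak-average the target networks. A minimal PyTorch sketch of such a helper, assuming the usual theta_target <- tau * theta_source + (1 - tau) * theta_target update:

import torch

def soft_update(target, source, tau):
    # blend the source parameters into the target network in place
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.mul_(1.0 - tau).add_(tau * s_param.data)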
Example #6
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)
        bestscore = 0
        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        fig = plt.figure()
        ax = fig.add_subplot(111)
        x, y = [0], [0]

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()
            episode_stats = EpisodeStats(self.n_steps, self.n_envs)
            t_first_start = time.time()
            n_updates = total_timesteps // self.n_batch

            callback.on_training_start(locals(), globals())

            for update in range(1, n_updates + 1):
                assert self.n_batch % self.nminibatches == 0, (
                    "The number of minibatches (`nminibatches`) "
                    "is not a factor of the total number of samples "
                    "collected per rollout (`n_batch`), "
                    "some samples won't be used.")
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)

                callback.on_rollout_start()
                # true_reward is the reward without discount
                rollout = self.runner.run(callback)
                # Unpack
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout
                callback.update_locals(locals())
                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break
                episode_stats.feed(true_reward, masks)
                self.ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = max(
                        self.n_batch // self.nminibatches // self.noptepochs,
                        1)
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (epoch_num * self.n_batch + start) //
                                batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    writer=writer,
                                    update=timestep,
                                    cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = max(
                        self.n_batch // self.nminibatches // self.noptepochs //
                        self.n_steps, 1)
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (epoch_num * self.n_envs + start) //
                                envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    update=timestep,
                                    writer=writer,
                                    states=mb_states,
                                    cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose == 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    #if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                    #logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                    #logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                    logger.logkv("mean_episode_length",
                                 episode_stats.mean_length())
                    logger.logkv("mean_episode_reward",
                                 episode_stats.mean_reward())
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()
                if self.verbose == 2 and (
                        update % log_interval == 0 or update
                        == 1) and episode_stats.mean_reward() > bestscore:
                    bestscore = episode_stats.mean_reward()
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    logger.logkv("mean_episode_reward", bestscore)
                    logger.dumpkvs()
                    x.append(self.num_timesteps)
                    y.append(bestscore)
                    ax.plot(x, y, marker='.', color='b')
                    fig.canvas.draw()
                    ax.set_xlim(left=0, right=total_timesteps)
                    ax.set(title='Street Fighter 2 AI - PPO2 Algorithm',
                           ylabel='Fitness score',
                           xlabel='Timesteps')
                    fig.show()
                    plt.pause(0.001)
            callback.on_training_end()
            return self
Example #7
def learn(seed,
          policy,
          env,
          nsteps,
          total_timesteps,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.1,
          next_n=10,
          nslupdates=10,
          seq_len=10,
          ext_coef=1,
          int_coef=0.1,
          K=10):

    rng = np.random.RandomState(seed)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    loc_space = 2
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nbatch_sl_train = nenvs * seq_len // nminibatches

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               loc_space=loc_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nbatch_sl_train=nbatch_sl_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               seq_len=seq_len,
                               seed=seed)
    model = make_model()

    replay_buffer = Buffer(max_size=1000, seed=seed)
    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    next_n=next_n,
                    seq_len=seq_len,
                    int_coef=int_coef,
                    ext_coef=ext_coef,
                    replay_buffer=replay_buffer,
                    seed=seed)
    episode_raw_stats = EpisodeStats(nsteps, nenvs)
    episode_stats = EpisodeStats(nsteps, nenvs)
    tfirststart = time.time()
    nupdates = total_timesteps // nbatch
    sl_acc = 0
    p = 0
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        p = update * nbatch / (total_timesteps * 0.875)
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        obs, locs, goals, raw_rewards, rewards, returns, masks, rnn_masks, actions, values, neglogpacs, states = runner.run(
            K, p)
        episode_raw_stats.feed(raw_rewards, masks)
        episode_stats.feed(rewards, masks)
        mblossvals = []
        assert nenvs % nminibatches == 0
        envsperbatch = nenvs // nminibatches
        envinds = np.arange(nenvs)
        flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
        envsperbatch = nbatch_train // nsteps
        for _ in range(noptepochs):
            rng.shuffle(envinds)
            for start in range(0, nenvs, envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                mbflatinds = flatinds[mbenvinds].ravel()
                slices = (arr[mbflatinds]
                          for arr in (obs, locs, goals, returns, rnn_masks,
                                      actions, values, neglogpacs))
                mbstates = states[mbenvinds]
                mblossvals.append(model.train(lr, cliprange, *slices,
                                              mbstates))

        if nslupdates > 0 and sl_acc < 0.75:
            sl_acc, sl_loss = sl_train(model,
                                       replay_buffer,
                                       nslupdates=nslupdates,
                                       seq_len=seq_len,
                                       nenvs=nenvs,
                                       envsperbatch=envsperbatch,
                                       lr=lr)
        elif nslupdates > 0:
            sl_acc, sl_loss = sl_train(model,
                                       replay_buffer,
                                       nslupdates=1,
                                       seq_len=seq_len,
                                       nenvs=nenvs,
                                       envsperbatch=envsperbatch,
                                       lr=lr)

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        logger.logkv("serial_timesteps", update * nsteps)
        logger.logkv("nupdates", update)
        logger.logkv("total_timesteps", update * nbatch)
        logger.logkv("fps", fps)
        logger.logkv('episode_raw_reward', episode_raw_stats.mean_reward())
        logger.logkv('imitation_episode_reward',
                     np.mean(runner.recent_imitation_rewards))
        logger.logkv('episode_reward', episode_stats.mean_reward())
        logger.logkv('episode_success_ratio',
                     np.mean(runner.recent_success_ratio))
        logger.logkv('time_elapsed', tnow - tfirststart)
        if nslupdates > 0:
            logger.logkv('sl_loss', sl_loss)
            logger.logkv('sl_acc', sl_acc)
        logger.logkv('replay_buffer_num', replay_buffer.num_episodes())
        logger.logkv('replay_buffer_best', replay_buffer.max_reward())
        if noptepochs > 0:
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
        logger.dumpkvs()
        print(logger.get_dir())
    env.close()
    return model
Example #8
class MainTrainingLoop:
    """ Resembles ray.Trainable """
    @ex.capture
    def __init__(self, *, task_name):
        logger.info(f"Executing training...")

        tmp_env = get_env(record=False)
        self.is_done = tmp_env.unwrapped.is_done
        self.eval_tasks = {task_name: tmp_env.tasks()[task_name]}
        self.exploitation_task = tmp_env.tasks()[task_name]
        del tmp_env

        # Constitute the state of Trainable
        ex.step_i = 0
        self.model = get_model()
        self.reward_model = get_reward_model()
        self.model_optimizer = get_model_optimizer(self.model.parameters())
        self.reward_model_optimizer = get_reward_model_optimizer(
            self.reward_model.parameters())
        self.buffer = get_buffer()
        self.agent = get_agent(mode='train')
        self.agent.setup_normalizer(self.buffer.normalizer)
        self.stats = EpisodeStats(self.eval_tasks)
        self.last_avg_eval_score = None
        self.neptune_ex = None
        ex.mlog = None

        # Not considered part of the state
        self.new_experiment = True  # I need to know if I had to create a new experiment (neptune) or continue an old one
        self.random_agent = get_random_agent()

        self._common_setup()

    @ex.capture
    def _common_setup(self, *, render, record, dump_dir, _run):
        """ Called in __init__ but needs also to be called after restore (due to reinitialized randomness) """
        video_file_base = dump_dir + "/max_exploitation_step_{}.mp4" if dump_dir is not None else None
        self.env_loop = EnvLoop(get_env,
                                render=render,
                                record=record,
                                video_file_base=video_file_base,
                                run=_run)

    def _setup_if_new(self):
        """ Executed for a new experiment only. This is a workaround for Trainable. """
        if self.new_experiment:
            self.new_experiment = False
            self.neptune_ex = get_neptune_ex()
            ex.mlog = MetricLogger(ex, self.neptune_ex)

    @ex.capture
    def train(self, *, device, n_total_steps, n_warm_up_steps, record_freq,
              record, model_training_freq, policy_training_freq, eval_freq,
              task_name, model_training_n_batches, train_reward):
        """ A single step of interaction with the environment. """
        self._setup_if_new()

        ex.step_i += 1

        behavioral_agent = self.random_agent if ex.step_i <= n_warm_up_steps else self.agent
        with torch.no_grad():
            action = behavioral_agent.get_action(self.env_loop.state,
                                                 deterministic=False).to('cpu')
        prev_state = self.env_loop.state.clone().to(device)
        if record and (ex.step_i == 1 or ex.step_i % record_freq == 0):
            self.env_loop.record_next_episode()
        state, next_state, done = self.env_loop.step(
            to_np(action), video_file_suffix=ex.step_i)
        reward = self.exploitation_task(state, action, next_state).item()
        self.buffer.add(state, action, next_state,
                        torch.from_numpy(np.array([[reward]], dtype=np.float64)))
        self.stats.add(state, action, next_state, done)
        if done:
            log_last_episode(self.stats)

        tasks_rewards = {
            f'{task_name}': self.stats.get_recent_reward(task_name)
            for task_name in self.eval_tasks
        }
        step_stats = dict(
            step=ex.step_i,
            done=done,
            action_abs_mean=action.abs().mean().item(),
            reward=self.exploitation_task(state, action, next_state).item(),
            action_value=self.agent.get_action_value(prev_state,
                                                     action).item(),
        )
        ex.mlog.add_scalars('main_loop', {**step_stats, **tasks_rewards})

        # (Re)train the model on the current buffer
        if model_training_freq is not None and model_training_n_batches > 0 and ex.step_i % model_training_freq == 0:
            self.model.setup_normalizer(self.buffer.normalizer)
            self.reward_model.setup_normalizer(self.buffer.normalizer)
            timed(train_model)(self.model,
                               self.model_optimizer,
                               self.buffer,
                               mode='train')
            if train_reward:
                task = self.exploitation_task
                timed(train_reward_model)(self.reward_model,
                                          self.reward_model_optimizer,
                                          self.buffer,
                                          mode='train',
                                          task=task)

        # (Re)train the policy using current buffer and model
        if ex.step_i >= n_warm_up_steps and ex.step_i % policy_training_freq == 0:
            task = self.exploitation_task
            self.agent.setup_normalizer(self.buffer.normalizer)
            self.agent = timed(train_agent)(self.agent,
                                            self.model,
                                            self.reward_model,
                                            self.buffer,
                                            task=task,
                                            task_name=task_name,
                                            is_done=self.is_done,
                                            mode='train',
                                            context_i={})

        # Evaluate the agent
        if eval_freq is not None and ex.step_i % eval_freq == 0:
            self.last_avg_eval_score = evaluate_on_tasks(agent=self.agent,
                                                         model=self.model,
                                                         buffer=self.buffer,
                                                         task_name=task_name,
                                                         context='eval')

        experiment_finished = ex.step_i >= n_total_steps
        return DotMap(
            done=experiment_finished,
            avg_eval_score=self.last_avg_eval_score,
            action_abs_mean=action.abs().mean().item(
            ),  # This is just for regression tests
            step_i=ex.step_i)

    def stop(self):
        self.env_loop.close()
        if ex.mlog is not None:
            ex.mlog.save_artifacts()
            if ex.mlog.neptune_ex is not None:
                logger.info("Stopping neptune...")
                ex.mlog.neptune_ex.stop()
Example #9
    def train(self,
              env,
              episodes,
              time_steps,
              initial_state=None,
              initial_noise=0.5):

        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes),
                             episode_loss=np.zeros(episodes))

        self._run += 1

        for e in range(episodes):
            # Generate an episode.
            # An episode is an array of (state, action, reward) tuples
            episode = []
            s = env.reset(initial_state=initial_state,
                          noise_amplitude=initial_noise)

            total_r = 0
            for t in range(time_steps):
                a = self._get_action(s)
                ns, r, d, _ = env.step(tn(self._action_fun.act2env(a)))

                stats.episode_rewards[e] += r
                stats.episode_lengths[e] = t

                episode.append((s, a, r))

                total_r += r

                if d:
                    break
                s = ns

            gamma_t = 1
            for t in range(len(episode)):
                # Find the first occurrence of the state in the episode
                s, a, r = episode[t]

                g = 0
                gamma_kt = 1
                for k in range(t, len(episode)):
                    gamma_kt = gamma_kt * self._gamma
                    _, _, r_k = episode[k]
                    g = g + (gamma_kt * r_k)

                g = float(g)

                p = self._pi(s, a)

                # For Numerical Stability, in order to not get probabilities higher than one (e.g. delta distribution)
                # and to not return a probability equal to 0 because of the log in the score_function
                eps = 1e-8
                p = p.clamp(eps, 1)

                log_p = torch.log(p)

                gamma_t = gamma_t * self._gamma

                if self._baseline:
                    bl = self.baseline_fun(s)
                    delta = g - bl

                    bl_loss = self._bl_loss_function(self.baseline_fun(s),
                                                     tt([g]))

                    self._bl_optimizer.zero_grad()
                    bl_loss.backward()
                    self._bl_optimizer.step()

                    score_fun = torch.mean(-(gamma_t * delta) * log_p)
                else:
                    score_fun = torch.mean(-(gamma_t * g) * log_p)

                stats.episode_loss[e] += score_fun.item()

                self._pi_optimizer.zero_grad()
                score_fun.backward()
                self._pi_optimizer.step()

            pr_stats = {
                'run': self._run,
                'steps': int(stats.episode_lengths[e] + 1),
                'episode': e + 1,
                'episodes': episodes,
                'reward': stats.episode_rewards[e],
                'loss': stats.episode_loss[e]
            }
            print_stats(pr_stats)

        return stats
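
The inner loop over k above recomputes the discounted return from scratch at every time step, which is O(T^2) per episode; a single backward pass yields all returns in O(T). Note that the original loop advances gamma_kt before adding the first reward, so its g carries one extra factor of gamma compared with the conventional G_t computed in this sketch:

def discounted_returns(rewards, gamma):
    # G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ..., computed back to front
    returns = [0.0] * len(rewards)
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g
        returns[t] = g
    return returns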
Example #10
def learn(policy,
          env,
          seed,
          ob_space,
          ac_space,
          save_name,
          nsteps=5,
          total_timesteps=int(80e6),
          vf_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          lrschedule='linear',
          epsilon=1e-5,
          alpha=0.99,
          gamma=0.99,
          log_interval=100):
    set_global_seeds(seed)

    nenvs = env.num_envs
    #ob_space = env.observation_space
    #ac_space = env.action_space
    save_dir = './model/' + save_name + '.ckt'
    summary_dir = './summary/' + save_name

    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nenvs=nenvs,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  alpha=alpha,
                  epsilon=epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule,
                  summary_dir=summary_dir)
    runner = Runner(env, model, ob_space=ob_space, nsteps=nsteps, gamma=gamma)

    nbatch = nenvs * nsteps
    tstart = time.time()
    train_writer = model.train_writer

    episode_stats = EpisodeStats(nsteps, nenvs)
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, raw_rewards = runner.run(
        )
        episode_stats.feed(raw_rewards, masks)
        mean_reward = episode_stats.mean_reward()
        mean_reward = np.asarray(mean_reward, dtype=np.float32)

        policy_loss, value_loss, policy_entropy, summary = model.train(
            obs, states, mean_reward, rewards, masks, actions, values)
        train_writer.add_summary(summary, update * nbatch)

        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("episode_reward",
                                  episode_stats.mean_reward())
            logger.record_tabular("episode_length",
                                  episode_stats.mean_length())
            logger.dump_tabular()
            model.save(save_dir)
    env.close()
    return model
Example #11
def learn(policy,
          env,
          seed,
          nsteps=5,
          nstack=4,
          total_timesteps=int(80e6),
          vf_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          lrschedule='linear',
          epsilon=1e-5,
          alpha=0.99,
          gamma=0.99,
          log_interval=100,
          max_episode_length=None,
          optimizer=None):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nenvs=nenvs,
                  nsteps=nsteps,
                  nstack=nstack,
                  num_procs=num_procs,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  alpha=alpha,
                  epsilon=epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule,
                  optimizer=optimizer)
    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)

    stats = EpisodeStats(nsteps, nenvs, maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    for update in itertools.count():
        obs, states, rewards, masks, actions, values = runner.run()
        total_loss, policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        stats.feed(rewards, masks)

        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("total_loss", float(total_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("mean_episode_length", stats.mean_length())
            logger.record_tabular("mean_episode_reward", stats.mean_reward())

            logger.dump_tabular()

            if max_episode_length and stats.mean_length(
            ) >= max_episode_length:
                break
    env.close()
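
Examples 6, 7, 10 and 11 all use the EpisodeStats(nsteps, nenvs) variant with feed(), mean_reward() and mean_length(). A sketch of such a helper, modeled on the OpenAI Baselines A2C utility (the default rolling-buffer length is an assumption):

from collections import deque
import numpy as np

class EpisodeStats:
    def __init__(self, nsteps, nenvs, maxlen=40):
        self.nsteps, self.nenvs = nsteps, nenvs
        self.episode_rewards = [[] for _ in range(nenvs)]  # per-env rewards of the running episode
        self.lenbuffer = deque(maxlen=maxlen)              # rolling buffer of finished episode lengths
        self.rewbuffer = deque(maxlen=maxlen)              # rolling buffer of finished episode rewards

    def feed(self, rewards, masks):
        rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
        masks = np.reshape(masks, [self.nenvs, self.nsteps])
        for i in range(self.nenvs):
            for j in range(self.nsteps):
                self.episode_rewards[i].append(rewards[i, j])
                if masks[i, j]:  # episode boundary: flush into the rolling buffers
                    self.lenbuffer.append(len(self.episode_rewards[i]))
                    self.rewbuffer.append(sum(self.episode_rewards[i]))
                    self.episode_rewards[i] = []

    def mean_length(self):
        return float(np.mean(self.lenbuffer)) if self.lenbuffer else 0.0

    def mean_reward(self):
        return float(np.mean(self.rewbuffer)) if self.rewbuffer else 0.0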
Example #12
    def train(self,
              env,
              episodes,
              time_steps,
              initial_state=None,
              initial_noise=0.5):

        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes),
                             episode_loss=np.zeros(episodes))

        self._run += 1

        for e in range(episodes):

            s = env.reset(initial_state=initial_state,
                          noise_amplitude=initial_noise)
            total_r = 0

            # Step policy for advancing the scheduler
            epsilon = self._pi.epsilon()
            # print("\t\t\tStep: {:5d} Epsilon: {:6.5f}".format(t, epsilon))
            self._pi.step()

            for t in range(time_steps):

                a = self._get_action(s)
                ns, r, d, _ = env.step(self._action_fun.act2env(a))

                stats.episode_rewards[e] += r
                stats.episode_lengths[e] = t

                total_r += r

                if self._use_rbuffer:
                    self._replay_buffer.add_transition(s, a, ns, r, d)
                    b_states, b_actions, b_nstates, b_rewards, b_terminal = self._replay_buffer.random_next_batch(
                        self._batch_size)
                    dim = 1
                else:
                    b_states = s
                    b_actions = a
                    b_nstates = ns
                    b_rewards = r
                    b_terminal = d
                    dim = 0

                if self._doubleQ:

                    # Q-values from the next states [Q], used only to determine the optimal next actions
                    q_nstates = self._q(b_nstates)
                    # Optimal Action Prediction  [Q]
                    nactions = torch.argmax(q_nstates, dim=dim)
                    if self._use_rbuffer:
                        nactions = [
                            torch.arange(self._batch_size).long(), nactions
                        ]

                    # Q-Values from [Q_target] function using the action indices from [Q] function
                    q_target_nstates = self._q_target(b_nstates)[nactions]

                else:
                    q_target_nstates = self._q_target(b_nstates)
                    # torch.max(..., dim=...) returns (values, indices); keep only the values
                    q_target_nstates = torch.max(q_target_nstates, dim=dim)[0]

                target_prediction = b_rewards + (
                    1 - b_terminal) * self._gamma * q_target_nstates

                if self._use_rbuffer:
                    q_actions = [
                        torch.arange(self._batch_size).long(),
                        b_actions.long()
                    ]
                else:
                    q_actions = b_actions

                current_prediction = self._q(b_states)[q_actions]

                loss = self._loss_function(current_prediction,
                                           target_prediction.detach())

                stats.episode_loss[e] += loss.item()

                self._q_optimizer.zero_grad()
                loss.backward()
                self._q_optimizer.step()

                soft_update(self._q_target, self._q, self._tau)

                if d:
                    break
                s = ns

            pr_stats = {
                'run': self._run,
                'steps': int(stats.episode_lengths[e] + 1),
                'episode': e + 1,
                'episodes': episodes,
                'reward': stats.episode_rewards[e],
                'loss': stats.episode_loss[e]
            }
            print_stats(pr_stats, ', Epsilon: {:6.5f}'.format(epsilon))

        return stats