コード例 #1
0
def populate_replay_buffer(replay_buffer, action_sampler, env):
    """Play one episode and store every transition in the replay buffer.

    Args:
        replay_buffer: object exposing
            ``push(state, action, next_state, done, reward)``.
        action_sampler: callable mapping a preprocessed state to a vector of
            action probabilities.
        env: environment exposing ``reset()`` and ``execute(action=...)``;
            the latter returns ``(next_state, reward, done)``.
    """
    print("Populating replay memory...")
    state = StatePreprocessor.process(env.reset())
    step = 0
    done = False
    while not done:
        # Draw an action index according to the sampler's distribution.
        action_probs = action_sampler(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        print("Step {step} state: {state}, action: {action}.".format(
            step=step, state=state, action=action))

        raw_next_state, reward, done = env.execute(action=action)
        next_state = StatePreprocessor.process(raw_next_state)
        replay_buffer.push(state, action, next_state, done, reward)

        state = next_state
        step += 1
コード例 #2
0
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    num_steps,
                    experiment_dir,
                    replay_memory_size=5000,
                    update_target_estimator_every=500,
                    discount_factor=0.999,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=10000,
                    update_q_values_every=4,
                    batch_size=32,
                    restore=True):
    """Train a Q-network with Double DQN, prioritized replay and shaping.

    Runs episodes until the step counter reaches ``num_steps``. Rewards are
    shaped with potentials from a ``ReplayRewardShaper`` loaded from
    ``'../replays/'``.

    Args:
        sess: TensorFlow session used to restore, save, predict and update.
        env: environment exposing ``reset()`` and ``execute(action=...)``;
            the latter returns ``(next_state, reward, done)``.
        q_estimator: online estimator providing ``predict(sess, states)`` and
            ``update(sess, states, actions, targets)``.
        target_estimator: estimator whose ``predict`` supplies the Q-values
            used in the training targets.
        num_steps: training returns once the step counter reaches this value.
        experiment_dir: directory under which the ``checkpoints`` and
            ``rewards`` subdirectories are created; also passed to the replay
            buffer as ``save_dir``.
        replay_memory_size: capacity passed to ``PrioritizedReplayBuffer``.
        update_target_estimator_every: copy the online parameters into the
            target estimator every this many steps.
        discount_factor: discount applied to future values and to the
            reported episode reward.
        epsilon_start: first value of the linear epsilon schedule.
        epsilon_end: last value of the linear epsilon schedule.
        epsilon_decay_steps: number of points in the epsilon schedule; the
            last value is used for all later steps.
        update_q_values_every: perform a gradient update every this many
            steps.
        batch_size: number of transitions sampled per update.
        restore: when True, resume the episode counter from ``reward_dir``
            and restore the latest checkpoint if one exists.

    Returns:
        None.
    """
    # Create directories for checkpoints and summaries. exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    reward_dir = os.path.join(experiment_dir, "rewards")
    os.makedirs(reward_dir, exist_ok=True)
    reward_writer = tf.summary.FileWriter(reward_dir)

    starting_episode = 0

    saver = tf.train.Saver()
    if restore:
        starting_episode = persistence.get_last_episode(reward_dir)
        # Load a previous checkpoint if we find one
        latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
        if latest_checkpoint:
            print("Loading model checkpoint {}...\n".format(latest_checkpoint))
            saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.train.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    replay_buffer = PrioritizedReplayBuffer(replay_memory_size,
                                            alpha=0.6,
                                            beta0=0.4,
                                            save_dir=experiment_dir)

    reward_shaper = ReplayRewardShaper('../replays/')
    reward_shaper.load()

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, reward_shaper,
                                        ACTION_SPACE)

    # Populate the replay memory with initial experience
    action_sampler = lambda state: policy(
        sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)])
    populate_replay_buffer(replay_buffer, action_sampler, env)

    print('Training is starting...')
    # Training the agent
    for i_episode in itertools.count(starting_episode):
        episode_reward = 0
        multiplier = 1

        # Save the current checkpoint. Use the session that was passed in
        # (the same one used for restore) instead of relying on a default
        # session being installed by the caller.
        saver.save(sess, checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = StatePreprocessor.process(state)
        done = False

        # One step in the environment
        for t in itertools.count():
            if total_t >= num_steps:
                # Make sure buffered summaries reach disk before returning.
                reward_writer.flush()
                return

            eps = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # An episode also ends when the preprocessed state does not have
            # the expected length.
            if done or len(state) != STATE_SPACE:
                print("Finished episode with reward", episode_reward)
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag="rewards",
                                     simple_value=episode_reward)
                ])
                reward_writer.add_summary(summary, i_episode)
                summary = tf.Summary(
                    value=[tf.Summary.Value(tag="eps", simple_value=eps)])
                reward_writer.add_summary(summary, i_episode)
                break

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Take a step
            action_probs = policy(sess, state, eps)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done = env.execute(action=action)
            next_state = StatePreprocessor.process(next_state)

            # Track the discounted return of the episode for reporting.
            episode_reward += reward * multiplier
            multiplier *= discount_factor

            # Save transition to replay memory
            replay_buffer.push(state, action, next_state, done, reward)

            if total_t % update_q_values_every == 0:
                # Sample a minibatch from the replay memory
                samples, idx = replay_buffer.sample(batch_size)
                states, actions, next_states, dones, rewards, _ = map(
                    np.array, zip(*samples))

                # Calculate q values and targets (Double DQN). The online
                # network, biased by the shaping potentials, picks the next
                # actions; the target network evaluates them.
                next_q_values = q_estimator.predict(sess, next_states)
                for i in range(batch_size):
                    # A dedicated loop name keeps the executed `action`
                    # above from being overwritten.
                    for candidate_action in range(ACTION_SPACE):
                        next_q_values[i][
                            candidate_action] += reward_shaper.get_potential(
                                next_states[i], candidate_action)
                next_actions = np.argmax(next_q_values, axis=1)

                next_q_values_target = target_estimator.predict(
                    sess, next_states)
                not_dones = np.invert(dones).astype(np.float32)

                # reward + shaping term (gamma * phi(s', a') - phi(s, a))
                # + discounted bootstrap value for non-terminal transitions.
                targets = (
                    rewards + discount_factor *
                    reward_shaper.get_potentials(next_states, next_actions) -
                    reward_shaper.get_potentials(states, actions) +
                    discount_factor * not_dones *
                    next_q_values_target[np.arange(batch_size), next_actions])

                # Perform gradient descent update
                predictions = q_estimator.update(sess, states, actions,
                                                 targets)

                # Update transition priorities
                priors = np.abs(predictions - targets) + EPS_PRIORITY
                replay_buffer.update_priorities(idx, priors)

            print("\rStep {}, episode {} ({}/{})".format(
                t, i_episode, total_t, num_steps),
                  end="\t")
            sys.stdout.flush()

            state = next_state
            total_t += 1
コード例 #3
0
def learn(env,
          network,
          seed=None,
          pool=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_initial_eps=1.0,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=100,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          experiment_name='unnamed',
          load_path=None,
          **network_kwargs):
    """Train a deepq model.

    Each iteration plays one full episode and then performs as many learning
    steps as environment steps were taken in that episode.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    pool: object with a `map(func, iterable)` method, or None
        used to compute the reward shaper's action potentials for sampled
        batches (presumably a multiprocessing pool). If None, the potentials
        are computed serially in this process.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_initial_eps: float
        initial value of random action probability
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often (in episodes) to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often (in episodes) to save the model and the replay buffer. This
        is so that the best version is restored at the end of the training.
        If you do not wish to restore the best version at the end of the
        training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
        (accepted for interface compatibility; this implementation does not read it)
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> bool
        function called at every environment step with state of the algorithm.
        If callback returns true training stops.
    experiment_name: str
        name of the experiment (default: 'unnamed')
    load_path: str
        path to load the model from; only used when no `best_model` file
        exists in the experiment's checkpoint directory. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from
    # exploration_initial_eps.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=exploration_initial_eps,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    reward_shaper = ActionAdviceRewardShaper('../completed-observations')
    reward_shaper.load()

    def batch_action_potentials(observations):
        # `pool` defaults to None, so fall back to a serial computation
        # instead of failing on `None.map`.
        if pool is None:
            return [
                reward_shaper.get_action_potentials(observation)
                for observation in observations
            ]
        return pool.map(reward_shaper.get_action_potentials, observations)

    full_exp_name = '{}-{}'.format(date.today().isoformat(), experiment_name)
    experiment_dir = os.path.join('experiments', full_exp_name)
    os.makedirs(experiment_dir, exist_ok=True)

    summary_dir = os.path.join(experiment_dir, 'summaries')
    os.makedirs(summary_dir, exist_ok=True)
    summary_writer = tf.summary.FileWriter(summary_dir)

    checkpoint_dir = os.path.join(experiment_dir, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as td:
        # checkpoint_dir is always a non-empty path, so models are written to
        # the experiment's checkpoint directory rather than the temp dir.
        td = checkpoint_dir or td

        os.makedirs(td, exist_ok=True)
        model_file = os.path.join(td, "best_model")
        model_saved = False
        saved_mean_reward = None

        if os.path.exists(model_file):
            print('Model is loading')
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))

        episode_rewards = []
        update_step_t = 0
        # Set when the callback asks to stop; a bare `break` would only leave
        # the episode loop and training would carry on with a new episode.
        stop_requested = False
        while update_step_t < total_timesteps:
            # Reset the environment
            obs = env.reset()
            obs = StatePreprocessor.process(obs)
            episode_rewards.append(0.0)
            reset = True
            done = False
            # Sample the episode until it is completed
            act_step_t = update_step_t
            while not done:
                if callback is not None and callback(locals(), globals()):
                    stop_requested = True
                    break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not param_noise:
                    update_eps = exploration.value(act_step_t)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(act_step_t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(act_step_t) +
                        exploration.value(act_step_t) /
                        float(env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                biases = reward_shaper.get_action_potentials(obs)
                action = act(np.array(obs)[None],
                             biases,
                             update_eps=update_eps,
                             **kwargs)[0]
                reset = False

                # env.step returns a sequence of (action, step_result) pairs;
                # only the last one is used.
                pairs = env.step(action)
                action, (new_obs, rew, done, _) = pairs[-1]
                # Write down the real reward but learn from normalized version
                episode_rewards[-1] += rew
                rew = np.sign(rew) * np.log(1 + np.abs(rew))
                new_obs = StatePreprocessor.process(new_obs)

                logger.log('{}/{} obs {} action {}'.format(
                    act_step_t, total_timesteps, obs, action))
                act_step_t += 1
                if len(new_obs) == 0:
                    # An empty preprocessed observation ends the episode and
                    # the transition is not stored.
                    done = True
                else:
                    replay_buffer.add(obs, action, rew, new_obs, float(done))
                    obs = new_obs
            if stop_requested:
                break
            # Post episode logging
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="rewards",
                                 simple_value=episode_rewards[-1])
            ])
            summary_writer.add_summary(summary, act_step_t)
            summary = tf.Summary(
                value=[tf.Summary.Value(tag="eps", simple_value=update_eps)])
            summary_writer.add_summary(summary, act_step_t)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="episode_steps",
                                 simple_value=act_step_t - update_step_t)
            ])
            summary_writer.add_summary(summary, act_step_t)
            mean_5ep_reward = round(np.mean(episode_rewards[-5:]), 1)
            num_episodes = len(episode_rewards)
            if print_freq is not None and num_episodes % print_freq == 0:
                logger.record_tabular("steps", act_step_t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 5 episode reward", mean_5ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(act_step_t)))
                logger.dump_tabular()
            # Do the learning: catch up with the steps taken in this episode.
            start = time.time()
            while update_step_t < min(act_step_t, total_timesteps):
                if update_step_t % train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if prioritized_replay:
                        experience = replay_buffer.sample(
                            batch_size,
                            beta=beta_schedule.value(update_step_t))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                            batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    biases_t = batch_action_potentials(obses_t)
                    biases_tp1 = batch_action_potentials(obses_tp1)
                    td_errors, weighted_error = train(obses_t, biases_t,
                                                      actions, rewards,
                                                      obses_tp1, biases_tp1,
                                                      dones, weights)

                    # Loss logging
                    summary = tf.Summary(value=[
                        tf.Summary.Value(tag='weighted_error',
                                         simple_value=weighted_error)
                    ])
                    summary_writer.add_summary(summary, update_step_t)

                    if prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + prioritized_replay_eps
                        replay_buffer.update_priorities(
                            batch_idxes, new_priorities)
                if update_step_t % target_network_update_freq == 0:
                    # Update target network periodically.
                    update_target()
                update_step_t += 1
            stop = time.time()
            logger.log("Learning took {:.2f} seconds".format(stop - start))
            if checkpoint_freq is not None and num_episodes % checkpoint_freq == 0:
                # Periodically save the model and the replay buffer
                rec_model_file = os.path.join(
                    td, "model_{}_{:.2f}".format(num_episodes,
                                                 mean_5ep_reward))
                save_variables(rec_model_file)
                buffer_file = os.path.join(
                    td, "buffer_{}_{}".format(num_episodes, update_step_t))
                with open(buffer_file, 'wb') as foutput:
                    cloudpickle.dump(replay_buffer, foutput)
                # Check whether it is best
                if saved_mean_reward is None or mean_5ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_5ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_5ep_reward

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            load_variables(model_file)

    return act
コード例 #4
0
ファイル: deepq.py プロジェクト: niksaz/dota2-expert-demo
def do_agent_exploration(updates_queue: multiprocessing.Queue,
                         q_func_vars_trained_queue: multiprocessing.Queue,
                         network, seed, config, lr, total_timesteps,
                         learning_starts, buffer_size, exploration_fraction,
                         exploration_initial_eps, exploration_final_eps,
                         train_freq, batch_size, print_freq, checkpoint_freq,
                         gamma, target_network_update_freq, prioritized_replay,
                         prioritized_replay_alpha, prioritized_replay_beta0,
                         prioritized_replay_beta_iters, prioritized_replay_eps,
                         experiment_name, load_path, network_kwargs):
    """Run the acting side of training and stream transitions to a learner.

    Plays episodes in a ``DotaEnvironment``. Each stored transition is put on
    ``updates_queue``; before the first episode and after every episode the
    function requests the trained Q-function weights and copies the values
    received on ``q_func_vars_trained_queue`` into its local acting network.
    When ``total_timesteps`` is reached a finish message is put on
    ``updates_queue``.

    Args:
        updates_queue: queue receiving ``UpdateMessage`` objects (transition
            updates, weight requests and the final finish message).
        q_func_vars_trained_queue: queue from which the trained Q-function
            variable values are read after each weight request.
        network: network spec forwarded to ``build_q_func``.
        seed: forwarded to ``set_global_seeds``.
        config: configuration forwarded to ``ActionAdviceRewardShaper``.
        lr: learning rate of the Adam optimizer passed to ``build_train``.
        total_timesteps: number of stored environment steps to act for.
        exploration_fraction: fraction of ``total_timesteps`` over which
            epsilon is annealed.
        exploration_initial_eps: initial epsilon.
        exploration_final_eps: final epsilon.
        print_freq: log tabular stats every this many episodes; None disables
            it.
        checkpoint_freq: save the model every this many episodes; None
            disables it.
        gamma: discount factor passed to ``build_train``.
        experiment_name: suffix of the experiment directory name.
        network_kwargs: dict of keyword arguments for ``build_q_func``.
        learning_starts, buffer_size, train_freq, batch_size,
        target_network_update_freq, prioritized_replay,
        prioritized_replay_alpha, prioritized_replay_beta0,
        prioritized_replay_beta_iters, prioritized_replay_eps, load_path:
            not read by this function (presumably kept so the signature
            matches the learner's -- TODO confirm).
    """
    env = DotaEnvironment()

    sess = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, _, _, debug = deepq.build_train(
        scope='deepq_act',
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=exploration_initial_eps,
                                 final_p=exploration_final_eps)

    U.initialize()

    reward_shaper = ActionAdviceRewardShaper(config=config)
    reward_shaper.load()
    reward_shaper.generate_merged_demo()

    full_exp_name = '{}-{}'.format(date.today().strftime('%Y%m%d'),
                                   experiment_name)
    experiment_dir = os.path.join('experiments', full_exp_name)
    os.makedirs(experiment_dir, exist_ok=True)

    summary_dir = os.path.join(experiment_dir, 'summaries')
    os.makedirs(summary_dir, exist_ok=True)
    summary_writer = tf.summary.FileWriter(summary_dir)
    checkpoint_dir = os.path.join(experiment_dir, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)
    stats_dir = os.path.join(experiment_dir, 'stats')
    os.makedirs(stats_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as td:
        # checkpoint_dir is always a non-empty path, so models are written to
        # the experiment's checkpoint directory rather than the temp dir.
        td = checkpoint_dir or td

        os.makedirs(td, exist_ok=True)
        model_file = os.path.join(td, "best_model")
        model_saved = False
        saved_mean_reward = None

        # NOTE: restoring from `best_model` / `load_path` is disabled here;
        # the weights are obtained through synchronize_q_func_vars() instead.

        # Build the weight-copy ops once. Creating `var.assign(value)` ops on
        # every synchronization would add new nodes to the graph each episode
        # and make it grow without bound.
        q_func_vars = list(debug['q_func_vars'])
        q_func_var_phs = [
            tf.placeholder(var.dtype.base_dtype, shape=var.get_shape())
            for var in q_func_vars
        ]
        update_q_func_expr = tf.group(*[
            var.assign(var_ph)
            for var, var_ph in zip(q_func_vars, q_func_var_phs)
        ])

        def synchronize_q_func_vars():
            # Ask for the current weights, block until they arrive, then load
            # them into the acting network. One value per Q-function variable
            # is expected, in the order of debug['q_func_vars'].
            updates_queue.put(
                UpdateMessage(UPDATE_STATUS_SEND_WEIGHTS, None, None))
            q_func_vars_trained = q_func_vars_trained_queue.get()
            sess.run(update_q_func_expr,
                     feed_dict=dict(zip(q_func_var_phs, q_func_vars_trained)))

        synchronize_q_func_vars()

        episode_rewards = []
        act_step_t = 0
        while act_step_t < total_timesteps:
            # Reset the environment
            obs = env.reset()
            obs = StatePreprocessor.process(obs)
            episode_rewards.append(0.0)
            done = False
            # Demo preservation variables
            demo_picked = 0
            demo_picked_step = 0
            # Demo switching statistics: (step within episode, demo index)
            demo_switching_stats = [(0, 0)]
            # Sample the episode until it is completed
            act_started_step_t = act_step_t
            while not done:
                # Take action and update exploration to the newest value
                biases, demo_indexes = reward_shaper.get_action_potentials_with_indexes(
                    obs, act_step_t)
                update_eps = exploration.value(act_step_t)
                actions, is_randoms = act(np.array(obs)[None],
                                          biases,
                                          update_eps=update_eps)
                action, is_random = actions[0], is_randoms[0]
                if not is_random:
                    # Record which demo biased the chosen action, and start
                    # following it if no demo is currently picked.
                    bias_demo = demo_indexes[action]
                    if bias_demo != demo_switching_stats[-1][1]:
                        demo_switching_stats.append(
                            (act_step_t - act_started_step_t, bias_demo))
                    if bias_demo != 0 and demo_picked == 0:
                        demo_picked = bias_demo
                        demo_picked_step = act_step_t + 1
                # env.step returns a sequence of (action, step_result) pairs;
                # only the last one is used.
                pairs = env.step(action)
                action, (new_obs, rew, done, _) = pairs[-1]
                logger.log(
                    f'{act_step_t}/{total_timesteps} obs {obs} action {action}'
                )

                # Compute state on the real reward but learn from the normalized version
                episode_rewards[-1] += rew
                rew = np.sign(rew) * np.log(1 + np.abs(rew))
                new_obs = StatePreprocessor.process(new_obs)

                if len(new_obs) == 0:
                    # An empty preprocessed observation ends the episode and
                    # the transition is not sent.
                    done = True
                else:
                    transition = (obs, action, rew, new_obs, float(done),
                                  act_step_t)
                    obs = new_obs
                    act_step_t += 1
                    # Release the picked demo once it has been followed for
                    # the minimum number of steps.
                    if act_step_t - demo_picked_step >= MIN_STEPS_TO_FOLLOW_DEMO_FOR:
                        demo_picked = 0
                    reward_shaper.set_demo_picked(act_step_t, demo_picked)
                    updates_queue.put(
                        UpdateMessage(UPDATE_STATUS_CONTINUE, transition,
                                      demo_picked))
            # Post episode logging
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="rewards",
                                 simple_value=episode_rewards[-1])
            ])
            summary_writer.add_summary(summary, act_step_t)
            summary = tf.Summary(
                value=[tf.Summary.Value(tag="eps", simple_value=update_eps)])
            summary_writer.add_summary(summary, act_step_t)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="episode_steps",
                                 simple_value=act_step_t - act_started_step_t)
            ])
            summary_writer.add_summary(summary, act_step_t)
            mean_5ep_reward = round(float(np.mean(episode_rewards[-5:])), 1)
            num_episodes = len(episode_rewards)
            if print_freq is not None and num_episodes % print_freq == 0:
                logger.record_tabular("steps", act_step_t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 5 episode reward", mean_5ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(act_step_t)))
                logger.dump_tabular()
            # Wait for the learning to finish and synchronize
            synchronize_q_func_vars()
            # Record demo_switching_stats
            if num_episodes % 10 == 0:
                save_demo_switching_stats(demo_switching_stats, stats_dir,
                                          num_episodes)
            if checkpoint_freq is not None and num_episodes % checkpoint_freq == 0:
                # Periodically save the model
                rec_model_file = os.path.join(
                    td, "model_{}_{:.2f}".format(num_episodes,
                                                 mean_5ep_reward))
                save_variables(rec_model_file)
                # Check whether the model is the best so far
                if saved_mean_reward is None or mean_5ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_5ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_5ep_reward

        updates_queue.put(UpdateMessage(UPDATE_STATUS_FINISH, None, None))
コード例 #5
0
class ReplayRewardShaper:
    """Potential-based reward shaping driven by expert demonstrations.

    Pickled replays are parsed into demonstrations, i.e. lists of
    ``(state, action, next_state)`` triples, which are then used to score
    state-action pairs.

    Reference paper: https://www.ijcai.org/Proceedings/15/Papers/472.pdf.
    """

    def __init__(self, replay_dir):
        """Remember the replay directory; nothing is read until ``load()``."""
        self.replay_dir = replay_dir
        self.state_preprocessor = StatePreprocessor()
        self.demos = []

    def load(self):
        """Unpickle every entry of ``replay_dir`` and append its demo."""
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # the replay directory must only contain trusted dumps.
        for entry in os.listdir(self.replay_dir):
            with open(os.path.join(self.replay_dir, entry), 'rb') as source:
                raw_replay = pickle.load(source)
            self.demos.append(self.__process_replay(raw_replay))

    @staticmethod
    def __decode_action(action_flags, state0, state1):
        """Return the discrete action of a transition, or None to skip it."""
        if action_flags[0] == 1:
            # attack the nearest creep
            return ATTACK_CREEP
        if action_flags[1] == 1:
            # attack the enemy hero
            return ATTACK_HERO
        if action_flags[2] == 1:
            # attack the enemy tower
            return ATTACK_TOWER

        # Otherwise treat the transition as a move and derive its direction
        # from the change of the first two state components.
        displacement = state1[:2] - state0[:2]
        if np.linalg.norm(displacement) == 0:
            # position did not change; skip transition
            return None
        angle_pi = math.atan2(displacement[1], displacement[0])
        if angle_pi < 0:
            angle_pi += 2 * math.pi
        degrees = angle_pi / math.pi * 180
        # Snap the heading to the nearest of MOVES_TOTAL equal sectors.
        return round(degrees / (360 / MOVES_TOTAL)) % MOVES_TOTAL

    def __process_replay(self, replay):
        """Convert a replay of (state, action flags) pairs into a demo."""
        transitions = []
        for step in range(len(replay) - 1):
            state0, action_state0 = replay[step]
            state1, _ = replay[step + 1]

            state0 = state0[STATE_PROJECT]
            state1 = state1[STATE_PROJECT]

            action = self.__decode_action(action_state0, state0, state1)
            if action is None:
                continue

            transitions.append((self.state_preprocessor.process(state0),
                                action,
                                self.state_preprocessor.process(state1)))
        return transitions

    def get_potential(self, state, action):
        """Return the largest potential over demo steps that took ``action``.

        Returns 0 when no demo step with that action yields a positive value.
        """
        best_value = 0
        for demo in self.demos:
            for demo_state, demo_action, _ in demo:
                if demo_action == action:
                    offset = state - demo_state
                    score = K * math.e**(-1 / 2 *
                                         offset.dot(SIGMA).dot(offset))
                    best_value = max(best_value, score)
        return best_value

    def get_potentials(self, states, actions):
        """Return an array holding ``get_potential`` for each row of inputs."""
        count = states.shape[0]
        potentials = np.zeros(count)
        for row in range(count):
            potentials[row] = self.get_potential(states[row], actions[row])
        return potentials

    def get_nearest_demo(self, state):
        """Return the action of the demo state closest to ``state``.

        Distance is the Euclidean norm; the first of equally close states
        wins. Returns None when there are no demo steps.
        """
        closest_action = None
        closest_distance = None
        for demo in self.demos:
            for demo_state, demo_action, _ in demo:
                distance = np.linalg.norm(state - demo_state)
                if closest_action is None or distance < closest_distance:
                    closest_distance, closest_action = distance, demo_action
        return closest_action