Example #1
def td_learning(args):
    agent = DQNAgent(args)
    replay_memory = PrioritizedReplayBuffer(1000000, args.alpha)
    #eval_game(agent, 500)
    outer = tqdm(range(args.total_steps), desc='Total steps', position=0)
    game = init_game()
    ave_score = 0
    count = 0
    for step in outer:
        board = copy.deepcopy(game.gameboard.board)
        if step < args.start_learn:
            avail_choices = game.gameboard.get_available_choices()
            index = np.random.randint(len(avail_choices))
            choice = avail_choices[index]
        else:
            choice = agent.greedy_policy(
                board, game.gameboard.get_available_choices())

        next_board, reward = game.input_pos(choice[0], choice[1])
        next_board = copy.deepcopy(next_board)
        #####

        replay_memory.add(board, choice, reward, next_board)
        #####
        if game.termination():
            ave_score += game.gameboard.score
            count += 1
            game = init_game()

        if step >= args.start_learn and step % args.train_freq == 0:
            if count > 0:
                message = "ave score of " + str(count) + " game: " + str(
                    ave_score / count)
                out_fd.write("{} {}\n".format(step, ave_score / count))
                outer.write(message)
                ave_score = 0
                count = 0
            if step == args.start_learn:
                experience = replay_memory.sample(args.start_learn,
                                                  beta=agent.beta)
            else:
                experience = replay_memory.sample(args.train_data_size,
                                                  beta=agent.beta)

            boards, choices, rewards, next_boards, weights, batch_idxes = experience

            td_errors = agent.train(
                (boards, choices, rewards, next_boards, weights))
            new_priorities = np.abs(td_errors) + prioritized_replay_eps
            replay_memory.update_priorities(batch_idxes, new_priorities)

            agent.update_target(args.soft_tau)
            agent.update_epsilon()
            agent.update_beta()

    eval_game(agent, 500)
    out_fd.close()
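This snippet assumes several module-level names defined elsewhere: `out_fd` (an open log file), `prioritized_replay_eps`, and helpers such as `DQNAgent`, `init_game`, and `eval_game`. A minimal, hypothetical sketch of the setup it expects (values and the file name are placeholders):

import copy
import numpy as np
from tqdm import tqdm

prioritized_replay_eps = 1e-6                  # small constant added to |TD error| before update_priorities
out_fd = open('td_learning_scores.txt', 'w')   # log file that receives the "step avg_score" lines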
Example #2
def learn(env, args):
    ob = env.reset()
    ob_shape = ob.shape
    num_action = int(env.action_space.n)

    agent = TestAgent(ob_shape, num_action, args)
    replay_buffer = PrioritizedReplayBuffer(args.buffer_size, alpha=args.prioritized_replay_alpha)
    args.prioritized_replay_beta_iters = args.max_timesteps
    beta_schedule = LinearSchedule(args.prioritized_replay_beta_iters, 
                                    initial_p=args.prioritized_replay_beta0, 
                                    final_p=1.0)

    episode_rewards = [0.0]
    saved_mean_reward = None
    n_step_seq = []

    agent.sample_noise()
    agent.update_target()

    for t in range(args.max_timesteps):
        action = agent.act(ob)
        new_ob, rew, done, _ = env.step(action)
        replay_buffer.add(ob, action, rew, new_ob, float(done))
        ob = new_ob

        episode_rewards[-1] += rew
        if done:
            ob = env.reset()
            episode_rewards.append(0.0)
            reset = True

        if t > args.learning_starts and t % args.replay_period == 0:
            experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(t))
            (obs, actions, rewards, obs_next, dones, weights, batch_idxes) = experience
            agent.sample_noise()
            kl_errors = agent.update(obs, actions, rewards, obs_next, dones, weights)
            replay_buffer.update_priorities(batch_idxes, np.abs(kl_errors) + 1e-6)

        if t > args.learning_starts and t % args.target_network_update_freq == 0:
            agent.update_target()  

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and args.print_freq is not None and len(episode_rewards) % args.print_freq == 0:
            print('steps {} episodes {} mean reward {}'.format(t, num_episodes, mean_100ep_reward))
Example #3
class Agent:

    def __init__(self, net, actionSet, goalSet, defaultNSample, defaultRandomPlaySteps, controllerMemCap, explorationSteps, trainFreq, hard_update,
                 controllerEpsilon=defaultControllerEpsilon):
        self.actionSet = actionSet
        self.controllerEpsilon = controllerEpsilon
        self.goalSet = goalSet
        self.nSamples = defaultNSample 
        self.gamma = defaultGamma
        self.net = net
        self.memory = PrioritizedReplayBuffer(controllerMemCap, alpha=prioritized_replay_alpha)
        self.enable_double_dqn = True
        self.exploration = LinearSchedule(schedule_timesteps = explorationSteps, initial_p = 1.0, final_p = 0.02)
        self.defaultRandomPlaySteps = defaultRandomPlaySteps
        self.trainFreq = trainFreq
        self.randomPlay = True
        self.learning_done = False
        self.hard_update = hard_update

    def selectMove(self, state):
        if not self.learning_done:
            if self.controllerEpsilon < random.random():
                return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 4))], verbose=0))
                #return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 4)), dummyYtrue, dummyMask], verbose=0)[1])
            return random.choice(self.actionSet)
        else:
            return np.argmax(self.simple_net.predict([np.reshape(state, (1, 84, 84, 4))], verbose=0))

    def setControllerEpsilon(self, epsilonArr):
        self.controllerEpsilon = epsilonArr

    def criticize(self, reachGoal, action, die, distanceReward, useSparseReward):
        reward = 0.0
        if reachGoal:
            reward += 1.0
            #reward += 50.0
        if die:
            reward -= 1.0
        if not useSparseReward:
            reward += distanceReward
        reward = np.minimum(reward, maxReward)
        reward = np.maximum(reward, minReward)
        return reward

    def store(self, experience):
        self.memory.add(experience.state, experience.action, experience.reward, experience.next_state, experience.done)
        #self.memory.add(np.abs(experience.reward), experience)

    def compile(self):
        def huber_loss(y_true, y_pred, clip_value):
            assert clip_value > 0.

            x = y_true - y_pred
            if np.isinf(clip_value):
                return .5 * K.square(x)

            condition = K.abs(x) < clip_value
            squared_loss = .5 * K.square(x)
            linear_loss = clip_value * (K.abs(x) - .5 * clip_value)
            if K.backend() == 'tensorflow':
                import tensorflow as tf
                if hasattr(tf, 'select'):
                    return tf.select(condition, squared_loss, linear_loss)  # condition, true, false
                else:
                    return tf.where(condition, squared_loss, linear_loss)  # condition, true, false
            elif K.backend() == 'theano':
                from theano import tensor as T
                return T.switch(condition, squared_loss, linear_loss)
            else:
                raise RuntimeError('Unknown backend "{}".'.format(K.backend()))

            
        def clipped_masked_error(args):
            y_true, y_pred, mask = args
            loss = huber_loss(y_true, y_pred, 1)
            loss *= mask  # apply element-wise mask
            return K.sum(loss, axis=-1)
        # Create trainable model. The problem is that we need to mask the output since we only
        # ever want to update the Q values for a certain action. The way we achieve this is by
        # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility
        # to mask out certain parameters by passing in multiple inputs to the Lambda layer.
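        # For example, with nb_Action = 4 and sampled action 2 the mask is [0, 0, 1, 0]:
        # the element-wise Huber loss is zeroed everywhere except the taken action, so only
        # that action's Q-value receives a gradient.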
        y_pred = self.net.controllerNet.output
        y_true = Input(name='y_true', shape=(nb_Action,))
        mask = Input(name='mask', shape=(nb_Action,))
        loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_pred, y_true, mask])
        ins = [self.net.controllerNet.input] if type(self.net.controllerNet.input) is not list else self.net.controllerNet.input
        trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss_out, y_pred])
        assert len(trainable_model.output_names) == 2
        #combined_metrics = {trainable_model.output_names[1]: metrics}
        losses = [
            lambda y_true, y_pred: y_pred,  # loss is computed in Lambda layer
            lambda y_true, y_pred: K.zeros_like(y_pred),  # we only include this for the metrics
        ]
        rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0)
        trainable_model.compile(optimizer=rmsProp, loss=losses)
        self.trainable_model = trainable_model
        self.compiled = True

    def _update(self, stepCount):
        batches = self.memory.sample(self.nSamples, beta=beta_schedule.value(stepCount))
        (stateVector, actionVector, rewardVector, nextStateVector, doneVector, importanceVector, idxVector) = batches
        
        stateVector = np.asarray(stateVector)
        nextStateVector = np.asarray(nextStateVector)
        
        q_values = self.net.controllerNet.predict(stateVector)
        assert q_values.shape == (self.nSamples, nb_Action)
        if self.enable_double_dqn:
            actions = np.argmax(q_values, axis = 1)
            assert actions.shape == (self.nSamples,)

            target_q_values = self.net.targetControllerNet.predict(nextStateVector)
            assert target_q_values.shape == (self.nSamples, nb_Action)
            q_batch = target_q_values[range(self.nSamples), actions]
            assert q_batch.shape == (self.nSamples,)
        else:
            target_q_values = self.net.targetControllerNet.predict(nextStateVector)
            q_batch = np.max(target_q_values, axis=1)
            assert q_batch.shape == (self.nSamples,)

        targets = np.zeros((self.nSamples, nb_Action))
        dummy_targets = np.zeros((self.nSamples,))
        masks = np.zeros((self.nSamples, nb_Action))

        # Compute r_t + gamma * max_a Q(s_t+1, a) and update the targets accordingly,
        # but only for the affected output units (as given by action_batch).
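        # With double DQN the target is
        #   R_t = r_t + gamma * (1 - done_t) * Q_target(s_{t+1}, argmax_a Q_online(s_{t+1}, a)),
        # i.e. the online network picks the action and the target network evaluates it.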
        discounted_reward_batch = self.gamma * q_batch
        # Set discounted reward to zero for all states that were terminal.
        terminalBatch = np.array([1-float(done) for done in doneVector])
        assert terminalBatch.shape == (self.nSamples,)
        discounted_reward_batch *= terminalBatch
        reward_batch = np.array(rewardVector)
        action_batch = np.array(actionVector)
        assert discounted_reward_batch.shape == reward_batch.shape
        Rs = reward_batch + discounted_reward_batch
        for idx, (target, mask, R, action) in enumerate(zip(targets, masks, Rs, action_batch)):
            target[action] = R  # update action with estimated accumulated reward
            dummy_targets[idx] = R
            mask[action] = 1.  # enable loss for this specific action
        td_errors = targets[range(self.nSamples), action_batch] - q_values[range(self.nSamples), action_batch]
        
        new_priorities = np.abs(td_errors) + prioritized_replay_eps
        self.memory.update_priorities(idxVector, new_priorities)
        
        targets = np.array(targets).astype('float32')
        masks = np.array(masks).astype('float32')

        
        # Finally, perform a single update on the entire batch. We use a dummy target since
        # the actual loss is computed in a Lambda layer that needs more complex input. However,
        # it is still useful to know the actual target to compute metrics properly.
        ins = [stateVector] if type(self.net.controllerNet.input) is not list else stateVector
        if stepCount >= self.defaultRandomPlaySteps:
            loss = self.trainable_model.train_on_batch(ins + [targets, masks], [dummy_targets, targets], sample_weight = [np.array(importanceVector), np.ones(self.nSamples)])
        else:
            loss = [0.0,0.0,0.0]
        
        if stepCount > self.defaultRandomPlaySteps and stepCount % self.hard_update == 0:
            self.net.targetControllerNet.set_weights(self.net.controllerNet.get_weights())
        return loss[1], np.mean(q_values), np.mean(np.abs(td_errors))
        

    def update(self, stepCount):
        loss = self._update(stepCount)
        return loss

    def annealControllerEpsilon(self, stepCount, option_learned):
        if not self.randomPlay:
            if option_learned:
                self.controllerEpsilon = 0.0
            else:
                if stepCount > self.defaultRandomPlaySteps:
                    self.controllerEpsilon = self.exploration.value(stepCount - self.defaultRandomPlaySteps)
                    #self.controllerEpsilon[goal] = exploration.value(stepCount - defaultRandomPlaySteps)
    def clear_memory(self, goal):
        self.learning_done = True ## Set the done learning flag
        del self.trainable_model
        del self.memory

        gpu = self.net.gpu

        del self.net

        gc.collect()

        rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0)

        with tf.device('/gpu:'+str(gpu)):
            self.simple_net = Sequential()
            self.simple_net.add(Conv2D(32, (8,8), strides = 4, activation = 'relu', padding = 'valid', input_shape = (84,84,4)))
            self.simple_net.add(Conv2D(64, (4,4), strides = 2, activation = 'relu', padding = 'valid'))
            self.simple_net.add(Conv2D(64, (3,3), strides = 1, activation = 'relu', padding = 'valid'))
            self.simple_net.add(Flatten())
            self.simple_net.add(Dense(HIDDEN_NODES, activation = 'relu', kernel_initializer = initializers.random_normal(stddev=0.01, seed = SEED)))
            self.simple_net.add(Dense(nb_Action, activation = 'linear', kernel_initializer = initializers.random_normal(stddev=0.01, seed = SEED)))
            self.simple_net.compile(loss = 'mse', optimizer = rmsProp)
            self.simple_net.load_weights(recordFolder+'/policy_subgoal_' + str(goal) + '.h5')
            self.simple_net.reset_states()
Example #4
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          callback=None,
          tf_log_dir=None,
          tf_flush_freq=100,
          tf_model_freq=10000
          ):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    # inject some Tensorboard usage.
    tf_summary_writer = tf.summary.FileWriter('{}/summary'.format(tf_log_dir)) if tf_log_dir is not None else None
    tf_saver = tf.train.Saver(max_to_keep=10) if tf_log_dir is not None else None

    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        print('====', model_file)
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            reset = False
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done and tf_summary_writer is not None:
                summary = tf.Summary()
                summary.value.add(tag='info/episode_reward', simple_value=float(episode_rewards[-1]))
                summary.value.add(tag='info/esp', simple_value=float(update_eps))
                tf_summary_writer.add_summary(summary, t)

            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

                if tf_summary_writer is not None:
                    summary = tf.Summary()
                    summary.value.add(tag='model/loss', simple_value=float(td_errors[0]))  # TODO: mean the loss
                    tf_summary_writer.add_summary(summary, t)

            if tf_summary_writer is not None and t % tf_flush_freq == 0:
                tf_summary_writer.flush()

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                        logger.log("Saving model path: {}".format(model_file))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
            if tf_saver is not None and t % tf_model_freq == 0:
                assert tf_log_dir is not None
                tf_saver.save(sess=sess, save_path='{}/model/model'.format(tf_log_dir), global_step=t)

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
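A hypothetical invocation of this `learn` variant, assuming a Gym environment and the baselines-style `deepq.models.mlp` builder for `q_func` (everything other than `learn` itself is an assumption):

import gym
from baselines.deepq import models

env = gym.make('CartPole-v0')
q_func = models.mlp([64])             # builds the (obs, num_actions, scope, reuse) -> Q-values function
act = learn(env, q_func,
            lr=1e-3,
            max_timesteps=100000,
            prioritized_replay=True,  # exercises the PrioritizedReplayBuffer branch above
            tf_log_dir='./logs')      # optional TensorBoard summaries and periodic checkpoints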
Example #5
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          train_mode = True,
          **network_kwargs
            ):
    """Train a deepq model.
    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.
    load_path: str
        path to load the model from. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder.
    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.

    """
    # Examine environment parameters
    print(str(env))
    # Set the default brain to work with
    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]

    num_actions=brain.vector_action_space_size[0]
    
    # Create all the functions necessary to train the model

    sess = get_session()
    #set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    #observation_space = env.observation_space

    
    env_info = env.reset(train_mode=train_mode)[default_brain]

    state = get_obs_state_lidar(env_info)

    observation_space=state.copy()
    
    
    #def make_obs_ph(name,Num_action):

    #    tf.placeholder(shape=(None,) + state.shape, dtype=state.dtype, name='st')
        
    #    return tf.placeholder(tf.float32, shape = [None, Num_action],name=name)
    

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug =build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))


        for t in range(total_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_variables(model_file)

    return act
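A hypothetical call for this Unity ML-Agents variant, assuming the legacy `unityagents.UnityEnvironment` wrapper that exposes the `brain_names`/`brains` attributes used above (build name and hyperparameters are placeholders):

from unityagents import UnityEnvironment

env = UnityEnvironment(file_name='lidar_scene')  # placeholder for the compiled Unity build
act = learn(env, network='mlp',
            lr=5e-4,
            total_timesteps=200000,
            prioritized_replay=True,
            train_mode=True)
env.close()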
Example #6
class DuelingDoubleDQNagent():
    def __init__(self):
        #self.action_space = [0, 1, 2, 3, 4, 5, 6]
        self.action_space = [i for i in range(4 * 7)]  # 28 grouped actions (7x14 board)
        self.action_size = len(self.action_space)
        self.next_stone_size = 6
        self.state_size = (rows + 1, cols, 1)
        self.discount_factor = 0.99

        # In the DeepMind paper, samples drawn with PER lead to larger updates, so for training
        # stability the learning rate was reduced to about 1/4 of the value used with uniform
        # random sampling; that is reflected here.
        #self.learning_rate = 0.00025
        self.learning_rate = 0.0000625

        self.epsilon = 0.  #1.
        self.epsilon_min = 0.0
        self.epsilon_decay = 1000000  #1000000

        self.model = self.build_model()
        self.target_model = self.build_model()

        # A custom loss function is defined separately and used for training.
        self.model_updater = self.model_optimizer()

        self.batch_size = 64
        self.train_start = 50000  #50000

        # PER buffer and its hyperparameters.

        # beta controls how strongly the importance-sampling correction is applied.
        # Very loosely speaking:
        # larger beta -> the sampling bias introduced by PER is corrected more strongly
        #   -> less learning from samples with large TD error, overall training is a bit more stable
        # smaller beta -> the bias is corrected less
        #   -> more learning from samples with large TD error, overall training is a bit less stable
        # The paper starts beta at 0.4 and anneals it linearly to 1 by the end of training.

        # alpha controls how much the magnitude of the TD error is reflected in the priority: (TD error)^alpha.
        # alpha close to 0 ignores the TD-error magnitude and approaches uniform sampling;
        # alpha close to 1 fully reflects it, i.e. pure PER.
        # The paper uses alpha = 0.6.

        # prioritized_replay_eps is a small constant added to the TD error when computing
        # (TD error)^alpha, so that priorities never become exactly zero.
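
        # For reference (from the PER paper; not part of the original snippet):
        #   sampling probability:  P(i) = p_i**alpha / sum_k p_k**alpha,  with p_i = |TD error_i| + prioritized_replay_eps
        #   IS weight:             w_i = (N * P(i))**(-beta), normalized by the maximum weight
        #   (this is the `weight` returned by self.memory.sample() and passed to the loss below)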

        self.memory = PrioritizedReplayBuffer(1000000, alpha=0.6)  #1000000
        self.beta = 0.4  # 0.4
        self.beta_max = 1.0
        self.beta_decay = 2000000  #5000000
        self.prioritized_replay_eps = 0.000001

        # TensorBoard setup
        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)

        self.summary_placeholders, self.update_ops, self.summary_op = \
            self.setup_summary()
        self.summary_writer = tf.summary.FileWriter('summary/tetris_dqn',
                                                    self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        self.load_model = True
        if self.load_model:
            self.model.load_weights("./DQN_tetris_model_0311.h5")

        self.imitation_mode = False

    # Record training statistics for each episode
    def setup_summary(self):
        episode_total_reward = tf.Variable(0.)
        episode_avg_max_q = tf.Variable(0.)
        episode_duration = tf.Variable(0.)
        episode_avg_loss = tf.Variable(0.)

        tf.summary.scalar('Total Reward/Episode', episode_total_reward)
        tf.summary.scalar('Total Clear Line/Episode', episode_avg_max_q)
        #tf.summary.scalar('Duration/Episode', episode_duration)
        #tf.summary.scalar('Average Loss/Episode', episode_avg_loss)
        #tf.train.AdamOptimizer
        summary_vars = [
            episode_total_reward, episode_avg_max_q, episode_duration,
            episode_avg_loss
        ]
        summary_placeholders = [
            tf.placeholder(tf.float32) for _ in range(len(summary_vars))
        ]
        update_ops = [
            summary_vars[i].assign(summary_placeholders[i])
            for i in range(len(summary_vars))
        ]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

    def build_model(self):

        # Dueling DQN

        state = Input(shape=(
            self.state_size[0],
            self.state_size[1],
            self.state_size[2],
        ))
        layer = Conv2D(32, (5, 5),
                       strides=(1, 1),
                       activation='relu',
                       padding='same',
                       kernel_initializer='he_uniform')(state)  # 64, (4, 4)
        layer = Conv2D(32, (3, 3),
                       strides=(1, 1),
                       activation='relu',
                       padding='same',
                       kernel_initializer='he_uniform')(layer)  ##
        layer = Conv2D(32, (1, 1),
                       strides=(1, 1),
                       activation='relu',
                       padding='same',
                       kernel_initializer='he_uniform')(layer)  ##
        layer = Conv2D(32, (3, 3),
                       strides=(1, 1),
                       activation='relu',
                       padding='same',
                       kernel_initializer='he_uniform')(layer)  ##
        layer = Conv2D(32, (1, 1),
                       strides=(1, 1),
                       activation='relu',
                       padding='same',
                       kernel_initializer='he_uniform')(layer)  ##
        pool_1 = MaxPooling2D(pool_size=(3, 3),
                              strides=(1, 1),
                              padding='valid',
                              data_format=None)(layer)

        layer_2 = Conv2D(64, (3, 3),
                         strides=(1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_uniform')(pool_1)  ##
        layer_2 = Conv2D(32, (1, 1),
                         strides=(1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_uniform')(layer_2)  ##
        layer_2 = Conv2D(64, (3, 3),
                         strides=(1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_uniform')(layer_2)
        pool_2 = MaxPooling2D(pool_size=(2, 2),
                              strides=(1, 1),
                              padding='valid',
                              data_format=None)(layer_2)

        layer_r = Conv2D(32, (rows + 1, 1),
                         strides=(1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_uniform')(state)
        layer_c = Conv2D(32, (1, cols),
                         strides=(1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_uniform')(state)

        pool_1_r = Conv2D(32, (13, 1),
                          strides=(1, 1),
                          activation='relu',
                          padding='same',
                          kernel_initializer='he_uniform')(pool_1)
        pool_1_c = Conv2D(32, (1, 5),
                          strides=(1, 1),
                          activation='relu',
                          padding='same',
                          kernel_initializer='he_uniform')(pool_1)

        pool_2_r = Conv2D(32, (12, 1),
                          strides=(1, 1),
                          activation='relu',
                          padding='same',
                          kernel_initializer='he_uniform')(pool_2)
        pool_2_c = Conv2D(32, (1, 4),
                          strides=(1, 1),
                          activation='relu',
                          padding='same',
                          kernel_initializer='he_uniform')(pool_2)

        layer = Flatten()(layer)
        layer_2 = Flatten()(layer_2)
        pool_1 = Flatten()(pool_1)
        pool_2 = Flatten()(pool_2)
        layer_r = Flatten()(layer_r)
        layer_c = Flatten()(layer_c)
        pool_1_r = Flatten()(pool_1_r)
        pool_1_c = Flatten()(pool_1_c)
        pool_2_r = Flatten()(pool_2_r)
        pool_2_c = Flatten()(pool_2_c)

        merge_layer = concatenate([
            layer, layer_2, pool_1, pool_2, pool_1_c, pool_1_r, pool_2_c,
            pool_2_r, layer_c, layer_r
        ],
                                  axis=1)
        merge_layer = Dense(128,
                            activation='relu',
                            kernel_initializer='he_uniform')(merge_layer)

        vlayer = Dense(64, activation='relu',
                       kernel_initializer='he_uniform')(merge_layer)
        alayer = Dense(64, activation='relu',
                       kernel_initializer='he_uniform')(merge_layer)
        v = Dense(1, activation='linear',
                  kernel_initializer='he_uniform')(vlayer)
        v = Lambda(lambda v: tf.tile(v, [1, self.action_size]))(v)
        a = Dense(self.action_size,
                  activation='linear',
                  kernel_initializer='he_uniform')(alayer)
        a = Lambda(lambda a: a - tf.reduce_mean(a, axis=-1, keep_dims=True))(a)
        q = Add()([v, a])
        model = Model(inputs=state, outputs=q)
        # The compile call is commented out because a custom loss and optimizer are used instead.
        # model.compile(loss='logcosh', optimizer=Adam(lr=self.learning_rate))
        model.summary()

        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
        '''
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = np.float32(state)
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])


    def get_action(self, env, state):
        if np.random.rand() <= self.epsilon:
            if env.new_stone_flag:
                return random.randrange(4)
            else:
                return random.randrange(self.action_size)
        else:
            state = np.float32(state)
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])
    '''

    def get_action(self, env, state):
        if np.random.rand() <= self.epsilon:
            if env.stone_number(env.stone) == 1:
                return random.randrange(14)
            elif env.stone_number(env.stone) == 4 or env.stone_number(
                    env.stone) == 6:
                return random.randrange(2) * 7 + random.randrange(6)
            elif env.stone_number(env.stone) == 2 or env.stone_number(
                    env.stone) == 5 or env.stone_number(env.stone) == 7:
                return random.randrange(4) * 7 + random.randrange(6)
            elif env.stone_number(env.stone) == 3:
                return random.randrange(6)
        else:
            state = np.float32(state)
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])

    def model_optimizer(self):
        target = K.placeholder(shape=[None, self.action_size])
        weight = K.placeholder(shape=[None])

        # Huber loss.

        clip_delta = 1.0

        pred = self.model.output

        err = target - pred

        cond = K.abs(err) < clip_delta

        squared_loss = 0.5 * K.square(err)
        linear_loss = clip_delta * (K.abs(err) - 0.5 * clip_delta)

        loss1 = tf.where(cond, squared_loss, linear_loss)

        # The PER loss: the Huber loss weighted element-wise by the importance-sampling ratio.
        weighted_loss = tf.multiply(tf.expand_dims(weight, -1), loss1)

        loss = K.mean(weighted_loss, axis=-1)

        optimizer = Adam(lr=self.learning_rate)
        updates = optimizer.get_updates(self.model.trainable_weights, [], loss)

        train = K.function([self.model.input, target, weight], [err],
                           updates=updates)

        return train

    def train_model(self):

        (update_input, action, reward, update_target, done, weight,
         batch_idxes) = self.memory.sample(self.batch_size, beta=self.beta)

        target = self.model.predict(update_input)
        target_val = self.target_model.predict(update_target)
        target_val_arg = self.model.predict(update_target)

        # Double DQN
        for i in range(self.batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                a = np.argmax(target_val_arg[i])
                target[i][action[i]] = reward[i] + self.discount_factor * target_val[i][a]

        # Train on the mini-batch sampled from the PER buffer.
        # err is returned separately so that the freshly computed TD errors can be written back to the buffer.
        err = self.model_updater([update_input, target, weight])

        err = np.reshape(err, [self.batch_size, self.action_size])

        # Add a small constant so that the TD error (and hence the priority) never becomes zero.
        new_priorities = np.abs(np.sum(err,
                                       axis=1)) + self.prioritized_replay_eps

        # Update the priorities of the sampled transitions with the newly computed TD errors.
        self.memory.update_priorities(batch_idxes, new_priorities)
Example #7
class Agent:
    # @todo: when instantiating two of these, it raises an Exception, because it tries to redefine
    # @todo: the scopes or variables (the names are already taken)
    # @todo: FIX THIS !!!
    """
    We don't use the bundle entropy method to optimize wrt actions,
    but rather plain SGD (or rather Adam)
    """
    def __init__(self, dimO, dimA, beta, layers_dim, finalize_graph=True):
        """
        :param finalize_graph: if you want to restore a model, using .restore(), set this param to False
        """
        self.dimA = dimA
        self.dimO = dimO
        self.beta = beta
        self.layers_dim = layers_dim

        tau = FLAGS.tau
        discount = FLAGS.discount
        l2norm = FLAGS.l2norm
        learning_rate = FLAGS.rate

        self.opt = self.adam

        self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, FLAGS.alpha)
        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=FLAGS.thread,
            log_device_placement=False,
            allow_soft_placement=True))

        self.noise = np.zeros(self.dimA)
        per_weights = tf.placeholder(tf.float32, [None], 'per_weights')

        obs = tf.placeholder(tf.float32, [None, dimO], "obs")
        act = tf.placeholder(tf.float32, [None, dimA], "act")
        rew = tf.placeholder(tf.float32, [None], "rew")
        with tf.variable_scope('q'):
            negQ = self.negQ(obs, act)
        q = -negQ
        act_grad, = tf.gradients(negQ, act)

        obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target")
        act_target = tf.placeholder(tf.float32, [None, dimA], "act_target")
        term_target = tf.placeholder(tf.bool, [None], "term_target")
        with tf.variable_scope('q_target'):
            negQ_target = self.negQ(obs_target, act_target)
        act_target_grad, = tf.gradients(negQ_target, act_target)
        q_target = -negQ_target

        y = tf.where(term_target, rew, rew + discount * q_target)
        y = tf.maximum(q - 1., y)
        y = tf.minimum(q + 1., y)
        y = tf.stop_gradient(y)
        print('y shape', y.get_shape())
        print('q shape', q.get_shape())
        td_error = q - y
        print('per weights shape', per_weights.get_shape())
        print('multi td error^2 per weights shape', tf.multiply(tf.square(td_error), per_weights).get_shape())
        ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weights), 0)
        print('ms td error shape', ms_td_error.get_shape())

        regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/')
        loss_q = ms_td_error + \
                 l2norm * tf.reduce_sum(regLosses) + \
                 FLAGS.alpha_beyond * tf.reduce_sum(
                     tf.where(
                         q > FLAGS.RMAX,
                         tf.square(q - FLAGS.RMAX),
                         tf.zeros((FLAGS.bsize,))) +
                     tf.where(
                         q < FLAGS.RMIN,
                         tf.square(q - FLAGS.RMIN),
                         tf.zeros((FLAGS.bsize,))),
                     0
                 )

        self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/')
        self.theta_cvx_ = [v for v in self.theta_
                           if 'proj' in v.name and 'W:' in v.name]
        self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_]
        self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_]

        self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                               scope='q_target/')
        update_target = [theta_target_i.assign_sub(tau * (theta_target_i - theta_i))
                         for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]

        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars_q = optim_q.compute_gradients(loss_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)

        summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.outdir, 'board'),
                                               self.sess.graph)
        tf.summary.scalar('Qvalue (batch avg)', tf.reduce_mean(q))
        tf.summary.scalar('Qvalue (batch max)', tf.reduce_max(q))
        tf.summary.scalar('Qvalue (batch min)', tf.reduce_min(q))
        tf.summary.scalar('Q targets (batch avg)', tf.reduce_mean(q_target))
        tf.summary.scalar('Q targets (batch min)', tf.reduce_min(q_target))
        tf.summary.scalar('Q targets (batch max)', tf.reduce_max(q_target))
        tf.summary.scalar('loss', ms_td_error)
        tf.summary.scalar('td error', tf.reduce_mean(tf.abs(td_error)))
        tf.summary.scalar('reward', tf.reduce_mean(rew))
        tf.summary.scalar('chosen actions', tf.reduce_mean(act))
        tf.summary.scalar('maximizing action (batch avg)', tf.reduce_mean(act_target))
        tf.summary.scalar('maximizing action (batch max)', tf.reduce_max(act_target))
        tf.summary.scalar('maximizing action (batch min)', tf.reduce_min(act_target))
        merged = tf.summary.merge_all()

        # tf functions
        with self.sess.as_default():
            self._train = Fun([obs, act, rew, obs_target, act_target, term_target, per_weights],
                              [optimize_q, update_target, loss_q, tf.abs(td_error), q, q_target],
                              merged, summary_writer)
            self._fg = Fun([obs, act], [negQ, act_grad])
            self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad])

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=100)
        ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(self.makeCvx)
            self.sess.run([theta_target_i.assign(theta_i)
                           for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)])

        if finalize_graph:
            self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)

    def adam(self, func, obs, plot=False):
        """Optimizer to find the greedy action"""
        # if npr.random() < 1./20:
        #     plot = True
        b1 = 0.9
        b2 = 0.999
        lam = 0.5
        eps = 1e-8
        alpha = 0.01
        nBatch = obs.shape[0]
        act = np.zeros((nBatch, self.dimA))
        m = np.zeros_like(act)
        v = np.zeros_like(act)

        b1t, b2t = 1., 1.
        act_best, a_diff, f_best = [None] * 3
        hist = {'act': [], 'f': [], 'g': []}
        for i in range(1000):
            f, g = func(obs, act)
            if plot:
                hist['act'].append(act.copy())
                hist['f'].append(f)
                hist['g'].append(g)

            if i == 0:
                act_best = act.copy()
                f_best = f.copy()
            else:
                prev_act_best = act_best.copy()
                I = (f < f_best)
                act_best[I] = act[I]
                f_best[I] = f[I]
                a_diff_i = np.mean(np.linalg.norm(act_best - prev_act_best, axis=1))
                a_diff = a_diff_i if a_diff is None \
                    else lam * a_diff + (1. - lam) * a_diff_i
                # print(a_diff_i, a_diff, np.sum(f))
                if a_diff < 1e-3 and i > 5:
                    if plot:
                        self.adam_plot(func, obs, hist)
                    return act_best

            m = b1 * m + (1. - b1) * g
            v = b2 * v + (1. - b2) * (g * g)
            b1t *= b1
            b2t *= b2
            mhat = m / (1. - b1t)
            vhat = v / (1. - b2t)

            act -= alpha * mhat / (np.sqrt(vhat) + eps)  # use the bias-corrected second moment
            act = np.clip(act, FLAGS.a_min + 1e-8, FLAGS.a_max - 1e-8)

        print('  + Warning: Adam did not converge.')
        if plot:
            self.adam_plot(func, obs, hist)
        return act_best

    def adam_plot(self, func, obs, hist):
        hist['act'] = np.array(hist['act']).T
        hist['f'] = np.array(hist['f']).T
        hist['g'] = np.array(hist['g']).T
        if self.dimA == 1:
            xs = np.linspace(-1. + 1e-8, 1. - 1e-8, 100)
            ys = [func(obs[[0], :], [[xi]])[0] for xi in xs]
            fig = plt.figure()
            plt.plot(xs, ys)
            plt.plot(hist['act'][0, 0, :], hist['f'][0, :], label='Adam')
            plt.legend()
            fname = os.path.join(FLAGS.outdir, 'adamPlt.png')
            print("Saving Adam plot to {}".format(fname))
            plt.savefig(fname)
            plt.close(fig)
        elif self.dimA == 2:
            assert False, "adam_plot is not implemented for dimA == 2"
        else:
            xs = npr.uniform(-1., 1., (5000, self.dimA))
            ys = np.array([func(obs[[0], :], [xi])[0] for xi in xs])
            epi = np.hstack((xs, ys))
            pca = PCA(n_components=2).fit(epi)
            W = pca.components_[:, :-1]
            xs_proj = xs.dot(W.T)
            fig = plt.figure()

            X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100)
            Z = griddata(xs_proj[:, 0], xs_proj[:, 1], ys.ravel(),
                         X, Y, interp='linear')

            plt.contourf(X, Y, Z, 15)
            plt.colorbar()

            adam_x = hist['act'][:, 0, :].T
            adam_x = adam_x.dot(W.T)
            plt.plot(adam_x[:, 0], adam_x[:, 1], label='Adam', color='k')
            plt.legend()

            fname = os.path.join(FLAGS.outdir, 'adamPlt.png')
            print("Saving Adam plot to {}".format(fname))
            plt.savefig(fname)
            plt.close(fig)

    def reset(self, obs):
        self.noise = np.zeros(self.dimA)
        self.observation = obs  # initial observation

    def act(self, test=False):
        """
        Greedily choose an action.
        Exploration (OU) noise is added during training (test=False).
        """
        with self.sess.as_default():
            obs = np.expand_dims(self.observation, axis=0)

            f = self._fg

            tflearn.is_training(False)
            action = self.opt(f, obs)
            tflearn.is_training(not test)

            if not test:
                # sig = (self.t < 40000) * (self.t * (FLAGS.ousigma_end - FLAGS.ousigma_start) / 40000 + FLAGS.ousigma_start) + (self.t >= 40000) * FLAGS.ousigma_end
                # self.noise = sig * npr.randn(self.dimA)
                self.noise -= FLAGS.outheta * self.noise - FLAGS.ousigma * npr.randn(self.dimA)
                action += self.noise
            action = np.clip(action, FLAGS.a_min, FLAGS.a_max)

            self.action = np.atleast_1d(np.squeeze(action, axis=0))
            return self.action

    def observe(self, rew, term, obs2, test=False):
        obs1 = self.observation
        self.observation = obs2

        # train
        if not test:

            self.rm.add(*(obs1, self.action, rew, obs2, term))

            if self.t > FLAGS.warmup:
                for i in range(FLAGS.iter):
                    loss = self.train()

    def train(self):
        self.t += 1
        beta = self.beta(self.t)
        with self.sess.as_default():
            obs, act, rew, ob2, term2, w, idx = self.rm.sample(FLAGS.bsize, beta)
            rew, term2, w = rew.squeeze(), term2.squeeze(), w.squeeze()  # fix dimensions
            # w = np.ones(w.shape)  # no prioritization
            f = self._fg_target
            tflearn.is_training(False)
            act2 = self.opt(f, ob2)
            tflearn.is_training(True)

            _, _, loss, td_error, q, q_target = self._train(obs, act, rew, ob2, act2, term2, w,
                                                            log=FLAGS.summary, global_step=self.t)
            self.sess.run(self.proj)  # keep some weights positive
            # self.rm.update_priorities(idx, np.array(td_error.shape[0] * [1.]))  # no prioritization
            self.rm.update_priorities(idx, td_error + 1e-2)
            return loss, td_error, q, q_target

    def negQ(self, x, y, reuse=False):
        """Architecture of the neural network"""
        print('x shape', x.get_shape())
        print('y shape', y.get_shape())
        szs = self.layers_dim
        assert (len(szs) >= 1)
        fc = tflearn.fully_connected
        bn = tflearn.batch_normalization
        lrelu = tflearn.activations.leaky_relu

        if reuse:
            tf.get_variable_scope().reuse_variables()

        nLayers = len(szs)
        us = []
        zs = []
        z_zs = []
        z_ys = []
        z_us = []

        reg = 'L2'

        prevU = x
        for i in range(nLayers):
            with tf.variable_scope('u' + str(i), reuse=reuse) as s:
                u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg)
                if i < nLayers - 1:
                    u = tf.nn.relu(u)
                    if FLAGS.icnn_bn:
                        u = bn(u, reuse=reuse, scope=s, name='bn')
            variable_summaries(u, suffix='u{}'.format(i))
            us.append(u)
            prevU = u

        prevU, prevZ = x, y
        for i in range(nLayers + 1):
            sz = szs[i] if i < nLayers else 1
            z_add = []
            if i > 0:
                with tf.variable_scope('z{}_zu_u'.format(i), reuse=reuse) as s:
                    zu_u = fc(prevU, szs[i - 1], reuse=reuse, scope=s,
                              activation='relu', bias=True,
                              regularizer=reg, bias_init=tf.constant_initializer(1.))
                    variable_summaries(zu_u, suffix='zu_u{}'.format(i))
                with tf.variable_scope('z{}_zu_proj'.format(i), reuse=reuse) as s:
                    z_zu = fc(tf.multiply(prevZ, zu_u), sz, reuse=reuse, scope=s,
                              bias=False, regularizer=reg)
                    variable_summaries(z_zu, suffix='z_zu{}'.format(i))
                z_zs.append(z_zu)
                z_add.append(z_zu)

            with tf.variable_scope('z{}_yu_u'.format(i), reuse=reuse) as s:
                yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True,
                          regularizer=reg, bias_init=tf.constant_initializer(1.))
                variable_summaries(yu_u, suffix='yu_u{}'.format(i))
            with tf.variable_scope('z{}_yu'.format(i), reuse=reuse) as s:
                z_yu = fc(tf.multiply(y, yu_u), sz, reuse=reuse, scope=s, bias=False,
                          regularizer=reg)
                z_ys.append(z_yu)
                variable_summaries(z_yu, suffix='z_yu{}'.format(i))
            z_add.append(z_yu)

            with tf.variable_scope('z{}_u'.format(i), reuse=reuse) as s:
                z_u = fc(prevU, sz, reuse=reuse, scope=s,
                         bias=True, regularizer=reg,
                         bias_init=tf.constant_initializer(0.))
                variable_summaries(z_u, suffix='z_u{}'.format(i))
            z_us.append(z_u)
            z_add.append(z_u)

            z = tf.add_n(z_add)
            variable_summaries(z, suffix='z{}_preact'.format(i))
            if i < nLayers:
                # z = tf.nn.relu(z)
                z = lrelu(z, alpha=FLAGS.lrelu)
                variable_summaries(z, suffix='z{}_act'.format(i))

            zs.append(z)
            prevU = us[i] if i < nLayers else None
            prevZ = z

        print('z shape', z.get_shape())
        z = tf.reshape(z, [-1], name='energies')
        return z

    def save(self, path):
        self.saver.save(self.sess, path)

    def restore(self, filename):
        """
        IMPORTANT:
        `filename` should be the common path prefix of the following four checkpoint files:
            - 50314.index
            - 50314.meta
            - 50314.data-00000-of-00001
            - checkpoint
        It should not include any extension, so in this case it would be `tensorboard/models/50314`.
        (The prefix is `50314` because the global training step was used as the checkpoint name.)

        NOTE: the agent must have been instantiated with `finalize_graph=False` for restore to work.
        """
        self.saver = tf.train.import_meta_graph(filename+'.meta')
        self.saver.restore(self.sess, filename)
        self.sess.graph.finalize()

    def __del__(self):
        self.sess.close()
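# Editor's note: a minimal, standalone sketch of the Ornstein-Uhlenbeck exploration noise that
# act() above accumulates in self.noise (theta pulls the noise back toward zero, sigma injects
# fresh Gaussian noise each step). The theta/sigma defaults are illustrative, not FLAGS values.
import numpy as np

def ou_noise_step(noise, theta=0.15, sigma=0.2):
    """One Euler step of an OU process: noise <- noise - theta * noise + sigma * N(0, I)."""
    return noise - theta * noise + sigma * np.random.randn(*np.shape(noise))

# Usage sketch: keep one noise vector per episode and add it to the greedy action before clipping,
# e.g. noise = ou_noise_step(noise); action = np.clip(greedy_action + noise, a_min, a_max)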
def dist_learn(env,
               q_dist_func,
               num_atoms=51,
               V_max=10,
               lr=25e-5,
               max_timesteps=100000,
               buffer_size=50000,
               exploration_fraction=0.01,
               exploration_final_eps=0.008,
               train_freq=1,
               batch_size=32,
               print_freq=1,
               checkpoint_freq=2000,
               learning_starts=1000,
               gamma=1.0,
               target_network_update_freq=500,
               prioritized_replay=False,
               prioritized_replay_alpha=0.6,
               prioritized_replay_beta0=0.4,
               prioritized_replay_beta_iters=None,
               prioritized_replay_eps=1e-6,
               num_cpu=1,
               callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_dist_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If None, defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns True, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = U.single_threaded_session()
    sess.__enter__()
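    # Editor's note (illustrative helper, never called): a sketch of the fixed support grid a
    # categorical/distributional Q head of this kind typically uses -- num_atoms evenly spaced
    # atoms, here assumed to lie on [-V_max, V_max]; Q(s, a) is then the expectation of the
    # predicted probabilities over that support. build_dist_train's actual convention may differ.
    def _example_support(num_atoms=num_atoms, v_max=V_max):
        atoms = np.linspace(-v_max, v_max, num_atoms)
        return atoms  # Q(s, a) = np.dot(p_s_a, atoms) for a probability vector p_s_a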

    def make_obs_ph(name):
        print(name)
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = build_dist_train(
        make_obs_ph=make_obs_ph,
        dist_func=q_dist_func,
        num_actions=env.action_space.n,
        num_atoms=num_atoms,
        V_max=V_max,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

    # act, train, update_target, debug = build_train(
    #     make_obs_ph=make_obs_ph,
    #     q_func=q_func,
    #     num_actions=env.action_space.n,
    #     optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    #     gamma=gamma,
    #     grad_norm_clipping=10
    # )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_dist_func': q_dist_func,
        'num_actions': env.action_space.n,
    }
    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        print(model_file)
        # mkdir_p(os.path.dirname(model_file))
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            action = act(np.array(obs)[None],
                         update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    # print "CCCC"
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                # print "Come1"
                # print np.shape(obses_t), np.shape(actions), np.shape(rewards), np.shape(obses_tp1), np.shape(dones)
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                # print "Loss : {}".format(td_errors)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                print("steps : {}".format(t))
                print("episodes : {}".format(num_episodes))
                print("mean 100 episode reward: {}".format(mean_100ep_reward))
                # logger.record_tabular("steps", t)
                # logger.record_tabular("episodes", num_episodes)
                # logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and t % checkpoint_freq == 0):
                print("==========================")
                print("Error: {}".format(td_errors))
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        print("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                        # logger.log("Saving model due to mean reward increase: {} -> {}".format(
                        #            saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                print("Restored model with mean reward: {}".format(
                    saved_mean_reward))
                # logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
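# Editor's note: a hedged sketch of the linear annealing that LinearSchedule provides above,
# used both for the exploration epsilon and the prioritized-replay beta. It is a minimal
# stand-in under the assumption that the value clamps at final_p after schedule_timesteps
# steps; it is not the original implementation.
def linear_schedule_value(t, schedule_timesteps, initial_p=1.0, final_p=0.02):
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# e.g. linear_schedule_value(0, 10000) == 1.0 and linear_schedule_value(10000, 10000) == 0.02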
def main():

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    env = gym.make("FrozenLake-v0")
    #    env = gym.make("FrozenLake8x8-v0")

    # Dictionary-based value function
    q_func_tabular = {}
    defaultQValue = np.ones(env.action_space.n)

    # Given an integer, return the corresponding boolean array
    def getBoolBits(state):
        return np.unpackbits(np.uint8(state), axis=1) == 1

    # each row of vectorKey must be a boolean vector of at most 64 bits so it packs into one uint64 key
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([
            q_func_tabular[x] if x in q_func_tabular else defaultQValue
            for x in keys
        ])

#    def trainTabular(vectorKey,qCurrTargets,weights):

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 0.1
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                q_func_tabular[keys[i]] = (1 - alpha) * q_func_tabular[
                    keys[i]] + alpha * qCurrTargets[i]
                # weighted variant: q_func_tabular[keys[i]] += alpha * weights[i, :] * (qCurrTargets[i] - q_func_tabular[keys[i]])
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]

    max_timesteps = 200000
    exploration_fraction = 0.3
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 10
    buffer_size = 100
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    valueFunctionType = "TABULAR"
    #    valueFunctionType = "DQN"

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Set up replay buffer
    prioritized_replay = True
    #    prioritized_replay=False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    state = env.reset()

    episode_rewards = [0.0]
    timerStart = time.time()
    for t in range(max_timesteps):

        #        np.unpackbits(np.uint8(np.reshape(states_tp1,[batch_size,1])),axis=1)
        qCurr = getTabular(getBoolBits([[state]]))

        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly

        # epsilon-greedy: take the (noisy) argmax action, or a random action with prob epsilon
        action = np.argmax(qCurrNoise)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        nextState, rew, done, _ = env.step(action)

        replay_buffer.add(state, action, rew, nextState, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actions, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actions, rewards, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            qNext = getTabular(
                getBoolBits(np.reshape(states_tp1, [batch_size, 1])))

            qNextmax = np.max(qNext, axis=1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTarget = getTabular(
                getBoolBits(np.reshape(states_t, [batch_size, 1])))

            td_error = qCurrTarget[range(batch_size), actions] - targets
            qCurrTarget[range(batch_size), actions] = targets

            trainTabular(getBoolBits(np.reshape(states_t, [batch_size, 1])),
                         qCurrTarget)

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            nextState = env.reset()  # start the next episode from the reset state (picked up below)
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        state = np.copy(nextState)
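# Editor's note: a standalone sketch of the tabular-key encoding used by getTabularKeys() above:
# boolean state bits are packed into bytes and combined into a single uint64 key per row, so the
# q_func_tabular dictionary can be indexed by state. Purely illustrative.
import numpy as np

def bool_bits_to_keys(bool_rows):
    """bool_rows: (N, n_bits) boolean array with n_bits <= 64. Returns (N,) uint64 keys."""
    packed = np.packbits(bool_rows, axis=1)            # (N, ceil(n_bits / 8)) uint8
    keys = np.zeros(packed.shape[0], dtype=np.uint64)
    for i in range(packed.shape[1]):
        keys += np.uint64(256 ** i) * packed[:, i].astype(np.uint64)
    return keys

# bool_bits_to_keys(np.unpackbits(np.uint8([[3]]), axis=1) == 1) -> array([3], dtype=uint64)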
Beispiel #10
0
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    num_episodes,
                    experiment_dir,
                    replay_buffer_size=500000,
                    replay_buffer_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    prioritized_replay_alpha=0.6,
                    prioritized_replay_beta0=0.4,
                    prioritized_replay_beta_iters=500000,
                    prioritized_replay_eps=1e-6):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_buffer_size: Size of the replay buffer
        replay_buffer_init_size: Number of random experiences to sample when initializing
          the replay buffer.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # The replay buffer

    replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                            alpha=prioritized_replay_alpha)
    beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                   initial_p=prioritized_replay_beta0,
                                   final_p=1.0)
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  episode_transbag=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    # monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
        copy_model_parameters(sess, q_estimator, target_estimator)
        print("\nCopied model parameters to target network.")
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, env.action_space.n)

    # Populate the replay buffer with initial experience
    print("Populating replay buffer...")
    state = env.reset_test()
    for i in range(replay_buffer_init_size):
        action_probs = policy(sess, state,
                              epsilons[min(total_t, epsilon_decay_steps - 1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.add(state, action, reward, next_state, done)

        if i % 1000 == 0:
            print("\r{} in {} ".format(i, replay_buffer_init_size), end="")
            sys.stdout.flush()
            state = env.reset_test()
        else:
            state = next_state

    # Record videos
    # Use the gym env Monitor wrapper
    # env = Monitor(env,
    #               directory=monitor_path,
    #               resume=True,
    #               video_callable=lambda count: count % record_video_every ==0)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset_test()
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss),
                  end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, data_overflow = env.step(action)

            # Save transition to replay buffer
            replay_buffer.add(state, action, reward, next_state, done)
            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            stats.episode_transbag[i_episode] += data_overflow
            # Sample a minibatch from the replay buffer
            experience = replay_buffer.sample(
                batch_size, beta=beta_schedule.value(total_t))
            (states_batch, action_batch, reward_batch, next_states_batch,
             done_batch, weights_batch, batch_idxes) = experience

            # Calculate q values and targets (Double DQN)
            q_values_next = q_estimator.predict(sess, next_states_batch)
            best_actions = np.argmax(q_values_next, axis=1)
            q_values_next_target = target_estimator.predict(
                sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * q_values_next_target[np.arange(batch_size), best_actions]

            # Perform gradient descent update
            loss, td_error = q_estimator.update(sess, states_batch,
                                                action_batch, targets_batch,
                                                weights_batch)

            new_priorities = np.abs(td_error) + prioritized_replay_eps
            replay_buffer.update_priorities(batch_idxes, new_priorities)

            # if done:
            #     break
            if t >= 1000:
                break
            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(
            simple_value=stats.episode_rewards[i_episode],
            node_name="episode_reward",
            tag="episode_reward")
        episode_summary.value.add(
            simple_value=stats.episode_lengths[i_episode],
            node_name="episode_length",
            tag="episode_length")
        episode_summary.value.add(
            simple_value=stats.episode_transbag[i_episode],
            node_name="episode_transbag",
            tag="episode_transbag")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1],
            episode_transbag=stats.episode_transbag[:i_episode + 1])

    #env.monitor.close()
    q_estimator.summary_writer.add_graph(sess.graph)
    return stats
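# Editor's note: a compact numpy sketch of the Double-DQN target computed in the loop above:
# the online network picks the argmax action for the next state, and the target network supplies
# that action's value. Names and shapes are illustrative.
import numpy as np

def double_dqn_targets(rewards, dones, q_next_online, q_next_target, gamma=0.99):
    """rewards, dones: (B,) arrays; q_next_online, q_next_target: (B, num_actions) arrays."""
    best_actions = np.argmax(q_next_online, axis=1)
    bootstrap = q_next_target[np.arange(len(rewards)), best_actions]
    return rewards + (1.0 - dones.astype(np.float32)) * gamma * bootstrap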
class PrioritizedDQNAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 lr,
                 update_every,
                 update_mem_every,
                 update_mem_par_every,
                 experience_per_sampling,
                 seed=25,
                 epsilon=1,
                 epsilon_min=0.01,
                 eps_decay=0.999,
                 compute_weights=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.experience_per_sampling = experience_per_sampling
        self.update_mem_every = update_mem_every
        self.update_mem_par_every = update_mem_par_every
        self.seed = random.seed(seed)
        self.learn_steps = 0
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.eps_decay = eps_decay
        self.compute_weights = compute_weights

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.scheduler = StepLR(self.optimizer, step_size=1, gamma=0.995)

        # Replay memory
        self.memory = PrioritizedReplayBuffer(self.action_size,
                                              self.buffer_size,
                                              self.batch_size,
                                              self.experience_per_sampling,
                                              self.seed, self.compute_weights)
        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % self.update_every
        self.t_step_mem = (self.t_step_mem + 1) % self.update_mem_every
        self.t_step_mem_par = (self.t_step_mem_par +
                               1) % self.update_mem_par_every
        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.experience_count > self.experience_per_sampling:
                sampling = self.memory.sample()
                self.learn(sampling)
        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, state):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
        """
        self.epsilon = max(self.epsilon * self.eps_decay, self.epsilon_min)

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
            #print(action_values)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > self.epsilon:
            #print(np.argmax(action_values.cpu().data.numpy()))
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, sampling):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            sampling (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, weights, indices = sampling

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        if self.compute_weights:
            with torch.no_grad():
                weight = sum(np.multiply(weights, loss.data.cpu().numpy()))
            loss *= weight

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        self.learn_steps += 1

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

        # ------------------- update priorities ------------------- #
        delta = (Q_targets - Q_expected).detach().abs().cpu().numpy()
        self.memory.update_priorities(delta, indices)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
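# Editor's note: a hedged sketch of the importance-sampling correction that compute_weights=True
# is meant to supply for this agent: the standard PER weights w_i = (N * P(i))^(-beta),
# normalized so the largest weight is 1. The buffer internals here are assumptions, not taken
# from the PrioritizedReplayBuffer used above.
import numpy as np

def per_is_weights(priorities, beta, alpha=0.6):
    probs = priorities ** alpha
    probs = probs / probs.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return weights / weights.max()

# e.g. per_is_weights(np.array([1.0, 2.0, 4.0]), beta=0.4) down-weights the frequently sampled,
# high-priority transitions relative to the rare ones.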
Beispiel #12
0
                losses.register_hook(lambda x: x * torch.as_tensor(
                    weights, device=device, dtype=torch.float32))
            final_loss = losses.mean()
            # weighted_losses = losses * torch.as_tensor(weights, device=device) if use_per else losses
            # final_loss = weighted_losses.mean()
            optimizer.zero_grad()
            final_loss.backward()
            if dueling:
                torch.nn.utils.clip_grad_norm_(behavior_model.parameters(),
                                               grad_norm)
            optimizer.step()

            if use_per:
                td_error = (labels - predictions).detach().cpu().numpy()
                memory.update_priorities(
                    idxes,
                    np.abs(td_error + 0.001 * np.min(td_error[td_error != 0])))

    if episode == 0 or episode % 10 == 0:
        print(
            f'finished episode {episode} at timestep {step_counter} with reward {tot_reward}'
        )
        # logging.info(f'finished episode {episode} at timestep {step_counter} with reward {tot_reward}')
        with open('cache/rewards.pkl', 'wb') as f:
            pickle.dump(rewards, f)
    if episode % 200 == 0:
        torch.save(behavior_model, f'models/model_{episode}.pt')
    rewards.append(tot_reward)
    writer.add_scalar("Reward/Episode", tot_reward, episode)
    writer.add_scalar("Reward/Timestep", tot_reward, step_counter)
    writer.add_scalar("Epsilon/Episode", epsilon, episode)
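# Editor's note: an equivalent and arguably clearer way to apply the PER importance weights used
# in the snippet above -- scale the per-sample losses directly before reducing, instead of
# registering a gradient hook. A minimal sketch; `losses` is assumed to be an unreduced
# per-sample loss tensor (e.g. from a criterion with reduction='none').
import torch

def weighted_loss(losses, weights, device):
    w = torch.as_tensor(weights, device=device, dtype=torch.float32)
    return (losses * w).mean()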
Beispiel #13
0
class DQNAgent(object):
    def __init__(
        self,
        stateShape,
        actionSpace,
        numPicks,
        memorySize,
        numRewards,
        sync=50,
        burnin=0,  #500,
        alpha=0.0001,
        epsilon=1,
        epsilon_decay=0.9995,
        epsilon_min=0.01,
        gamma=0.99,
    ):
        self.numPicks = numPicks
        self.replayMemory = PrioritizedReplayBuffer(memorySize, 0.6)
        self.stateShape = stateShape
        self.actionSpace = actionSpace

        self.step = 0

        self.sync = sync
        self.burnin = burnin
        self.alpha = alpha
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.gamma = gamma

        self.walpha = 0.01
        self.delay = 1

        self.numRewards = numRewards

        self.trainNetwork = self.createNetwork(stateShape, len(actionSpace),
                                               self.alpha)
        self.targetNetwork = self.createNetwork(stateShape, len(actionSpace),
                                                self.alpha)
        self.targetNetwork.set_weights(self.trainNetwork.get_weights())

    def createNetwork(self, n_input, n_output, learningRate):
        model = keras.models.Sequential()

        model.add(
            keras.layers.experimental.preprocessing.Rescaling(
                1.0 / 255, input_shape=n_input))
        model.add(
            keras.layers.Conv2D(32,
                                kernel_size=8,
                                strides=4,
                                activation="relu"))
        model.add(
            keras.layers.Conv2D(64,
                                kernel_size=4,
                                strides=2,
                                activation="relu"))
        model.add(
            keras.layers.Conv2D(64,
                                kernel_size=3,
                                strides=1,
                                activation="relu"))
        model.add(keras.layers.Flatten())
        model.add(keras.layers.Dense(512, activation="linear"))
        model.add(keras.layers.Dense(n_output, activation="linear"))

        model.compile(loss=keras.losses.Huber(),
                      optimizer=keras.optimizers.Adam(lr=learningRate))
        print(model.summary())
        return model

    def trainDQN(self):
        if self.step <= self.numPicks or len(self.replayMemory) <= self.burnin:
            return 0

        self.beta = min(1.0, 0.4 + self.step * (1.0 - 0.4) / 30000)  # anneal PER beta toward 1
        samples = self.replayMemory.sample(self.numPicks, self.beta)
        currStates, actions, rewards, nextStates, dones, weights, indices = samples

        currStates = np.array(currStates).transpose(0, 2, 3, 1)
        Q_currents = self.trainNetwork(currStates, training=False).numpy()

        nextStates = np.array(nextStates).transpose(0, 2, 3, 1)
        Q_futures = self.targetNetwork(nextStates,
                                       training=False).numpy().max(axis=1)

        rewards = (np.array(rewards).reshape(self.numPicks, ).astype(float))
        actions = (np.array(actions).reshape(self.numPicks, ).astype(int))

        dones = np.squeeze(np.array(dones)).astype(bool)
        notDones = (~dones).astype(float)
        dones = dones.astype(float)

        Q_currents_cp = deepcopy(Q_currents)
        Q_currents_cp[np.arange(self.numPicks),
                      actions] = (rewards + Q_futures * self.gamma * notDones)

        h = tf.keras.losses.Huber()
        loss = h(
            Q_currents[np.arange(self.numPicks), actions],
            Q_currents_cp[np.arange(self.numPicks), actions],
        )
        prios = (np.abs(loss) * weights) + 1e-5
        self.replayMemory.update_priorities(indices, prios)

        loss = self.trainNetwork.train_on_batch(currStates, Q_currents_cp)
        return loss

    def selectAction(self, state):
        self.step += 1
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

        q = -100000
        if np.random.rand(1) < self.epsilon:
            action = np.random.randint(0, 3)
        else:
            preds = np.squeeze(
                self.trainNetwork(
                    np.expand_dims(np.array(state).transpose(1, 2, 0), 0),
                    training=False,
                ).numpy(),
                axis=0,
            )
            action = np.argmax(preds)
            q = preds[action]
        return action, q

    def addMemory(self, state, action, reward, nextState, done):
        self.replayMemory.add(state, action, reward, nextState, done)

    def save(self):
        save_path = f"./dst_net_{int(self.step)}.chkpt"
        train_w = self.trainNetwork.get_weights()
        target_w = self.targetNetwork.get_weights()

        with open(save_path, "wb") as f:
            pickle.dump([train_w, target_w], f)

        print(f"DSTNet saved to {save_path} done!")

    def load(self):
        save_path = "./dst_net_mixed.chkpt"
        with open(save_path, "rb") as f:
            weights = pickle.load(f)

        self.trainNetwork.set_weights(weights[0])
        self.targetNetwork.set_weights(weights[1])
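# Editor's note: a hedged numpy sketch of per-sample priorities for the batch used in trainDQN()
# above. trainDQN() derives priorities from a scalar Huber loss; the usual PER recipe instead
# uses each transition's own absolute TD error plus a small epsilon, as sketched here.
import numpy as np

def per_sample_priorities(q_selected, td_targets, eps=1e-5):
    """q_selected, td_targets: (B,) values for the actions actually taken in the batch."""
    return np.abs(td_targets - q_selected) + eps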
Beispiel #14
0
def learn(env_id,
          q_func,
          lr=5e-4,
          max_timesteps=10000,
          buffer_size=5000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          train_steps=10,
          learning_starts=500,
          batch_size=32,
          print_freq=10,
          checkpoint_freq=100,
          model_dir=None,
          gamma=1.0,
          target_network_update_freq=50,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          player_processes=None,
          player_connections=None):
    env, _, _ = create_gvgai_environment(env_id)

    # Create all the functions necessary to train the model
    # expert_decision_maker = ExpertDecisionMaker(env=env)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    session = tf.Session()
    session.__enter__()
    policy_path = os.path.join(model_dir, "Policy.pkl")
    model_path = os.path.join(model_dir, "model", "model")
    if os.path.isdir(os.path.join(model_dir, "model")):
        load_state(model_path)
    else:
        act_params = {
            'make_obs_ph': make_obs_ph,
            'q_func': q_func,
            'num_actions': env.action_space.n,
        }
        act = ActWrapper(act, act_params)
        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()
        act.save(policy_path)
        save_state(model_path)
    env.close()
    # Create the replay buffer
    if prioritized_replay:
        replay_buffer_path = os.path.join(model_dir, "Prioritized_replay.pkl")
        if os.path.isfile(replay_buffer_path):
            with open(replay_buffer_path, 'rb') as input_file:
                replay_buffer = pickle.load(input_file)
        else:
            replay_buffer = PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer_path = os.path.join(model_dir, "Normal_replay.pkl")
        if os.path.isfile(replay_buffer_path):
            with open(replay_buffer_path, 'rb') as input_file:
                replay_buffer = pickle.load(input_file)
        else:
            replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    episode_rewards = list()
    saved_mean_reward = -999999999

    signal.signal(signal.SIGQUIT, signal_handler)
    global terminate_learning

    total_timesteps = 0
    for timestep in range(max_timesteps):
        if terminate_learning:
            break

        for connection in player_connections:
            experiences, reward = connection.recv()
            episode_rewards.append(reward)
            for experience in experiences:
                replay_buffer.add(*experience)
                total_timesteps += 1

        if total_timesteps < learning_starts:
            if timestep % 10 == 0:
                print("not started yet", flush=True)
            continue

        if timestep % train_freq == 0:
            for i in range(train_steps):
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(total_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

        if timestep % target_network_update_freq == 0:
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if print_freq is not None and timestep % print_freq == 0:
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
            logger.record_tabular(
                "% time spent exploring",
                int(100 * exploration.value(total_timesteps)))
            logger.dump_tabular()

        if timestep % checkpoint_freq == 0 and mean_100ep_reward > saved_mean_reward:
            act.save(policy_path)
            save_state(model_path)
            saved_mean_reward = mean_100ep_reward
            with open(replay_buffer_path, 'wb') as output_file:
                pickle.dump(replay_buffer, output_file,
                            pickle.HIGHEST_PROTOCOL)
            send_message_to_all(player_connections, Message.UPDATE)

    send_message_to_all(player_connections, Message.TERMINATE)
    if mean_100ep_reward > saved_mean_reward:
        act.save(policy_path)
    with open(replay_buffer_path, 'wb') as output_file:
        pickle.dump(replay_buffer, output_file, pickle.HIGHEST_PROTOCOL)
    for player_process in player_processes:
        player_process.join()
        # player_process.terminate()

    return act.load(policy_path)
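# Editor's note: a minimal sketch of the replay-buffer persistence pattern used above -- the
# buffer is pickled whenever the policy checkpoint improves and reloaded on the next run so
# training can resume with its collected experience. Paths and the buffer factory are
# placeholders, not the original configuration.
import os
import pickle

def load_or_create_buffer(path, make_buffer):
    if os.path.isfile(path):
        with open(path, 'rb') as input_file:
            return pickle.load(input_file)
    return make_buffer()

def save_buffer(path, buffer):
    with open(path, 'wb') as output_file:
        pickle.dump(buffer, output_file, pickle.HIGHEST_PROTOCOL)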
Beispiel #15
0
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=100000,
          exploration_fraction=0.1,
          exploration_final_eps=0.1,
          train_freq=1,
          batch_size=64,
          print_freq=1,
          eval_freq=2500,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          csv_path="results.csv",
          method_type="baseline",
          **network_kwargs):
    """Train a deepr model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepr.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If None, defaults to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns True, training stops.
    load_path: str
        path to load the model from. (default: None)
    csv_path: str
        path of the CSV file to which evaluation results (STEPS, REWARD) are written
    method_type: str
        action-masking / reward-shaping variant (e.g. 'baseline' or 'shaping');
        passed to get_mask and used to decide whether look-ahead shaping is applied
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepr/categorical.py for details on the act function.
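
    Examples
    -------
    A minimal usage sketch (hypothetical environment and settings; assumes a
    discrete-action gym env and the helpers imported by this module)::

        import gym

        env = gym.make("CartPole-v0")
        act = learn(env, network="mlp", total_timesteps=100000,
                    prioritized_replay=True, csv_path="results.csv",
                    method_type="baseline")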
    """
    # Create all the functions necessary to train the model

    sess = get_session()
    set_global_seeds(seed)

    #q_func = build_q_func(network, **network_kwargs)
    q_func = build_q_func(mlp(num_layers=4, num_hidden=64), **network_kwargs)
    #q_func = build_q_func(mlp(num_layers=2, num_hidden=64, activation=tf.nn.relu), **network_kwargs)
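    # NOTE: the `network` argument is effectively ignored here; the Q-function is
    # built from a hard-coded 4-layer, 64-unit MLP instead.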

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
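    # With prioritized replay, alpha controls how strongly TD error shapes the
    # sampling distribution, and beta (annealed to 1.0 by the schedule above)
    # scales the importance-sampling correction (cf. Schaul et al., 2016).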
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * total_timesteps),
        #initial_p=1.0,
        initial_p=exploration_final_eps,
        final_p=exploration_final_eps)
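    # Note that initial_p == final_p here, so exploration is effectively held
    # constant at exploration_final_eps; the schedule is still used for the
    # param-noise threshold below.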

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    eval_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))

        csvfile = open(csv_path, 'w', newline='')
        fieldnames = ['STEPS', 'REWARD']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for t in range(total_timesteps + 1):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                #update_eps = exploration.value(t)
                update_eps = exploration_final_eps
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            action_mask = get_mask(env, method_type)
            a = act(np.array(obs)[None],
                    unused_actions_neginf_mask=action_mask,
                    update_eps=update_eps,
                    **kwargs)[0]

            env_action = a
            reset = False
            new_obs, rew, done, _ = env.step(env_action)

            eval_rewards[-1] += rew

            action_mask_p = get_mask(env, method_type)
            # Shaping
            if method_type == 'shaping':

                ## look-ahead shaping
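                ## The shaping term below resembles potential-based shaping,
                ## F = Phi(s', a') - Phi(s, a), with the action-mask values
                ## returned by get_mask serving as the potential Phi.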
                ap = act(np.array(new_obs)[None],
                         unused_actions_neginf_mask=action_mask_p,
                         stochastic=False)[0]
                f = action_mask_p[ap] - action_mask[a]
                rew = rew + f

            # Store transition in the replay buffer.
            #replay_buffer.add(obs, a, rew, new_obs, float(done), action_mask_p)
            if method_type != 'shaping':
                replay_buffer.add(obs, a, rew, new_obs, float(done),
                                  np.zeros(env.action_space.n))
            else:
                replay_buffer.add(obs, a, rew, new_obs, float(done),
                                  action_mask_p)
            obs = new_obs

            if t % eval_freq == 0:
                eval_rewards.append(0.0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones, masks_tp1 = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights, masks_tp1)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_eval_reward = round(np.mean(eval_rewards[-1 - print_freq:-1]),
                                     1)
            num_evals = len(eval_rewards)
            if t > 0 and t % eval_freq == 0 and print_freq is not None and t % (
                    print_freq * eval_freq) == 0:
                #if done and print_freq is not None and len(eval_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("evals", num_evals)
                logger.record_tabular("average reward in this eval",
                                      mean_eval_reward / (eval_freq))
                logger.record_tabular("total reward in this eval",
                                      mean_eval_reward)
                logger.dump_tabular()

                writer.writerow({
                    "STEPS": t,
                    "REWARD": mean_eval_reward / (eval_freq)
                })
                csvfile.flush()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_evals > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_eval_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_eval_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_eval_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            load_variables(model_file)

    return act
Beispiel #16
0
def main():

    #    env = gym.make("CartPoleRob-v0")
    #    env = gym.make("CartPole-v0")
    #    env = gym.make("CartPole-v1")
    #    env = gym.make("Acrobot-v1")
    #    env = gym.make("MountainCarRob-v0")
    #    env = gym.make("FrozenLake-v0")
    #    env = gym.make("FrozenLake8x8-v0")
    #    env = gym.make("FrozenLake8x8rob-v0")
    #    env = gym.make("FrozenLake16x16rob-v0")
    env = gym.make("TestRob3-v0")

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obses_t, windowLen):
        deicticObses_t = []
        for i in range(np.shape(obses_t)[0] - windowLen + 1):
            for j in range(np.shape(obses_t)[1] - windowLen + 1):
                deicticObses_t.append(obses_t[i:i + windowLen,
                                              j:j + windowLen, :])
        return np.array(deicticObses_t)

    # get set of deictic alternatives
    # input: batch x n x n x channels
    # output: (batch x deictic) x dn x dn x channels
    def getDeictic(obses_t, actions, obses_tp1, weights, windowLen):
        deicticObses_t = []
        deicticActions = []
        deicticObses_tp1 = []
        deicticWeights = []
        for i in range(np.shape(obses_t)[0]):
            for j in range(np.shape(obses_t)[1] - windowLen + 1):
                for k in range(np.shape(obses_t)[2] - windowLen + 1):
                    deicticObses_t.append(obses_t[i, j:j + windowLen,
                                                  k:k + windowLen, :])
                    deicticActions.append(actions[i])
                    deicticObses_tp1.append(obses_tp1[i, j:j + windowLen,
                                                      k:k + windowLen, :])
                    deicticWeights.append(weights[i])

        return np.array(deicticObses_t), np.array(deicticActions), np.array(
            deicticObses_tp1), np.array(deicticWeights)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong
        #        hiddens=[256],  # used in pong
        #        convs=[(8,4,1)], # used for non-deictic TestRob3-v0
        #        convs=[(8,3,1)], # used for deictic TestRob3-v0
        convs=[(16, 3, 1)],  # used for deictic TestRob3-v0
        #        convs=[(4,3,1)], # used for deictic TestRob3-v0
        #        convs=[(16,3,1)], # used for deictic TestRob3-v0
        #        convs=[(8,2,1)], # used for deictic TestRob3-v0
        hiddens=[16],
        dueling=True)

    #    model = models.mlp([6])

    # parameters
    q_func = model
    lr = 1e-3
    #    lr=1e-4
    #    max_timesteps=100000
    #    max_timesteps=50000
    max_timesteps = 20000
    buffer_size = 50000
    #    exploration_fraction=0.1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    #    exploration_final_eps=0.005
    #    exploration_final_eps=0.1
    print_freq = 10
    checkpoint_freq = 10000
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500
    prioritized_replay = False
    #    prioritized_replay=True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    #    batch_size=32
    #    train_freq=1
    #    batch_size=64
    #    train_freq=2
    #    batch_size=128
    #    train_freq=4
    #    batch_size=256
    #    train_freq=4
    batch_size = 512
    train_freq = 8

    # deicticShape must be square.
    # These two parameters need to be consistent w/ each other.
    #    deicticShape = (2,2,1)
    #    num_deictic_patches=36
    deicticShape = (3, 3, 1)
    num_deictic_patches = 36
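    # For an n x n observation and window w, getDeicticObs yields (n - w + 1)^2
    # patches; with the 8x8 TestRob3 observation and w = 3 that is 6 * 6 = 36.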

    #    deicticShape = (4,4,1)
    #    num_deictic_patches=25
    #    deicticShape = (5,5,1)
    #    num_deictic_patches=16
    #    deicticShape = (6,6,1)
    #    num_deictic_patches=9
    #    deicticShape = (7,7,1)
    #    num_deictic_patches=4
    #    deicticShape = (8,8,1)
    #    num_deictic_patches=1

    def make_obs_ph(name):
        #        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(deicticShape, name=name)

    matchShape = (batch_size * 25, )

    def make_match_ph(name):
        return U.BatchInput(matchShape, name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    #    act, train, update_target, debug = build_graph.build_train(
    #    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
    #    getq, train, trainWOUpdate, debug = build_graph.build_train_deictic(
    #    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min(
        make_obs_ph=make_obs_ph,
        make_match_ph=make_match_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        batch_size=batch_size,
        num_deictic_patches=num_deictic_patches,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        double_q=False)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()

    #    with tempfile.TemporaryDirectory() as td:
    model_saved = False
    #        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        # get action to take
        #        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        #        qvalues = getq(np.array(obs)[None])
        #        action = np.argmax(qvalues)
        #        if np.random.rand() < exploration.value(t):
        #            action = np.random.randint(env.action_space.n)

        deicticObs = getDeicticObs(obs, deicticShape[0])
        qvalues = getq(np.array(deicticObs))
        action = np.argmax(np.max(qvalues, 0))
        selPatch = np.argmax(np.max(qvalues, 1))
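        # qvalues has shape (num_patches, num_actions): the chosen action maximizes
        # the patch-wise maximum Q, and selPatch records which patch attained it.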
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # temporarily take uniformly random actions all the time
        # action = np.random.randint(env.action_space.n)
        # env.render()

        new_obs, rew, done, _ = env.step(action)

        # display state, action, nextstate
        if t > 20000:
            toDisplay = np.reshape(new_obs, (8, 8))
            patchRow = np.int32(
                np.floor_divide(selPatch, np.sqrt(num_deictic_patches)))
            patchCol = np.int32(
                np.remainder(selPatch, np.sqrt(num_deictic_patches)))
            toDisplay[patchRow, patchCol] = 50
            print(
                "Current/next state. 50 denotes the upper left corner of the deictic patch."
            )
            print(str(toDisplay))


        # env.render()

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
            if t > 20000:
                print("q-values:")
                print(str(qvalues))
                print("*** Episode over! ***\n\n")

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size,
                                                  beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Convert batch to deictic format
            obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(
                obses_t, actions, obses_tp1, weights, deicticShape[0])

            obses_t_deic_fingerprints = [
                np.reshape(obses_t_deic[i],
                           [deicticShape[0] * deicticShape[1]])
                for i in range(np.shape(obses_t_deic)[0])
            ]
            _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints,
                                               axis=0,
                                               return_index=True,
                                               return_inverse=True)
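            # Each flattened patch acts as a "fingerprint"; with return_inverse=True,
            # fingerprintMatch maps every patch in the batch to the index of its
            # group of identical patches.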
            #            matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)]

            #            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
            #            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
            #            debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
            #            debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)
            #            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)
            #            td_errors2, min_values_of_groups2, match_onehot2 = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)

            td_errors, min_values_of_groups, match_onehot = train(
                obses_t_deic, actions_deic, rewards, obses_tp1_deic,
                fingerprintMatch, dones, weights_deic)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:

            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)

        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))))

            if t > learning_starts and t % train_freq == 0:
                group_counts = np.sum(match_onehot, 1)
                print(str(min_values_of_groups[min_values_of_groups < 1000]))
                #                print(str(min_values_of_groups2[min_values_of_groups2 < 1000]))
                print(str(group_counts[group_counts > 0]))

                # display one of most valuable deictic patches
                min_values_of_groups_trunc = min_values_of_groups[
                    min_values_of_groups < 1000]
                most_valuable_patches_idx = np.argmax(
                    min_values_of_groups_trunc)
                most_valuable_patches = obses_t_deic[fingerprintMatch ==
                                                     most_valuable_patches_idx]
                print(
                    str(np.reshape(most_valuable_patches[0],
                                   deicticShape[0:2])))
                print(
                    "value of most valuable patch: " +
                    str(min_values_of_groups_trunc[most_valuable_patches_idx]))
                print("sum group counts: " + str(np.sum(group_counts)))

    num2avg = 20
    rListAvg = np.convolve(episode_rewards, np.ones(num2avg)) / num2avg
    plt.plot(rListAvg)
    #    plt.plot(episode_rewards)
    plt.show()

    sess.close()
Beispiel #17
0
def learn(env,
          network,
          seed=None,
          lr=5e-5,
          total_timesteps=100000,
          buffer_size=500000,
          exploration_fraction=0.1,
          exploration_final_eps=0.01,
          train_freq=1,
          batch_size=32,
          print_freq=10,
          checkpoint_freq=100000,
          checkpoint_path=None,
          learning_starts=0,
          gamma=0.99,
          target_network_update_freq=10000,
          prioritized_replay=True,
          prioritized_replay_alpha=0.4,
          prioritized_replay_beta0=0.6,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-3,
          param_noise=False,
          callback=None,
          load_path=None,
          load_idx=None,
          demo_path=None,
          n_step=10,
          demo_prioritized_replay_eps=1.0,
          pre_train_timesteps=750000,
          epsilon_schedule="constant",
          **network_kwargs):
    # Create all the functions necessary to train the model
    set_global_seeds(seed)
    q_func = build_q_func(network, **network_kwargs)

    with tf.device('/GPU:0'):
        model = DQfD(q_func=q_func,
                     observation_shape=env.observation_space.shape,
                     num_actions=env.action_space.n,
                     lr=lr,
                     grad_norm_clipping=10,
                     gamma=gamma,
                     param_noise=param_noise)

    # Load model from checkpoint
    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=model)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        if load_idx is None:
            ckpt.restore(manager.latest_checkpoint)
            print("Restoring from {}".format(manager.latest_checkpoint))
        else:
            ckpt.restore(manager.checkpoints[load_idx])
            print("Restoring from {}".format(manager.checkpoints[load_idx]))

    # Setup demo trajectory
    assert demo_path is not None
    with open(demo_path, "rb") as f:
        trajectories = pickle.load(f)

    # Create the replay buffer
    replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                            prioritized_replay_alpha)
    if prioritized_replay_beta_iters is None:
        prioritized_replay_beta_iters = total_timesteps
    beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                   initial_p=prioritized_replay_beta0,
                                   final_p=1.0)
    temp_buffer = deque(maxlen=n_step)
    is_demo = True
    for epi in trajectories:
        for obs, action, rew, new_obs, done in epi:
            obs, new_obs = np.expand_dims(
                np.array(obs), axis=0), np.expand_dims(np.array(new_obs),
                                                       axis=0)
            if n_step:
                temp_buffer.append((obs, action, rew, new_obs, done, is_demo))
                if len(temp_buffer) == n_step:
                    n_step_sample = get_n_step_sample(temp_buffer, gamma)
                    replay_buffer.demo_len += 1
                    replay_buffer.add(*n_step_sample)
            else:
                replay_buffer.demo_len += 1
                replay_buffer.add(obs[0], action, rew, new_obs[0], float(done),
                                  float(is_demo))
    logger.log("trajectory length:", replay_buffer.demo_len)
    # Create the schedule for exploration
    if epsilon_schedule == "constant":
        exploration = ConstantSchedule(exploration_final_eps)
    else:  # not used
        exploration = LinearSchedule(schedule_timesteps=int(
            exploration_fraction * total_timesteps),
                                     initial_p=1.0,
                                     final_p=exploration_final_eps)

    model.update_target()

    # ============================================== pre-training ======================================================
    start = time()
    num_episodes = 0
    temp_buffer = deque(maxlen=n_step)
    for t in tqdm(range(pre_train_timesteps)):
        # sample and train
        experience = replay_buffer.sample(batch_size,
                                          beta=prioritized_replay_beta0)
        batch_idxes = experience[-1]
        if experience[6] is None:  # for n_step = 0
            obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple(
                map(tf.constant, experience[:6]))
            obses_tpn, rewards_n, dones_n = None, None, None
            weights = tf.constant(experience[-2])
        else:
            obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple(
                map(tf.constant, experience[:-1]))
        td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = model.train(
            obses_t, actions, rewards, obses_tp1, dones, is_demos, weights,
            obses_tpn, rewards_n, dones_n)
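        # Judging by the names, the returned losses correspond to the DQfD terms:
        # 1-step TD (loss_dq), n-step TD (loss_n), supervised large-margin loss on
        # demonstrations (loss_E), and L2 regularization (loss_l2).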

        # Update priorities
        new_priorities = np.abs(td_errors) + np.abs(
            n_td_errors) + demo_prioritized_replay_eps
        replay_buffer.update_priorities(batch_idxes, new_priorities)

        # Update target network periodically
        if t > 0 and t % target_network_update_freq == 0:
            model.update_target()

        # Logging
        elapsed_time = timedelta(seconds=time() - start)
        if print_freq is not None and t % 10000 == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", 0)
            logger.record_tabular("max 100 episode reward", 0)
            logger.record_tabular("min 100 episode reward", 0)
            logger.record_tabular("demo sample rate", 1)
            logger.record_tabular("epsilon", 0)
            logger.record_tabular("loss_td", np.mean(loss_dq.numpy()))
            logger.record_tabular("loss_n_td", np.mean(loss_n.numpy()))
            logger.record_tabular("loss_margin", np.mean(loss_E.numpy()))
            logger.record_tabular("loss_l2", np.mean(loss_l2.numpy()))
            logger.record_tabular("losses_all", weighted_error.numpy())
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.record_tabular("pre_train", True)
            logger.record_tabular("elapsed time", elapsed_time)
            logger.dump_tabular()

    # ============================================== exploring =========================================================
    sample_counts = 0
    demo_used_counts = 0
    episode_rewards = deque(maxlen=100)
    this_episode_reward = 0.
    best_score = 0.
    saved_mean_reward = None
    is_demo = False
    obs = env.reset()
    # Always mimic the vectorized env
    obs = np.expand_dims(np.array(obs), axis=0)
    reset = True
    for t in tqdm(range(total_timesteps)):
        if callback is not None:
            if callback(locals(), globals()):
                break
        kwargs = {}
        if not param_noise:
            update_eps = tf.constant(exploration.value(t))
            update_param_noise_threshold = 0.
        else:  # not used
            update_eps = tf.constant(0.)
            update_param_noise_threshold = -np.log(1. - exploration.value(t) +
                                                   exploration.value(t) /
                                                   float(env.action_space.n))
            kwargs['reset'] = reset
            kwargs[
                'update_param_noise_threshold'] = update_param_noise_threshold
            kwargs['update_param_noise_scale'] = True
        action, epsilon, _, _ = model.step(tf.constant(obs),
                                           update_eps=update_eps,
                                           **kwargs)
        action = action[0].numpy()
        reset = False
        new_obs, rew, done, _ = env.step(action)

        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        if n_step:
            temp_buffer.append((obs, action, rew, new_obs, done, is_demo))
            if len(temp_buffer) == n_step:
                n_step_sample = get_n_step_sample(temp_buffer, gamma)
                replay_buffer.add(*n_step_sample)
        else:
            replay_buffer.add(obs[0], action, rew, new_obs[0], float(done), 0.)
        obs = new_obs

        # invert log scaled score for logging
        this_episode_reward += np.sign(rew) * (np.exp(np.sign(rew) * rew) - 1.)
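        # i.e. invert a presumed r -> sign(r) * log(1 + |r|) scaling applied by the
        # env, recovering the raw episode score for logging.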
        if done:
            num_episodes += 1
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(this_episode_reward)
            reset = True
            if this_episode_reward > best_score:
                best_score = this_episode_reward
                ckpt = tf.train.Checkpoint(model=model)
                manager = tf.train.CheckpointManager(ckpt,
                                                     './best_model',
                                                     max_to_keep=1)
                manager.save(t)
                logger.log("saved best model")
            this_episode_reward = 0.0

        if t % train_freq == 0:
            experience = replay_buffer.sample(batch_size,
                                              beta=beta_schedule.value(t))
            batch_idxes = experience[-1]
            if experience[6] is None:  # for n_step = 0
                obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple(
                    map(tf.constant, experience[:6]))
                obses_tpn, rewards_n, dones_n = None, None, None
                weights = tf.constant(experience[-2])
            else:
                obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple(
                    map(tf.constant, experience[:-1]))
            td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = model.train(
                obses_t, actions, rewards, obses_tp1, dones, is_demos, weights,
                obses_tpn, rewards_n, dones_n)
            new_priorities = np.abs(td_errors) + np.abs(
                n_td_errors
            ) + demo_prioritized_replay_eps * is_demos + prioritized_replay_eps * (
                1. - is_demos)
            replay_buffer.update_priorities(batch_idxes, new_priorities)
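            # Demo transitions receive the large demo_prioritized_replay_eps bonus,
            # while agent transitions get the small prioritized_replay_eps, keeping
            # demonstrations well represented in the prioritized buffer.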

            # for logging
            sample_counts += batch_size
            demo_used_counts += np.sum(is_demos)

        if t % target_network_update_freq == 0:
            # Update target network periodically.
            model.update_target()

        if t % checkpoint_freq == 0:
            save_path = checkpoint_path
            ckpt = tf.train.Checkpoint(model=model)
            manager = tf.train.CheckpointManager(ckpt,
                                                 save_path,
                                                 max_to_keep=10)
            manager.save(t)
            logger.log("saved checkpoint")

        elapsed_time = timedelta(seconds=time() - start)
        if done and num_episodes > 0 and num_episodes % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward",
                                  np.mean(episode_rewards))
            logger.record_tabular("max 100 episode reward",
                                  np.max(episode_rewards))
            logger.record_tabular("min 100 episode reward",
                                  np.min(episode_rewards))
            logger.record_tabular("demo sample rate",
                                  demo_used_counts / sample_counts)
            logger.record_tabular("epsilon", epsilon.numpy())
            logger.record_tabular("loss_td", np.mean(loss_dq.numpy()))
            logger.record_tabular("loss_n_td", np.mean(loss_n.numpy()))
            logger.record_tabular("loss_margin", np.mean(loss_E.numpy()))
            logger.record_tabular("loss_l2", np.mean(loss_l2.numpy()))
            logger.record_tabular("losses_all", weighted_error.numpy())
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.record_tabular("pre_train", False)
            logger.record_tabular("elapsed time", elapsed_time)
            logger.dump_tabular()

    return model
Beispiel #18
0
class DDPG(DRL):
    """
    Deep Deterministic Policy Gradient
    """
    def __init__(self, env):
        super(DDPG, self).__init__()

        self.sess = K.get_session()

        self.env = env
        self.upper_bound = self.env.action_space.high[0]
        self.lower_bound = self.env.action_space.low[0]

        # update rate for target model.
        # for 2nd round training, use 0.000001
        self.TAU = 0.00001

        # learning rate for actor and critic
        # for 2nd round training, use 1e-5
        self.actor_lr = 1e-4
        self.critic_lr = 1e-4

        # risk averse constant
        self.ra_c = 1.5

        # actor: policy function
        # critic: Q functions; Q_ex, Q_ex2, and Q
        self.actor = self._build_actor(learning_rate=self.actor_lr)
        self.critic_Q_ex, self.critic_Q_ex2, self.critic_Q = self._build_critic(
            learning_rate=self.critic_lr)

        self.critic_Q.summary()

        # target networks for actor and three critics
        self.actor_hat = self._build_actor(learning_rate=self.actor_lr)
        self.actor_hat.set_weights(self.actor.get_weights())

        self.critic_Q_ex_hat, self.critic_Q_ex2_hat, self.critic_Q_hat = self._build_critic(
            learning_rate=self.critic_lr)
        self.critic_Q_ex_hat.set_weights(self.critic_Q_ex.get_weights())
        self.critic_Q_ex2_hat.set_weights(self.critic_Q_ex2.get_weights())

        # epsilon of epsilon-greedy
        self.epsilon = 1.0

        # discount rate for epsilon
        self.epsilon_decay = 0.99994
        # self.epsilon_decay = 0.9994

        # min epsilon of epsilon-greedy.
        self.epsilon_min = 0.1

        # memory buffer for experience replay
        buffer_size = 600000
        prioritized_replay_alpha = 0.6
        self.replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)

        prioritized_replay_beta0 = 0.4

        # need not be the same as the number of training episodes (see schedules.py)
        prioritized_replay_beta_iters = 50001

        self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                            initial_p=prioritized_replay_beta0,
                                            final_p=1.0)

        # for numerical stability
        self.prioritized_replay_eps = 1e-6

        self.t = None

        # memory sample batch size
        self.batch_size = 128

        # may use for 2nd round training
        # self.policy_noise = 5
        # self.noise_clip = 5

        # gradient function
        self.get_critic_grad = self.critic_gradient()
        self.actor_optimizer()

    def load(self, tag=""):
        """load two Qs for test"""
        if tag == "":
            actor_file = "model/ddpg_actor.h5"
            critic_Q_ex_file = "model/ddpg_critic_Q_ex.h5"
            critic_Q_ex2_file = "model/ddpg_critic_Q_ex2.h5"
        else:
            actor_file = "model/ddpg_actor_" + tag + ".h5"
            critic_Q_ex_file = "model/ddpg_critic_Q_ex_" + tag + ".h5"
            critic_Q_ex2_file = "model/ddpg_critic_Q_ex2_" + tag + ".h5"

        if os.path.exists(actor_file):
            self.actor.load_weights(actor_file)
            self.actor_hat.load_weights(actor_file)
        if os.path.exists(critic_Q_ex_file):
            self.critic_Q_ex.load_weights(critic_Q_ex_file)
            self.critic_Q_ex_hat.load_weights(critic_Q_ex_file)
        if os.path.exists(critic_Q_ex2_file):
            self.critic_Q_ex2.load_weights(critic_Q_ex2_file)
            self.critic_Q_ex2_hat.load_weights(critic_Q_ex2_file)

    def _build_actor(self, learning_rate=1e-3):
        """basic NN model.
        """
        inputs = Input(shape=(self.env.num_state, ))

        # bn after input
        x = BatchNormalization()(inputs)

        # bn after activation
        x = Dense(32, activation="relu")(x)
        x = BatchNormalization()(x)

        x = Dense(64, activation="relu")(x)
        x = BatchNormalization()(x)

        # no bn for output layer
        x = Dense(1, activation="sigmoid")(x)

        output = Lambda(lambda x: x * self.env.num_contract * 100)(x)

        model = Model(inputs=inputs, outputs=output)

        # compile the model using mse loss, but won't use mse to train
        model.compile(loss="mse", optimizer=Adam(learning_rate))

        return model

    def _build_critic(self, learning_rate=1e-3):
        """basic NN model.
        """
        # inputs
        s_inputs = Input(shape=(self.env.num_state, ))
        a_inputs = Input(shape=(1, ))

        # combine inputs
        x = concatenate([s_inputs, a_inputs])

        # bn after input
        x = BatchNormalization()(x)

        # Q_ex network

        # bn after activation
        x1 = Dense(32, activation="relu")(x)
        x1 = BatchNormalization()(x1)

        x1 = Dense(64, activation="relu")(x1)
        x1 = BatchNormalization()(x1)

        # no bn for output layer
        output1 = Dense(1, activation="linear")(x1)

        model_Q_ex = Model(inputs=[s_inputs, a_inputs], outputs=output1)
        model_Q_ex.compile(loss="mse", optimizer=Adam(learning_rate))

        # Q_ex2 network

        # bn after activation
        x2 = Dense(32, activation="relu")(x)
        x2 = BatchNormalization()(x2)

        # bn after activation
        x2 = Dense(64, activation="relu")(x2)
        x2 = BatchNormalization()(x2)

        # no bn for output layer
        output2 = Dense(1, activation="linear")(x2)

        model_Q_ex2 = Model(inputs=[s_inputs, a_inputs], outputs=output2)
        model_Q_ex2.compile(loss="mse", optimizer=Adam(learning_rate))

        # Q
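        # Risk-adjusted value: Q = E[R] - ra_c * sqrt(Var[R]), with the variance
        # estimated as max(E[R^2] - E[R]^2, 0) from the two critics above.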
        output3 = Lambda(
            lambda o: o[0] - self.ra_c * K.sqrt(K.maximum(o[1] - o[0] * o[0], 0.)))(
                [output1, output2])
        model_Q = Model(inputs=[s_inputs, a_inputs], outputs=output3)
        model_Q.compile(loss="mse", optimizer=Adam(learning_rate))

        return model_Q_ex, model_Q_ex2, model_Q

    def actor_optimizer(self):
        """actor_optimizer.
        Returns:
            function, opt function for actor.
        """
        self.ainput = self.actor.input
        aoutput = self.actor.output
        trainable_weights = self.actor.trainable_weights
        self.action_gradient = tf.placeholder(tf.float32, shape=(None, 1))

        # tf.gradients calculates dy/dx with an initial gradient for y;
        # action_gradient is dq/da, so this is dq/da * da/dparams
        params_grad = tf.gradients(aoutput, trainable_weights,
                                   -self.action_gradient)
        grads = zip(params_grad, trainable_weights)
        self.opt = tf.train.AdamOptimizer(self.actor_lr).apply_gradients(grads)
        self.sess.run(tf.global_variables_initializer())

    def critic_gradient(self):
        """get critic gradient function.
        Returns:
            function, gradient function for critic.
        """
        cinput = self.critic_Q.input
        coutput = self.critic_Q.output

        # compute the gradient of the action with q value, dq/da.
        action_grads = K.gradients(coutput, cinput[1])

        return K.function([cinput[0], cinput[1]], action_grads)

    def egreedy_action(self, X):
        """get actor action with ou noise.
        Arguments:
            X: state value.
        """
        # do the epsilon greedy way; not using OU
        if np.random.rand() <= self.epsilon:
            action = self.env.action_space.sample()

            # may use for 2nd round training
            # action = self.actor.predict(X)[0][0]
            # noise = np.clip(np.random.normal(0, self.policy_noise), -self.noise_clip, self.noise_clip)
            # action = np.clip(action + noise, 0, self.env.num_contract * 100)
        else:
            action = self.actor.predict(X)[0][0]

        return action, None, None

    def update_epsilon(self):
        """update epsilon
        """
        if self.epsilon >= self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def remember(self, state, action, reward, next_state, done):
        """add data to experience replay.
        Arguments:
            state: observation
            action: action
            reward: reward
            next_state: next_observation
            done: if game is done.
        """
        self.replay_buffer.add(state, action, reward, next_state, done)

    def process_batch(self, batch_size):
        """process batch data
        Arguments:
            batch_size: batch size
        Returns:
            states: batch of states
            actions: batch of actions
            target_q_ex, target_q_ex2: batch of targets;
            weights: priority weights
        """
        # prioritized sample from experience replay buffer
        experience = self.replay_buffer.sample(batch_size,
                                               beta=self.beta_schedule.value(
                                                   self.t))
        (states, actions, rewards, next_states, dones, weights,
         batch_idxes) = experience

        actions = actions.reshape(-1, 1)
        rewards = rewards.reshape(-1, 1)
        dones = dones.reshape(-1, 1)

        # get next_actions
        next_actions = self.actor_hat.predict(next_states)

        # prepare targets for Q_ex and Q_ex2 training
        q_ex_next = self.critic_Q_ex_hat.predict([next_states, next_actions])
        q_ex2_next = self.critic_Q_ex2_hat.predict([next_states, next_actions])

        target_q_ex = rewards + (1 - dones) * q_ex_next
        target_q_ex2 = rewards**2 + (1 - dones) * (2 * rewards * q_ex_next +
                                                   q_ex2_next)
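        # Second-moment Bellman target (a discount factor of 1 appears assumed):
        # E[R_t^2] = r^2 + 2 * r * E[R_{t+1}] + E[R_{t+1}^2] for non-terminal steps.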

        # use Q2 TD error as priority weight
        td_errors = self.critic_Q_ex2.predict([states, actions]) - target_q_ex2
        new_priorities = (np.abs(td_errors) +
                          self.prioritized_replay_eps).flatten()
        self.replay_buffer.update_priorities(batch_idxes, new_priorities)

        return states, actions, target_q_ex, target_q_ex2, weights

    def update_model(self, X1, X2, y1, y2, weights):
        """update ddpg model.
        Arguments:
            X1: states
            X2: actions
            y1: target for Q_ex
            y2: target for Q_ex2
            weights: priority weights
        Returns:
            loss_ex: critic Q_ex loss
            loss_ex2: critic Q_ex2 loss
        """
        # flatten to prepare for training with weights
        weights = weights.flatten()

        # default batch size is 32
        loss_ex = self.critic_Q_ex.fit([X1, X2],
                                       y1,
                                       sample_weight=weights,
                                       verbose=0)
        loss_ex = np.mean(loss_ex.history['loss'])

        # default batch size is 32
        loss_ex2 = self.critic_Q_ex2.fit([X1, X2],
                                         y2,
                                         sample_weight=weights,
                                         verbose=0)
        loss_ex2 = np.mean(loss_ex2.history['loss'])

        X3 = self.actor.predict(X1)

        a_grads = np.array(self.get_critic_grad([X1, X3]))[0]
        self.sess.run(self.opt,
                      feed_dict={
                          self.ainput: X1,
                          self.action_gradient: a_grads
                      })

        return loss_ex, loss_ex2

    def update_target_model(self):
        """soft update target model.
        """
        critic_Q_ex_weights = self.critic_Q_ex.get_weights()
        critic_Q_ex2_weights = self.critic_Q_ex2.get_weights()
        actor_weights = self.actor.get_weights()

        critic_Q_ex_hat_weights = self.critic_Q_ex_hat.get_weights()
        critic_Q_ex2_hat_weights = self.critic_Q_ex2_hat.get_weights()
        actor_hat_weights = self.actor_hat.get_weights()

        for i in range(len(critic_Q_ex_weights)):
            critic_Q_ex_hat_weights[i] = self.TAU * critic_Q_ex_weights[i] + (
                1 - self.TAU) * critic_Q_ex_hat_weights[i]

        for i in range(len(critic_Q_ex2_weights)):
            critic_Q_ex2_hat_weights[i] = self.TAU * critic_Q_ex2_weights[
                i] + (1 - self.TAU) * critic_Q_ex2_hat_weights[i]

        for i in range(len(actor_weights)):
            actor_hat_weights[i] = self.TAU * actor_weights[i] + (
                1 - self.TAU) * actor_hat_weights[i]

        self.critic_Q_ex_hat.set_weights(critic_Q_ex_hat_weights)
        self.critic_Q_ex2_hat.set_weights(critic_Q_ex2_hat_weights)
        self.actor_hat.set_weights(actor_hat_weights)

    def train(self, episode):
        """training
        Arguments:
            episode: total episodes to run

        Returns:
            history: training history
        """

        # some statistics
        history = {
            "episode": [],
            "episode_w_T": [],
            "loss_ex": [],
            "loss_ex2": []
        }

        for i in range(episode):
            observation = self.env.reset()
            done = False

            # for recording purpose
            y_action = np.empty(0, dtype=int)
            reward_store = np.empty(0)

            self.t = i

            # steps in an episode
            while not done:

                # prepare state
                x = np.array(observation).reshape(1, -1)

                # choose an action via epsilon-greedy.
                action, _, _ = self.egreedy_action(x)

                # one step
                observation, reward, done, info = self.env.step(action)

                # record action and reward
                y_action = np.append(y_action, action)
                reward_store = np.append(reward_store, reward)

                # store to memory
                self.remember(x[0], action, reward, observation, done)

                if len(self.replay_buffer) > self.batch_size:

                    # draw from memory
                    X1, X2, y_ex, y_ex2, weights = self.process_batch(
                        self.batch_size)

                    # update model
                    loss_ex, loss_ex2 = self.update_model(
                        X1, X2, y_ex, y_ex2, weights)

                    # soft update target
                    self.update_target_model()

            # reduce epsilon per episode
            self.update_epsilon()

            # print/store some statistics every 1000 episodes
            if i % 1000 == 0 and i != 0:

                # may want to print/store some statistics every 100 episodes
                # if i % 100 == 0 and i >= 1000:

                # get w_T for statistics
                w_T = np.sum(reward_store)

                history["episode"].append(i)
                history["episode_w_T"].append(w_T)
                history["loss_ex"].append(loss_ex)
                history["loss_ex2"].append(loss_ex2)

                path_row = info["path_row"]
                print(info)
                print(
                    "episode: {} | episode final wealth: {:.3f} | loss_ex: {:.3f} | loss_ex2: {:.3f} | epsilon:{:.2f}"
                    .format(i, w_T, loss_ex, loss_ex2, self.epsilon))

                with np.printoptions(precision=2, suppress=True):
                    print("episode: {} | rewards {}".format(i, reward_store))
                    print("episode: {} | actions taken {}".format(i, y_action))
                    print("episode: {} | deltas {}".format(
                        i, self.env.delta_path[path_row] * 100))
                    print("episode: {} | stock price {}".format(
                        i, self.env.path[path_row]))
                    print("episode: {} | option price {}\n".format(
                        i, self.env.option_price_path[path_row] * 100))

                # may want to save model every 100 episode
                # if i % 100 == 0:
                #     self.actor.save_weights("model/ddpg_actor_" + str(int(i/100)) + ".h5")
                #     self.critic_Q_ex.save_weights("model/ddpg_critic_Q_ex_" + str(int(i/100)) + ".h5")
                #     self.critic_Q_ex2.save_weights("model/ddpg_critic_Q_ex2_" + str(int(i/100)) + ".h5")
                self.actor.save_weights("model/ddpg_actor_" +
                                        str(int(i / 1000)) + ".h5")
                self.critic_Q_ex.save_weights("model/ddpg_critic_Q_ex_" +
                                              str(int(i / 1000)) + ".h5")
                self.critic_Q_ex2.save_weights("model/ddpg_critic_Q_ex2_" +
                                               str(int(i / 1000)) + ".h5")

        # save weights once training is done
        self.actor.save_weights("model/ddpg_actor.h5")
        self.critic_Q_ex.save_weights("model/ddpg_critic_Q_ex.h5")
        self.critic_Q_ex2.save_weights("model/ddpg_critic_Q_ex2.h5")

        return history
Beispiel #19
0
class DQNAgent(object):
    def __init__(self, stateShape, actionSpace, numPicks, memorySize, burnin=1000):
        self.numPicks = numPicks
        self.memorySize = memorySize
        self.replayMemory = PrioritizedReplayBuffer(memorySize, 0.6)
        self.stateShape = stateShape
        self.actionSpace = actionSpace

        self.step = 0
        self.sync = 200
        self.burnin = burnin

        self.alpha = 0.001
        self.epsilon = 1
        self.epsilon_decay = 0.5
        self.epsilon_min = 0.01
        self.eps_threshold = 0

        self.gamma = 0.99

        self.trainNetwork = self.createNetwork(
            stateShape, len(actionSpace), self.alpha)
        self.targetNetwork = self.createNetwork(
            stateShape, len(actionSpace), self.alpha)
        self.targetNetwork.set_weights(
            self.trainNetwork.get_weights())

    def createNetwork(self, n_input, n_output, learningRate):
        model = keras.models.Sequential()

        model.add(keras.layers.Dense(
            24, activation='relu', input_shape=n_input))
        model.add(keras.layers.Dense(48, activation='relu'))
        model.add(keras.layers.Dense(n_output, activation='linear'))
        model.compile(
            loss='mse', optimizer=keras.optimizers.Adam(lr=learningRate))
        print(model.summary())
        return model

    def trainDQN(self):
        if len(self.replayMemory) <= self.numPicks or len(self.replayMemory) < self.burnin:
            return 0

        beta = min(1.0, 0.4 + self.step * (1.0 - 0.4) / 300)
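        # beta is the PER importance-sampling exponent, increased linearly from
        # 0.4 toward 1.0 over roughly the first 300 training steps.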
        samples = self.replayMemory.sample(self.numPicks, beta)
        #batch = Transition(*zip(*samples))
        currStates, actions, rewards, nextStates, dones, weights, indices = samples

        currStates = np.squeeze(np.array(currStates), 1)
        Q_currents = self.trainNetwork(currStates, training=False).numpy()

        nextStates = np.squeeze(np.array(nextStates), 1)
        Q_futures = self.targetNetwork(nextStates, training=False).numpy().max(axis=1)

        rewards = np.array(rewards).reshape(self.numPicks,).astype(float)
        actions = np.array(actions).reshape(self.numPicks,).astype(int)

        dones = np.array(dones).astype(bool)
        notDones = (~dones).astype(float)
        dones = dones.astype(float)

        Q_currents_cp = deepcopy(Q_currents)
        Q_currents_cp[np.arange(self.numPicks), actions] = rewards * dones + (rewards + Q_futures * self.gamma)*notDones
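        # Q-learning target: r for terminal transitions, otherwise
        # r + gamma * max_a' Q_target(s', a'), written into a copy so that only
        # the entries of the taken actions change.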

        loss = tf.multiply(tf.pow(tf.subtract(Q_currents[np.arange(self.numPicks), actions], Q_currents_cp[np.arange(self.numPicks), actions]), 2), weights).numpy()
        prios = loss + 1e-5
        self.replayMemory.update_priorities(indices, prios)

        loss = self.trainNetwork.train_on_batch(currStates, Q_currents_cp)
        return loss

    def selectAction(self, state):
        self.step += 1

        if self.step % self.sync == 0:
            self.targetNetwork.set_weights(
                self.trainNetwork.get_weights())

        q = -100000
        if np.random.rand(1) < self.epsilon:
            action = np.random.randint(0, 3)
        else:
            preds = np.squeeze(self.trainNetwork(
                state, training=False).numpy(), axis=0)
            action = np.argmax(preds)
            q = preds[action]
        return action, q

    def addMemory(self, state, action, reward, nextState, done):
        self.replayMemory.add(state, action, reward, nextState, done)

    def save(self):
        save_path = (
            f"./mountain_car_tfngmo_{int(self.step)}.chkpt"
        )
        self.trainNetwork.save(
            save_path
        )
        print(f"MountainNet saved to {save_path} done!")
Beispiel #20
0
def learn(env, args, callback=None):
    dist_deepq = DistDeepQ(env, args)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            args.buffer_size, alpha=args.prioritized_replay_alpha)
        args.prioritized_replay_beta_iters = args.max_timesteps
        beta_schedule = LinearSchedule(args.prioritized_replay_beta_iters,
                                       initial_p=args.prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)
        beta_schedule = None

    exploration = LinearSchedule(schedule_timesteps=int(
        args.exploration_fraction * args.max_timesteps),
                                 initial_p=1.0,
                                 final_p=args.exploration_final_eps)

    # dist_deepq.sample_noise()
    dist_deepq.update_target()
    episode_rewards = [0.0]
    saved_mean_reward = None
    ob = env.reset()

    for t in range(args.max_timesteps):
        if callback is not None:
            if callback(locals(), globals()):
                break

        update_eps = exploration.value(t)
        action = dist_deepq.act_distributional(ob, update_eps)
        # action = dist_deepq.act_noisy_distributional(ob)
        new_ob, rew, done, _ = env.step(action)
        replay_buffer.add(ob, action, rew, new_ob, float(done))
        ob = new_ob

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
            reset = True

        if t > args.learning_starts and t % args.train_freq == 0:
            if args.prioritized_replay:
                experience = replay_buffer.sample(args.batch_size,
                                                  beta=beta_schedule.value(t))
                (obs, actions, rewards, obs_next, dones, weights,
                 batch_idxes) = experience
            else:
                obs, actions, rewards, obs_next, dones = replay_buffer.sample(
                    args.batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            kl_errors = dist_deepq.distributional_update(
                obs, actions, rewards, obs_next, dones, weights)
            # dist_deepq.sample_noise()

            if args.prioritized_replay:
                replay_buffer.update_priorities(batch_idxes, kl_errors)

        if t > args.learning_starts and t % args.target_network_update_freq == 0:
            dist_deepq.update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and args.print_freq is not None and len(
                episode_rewards) % args.print_freq == 0:
            print('steps {} episodes {} mean reward {}'.format(
                t, num_episodes, mean_100ep_reward))
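For reference, the importance-sampling weights that the prioritized branch above receives are typically derived from the stored priorities roughly as follows (a minimal NumPy sketch; the priority values, alpha and beta are illustrative):

import numpy as np

priorities = np.array([0.5, 1.0, 2.0, 0.1])   # |TD error| + eps for stored transitions
alpha, beta, batch_size = 0.6, 0.4, 2

probs = priorities ** alpha
probs /= probs.sum()                           # P(i) = p_i^alpha / sum_k p_k^alpha

idx = np.random.choice(len(probs), batch_size, p=probs)

# w_i = (N * P(i))^(-beta), normalized by the largest possible weight.
weights = (len(probs) * probs[idx]) ** (-beta)
weights /= (len(probs) * probs.min()) ** (-beta)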
Beispiel #21
0
def main():

#    env = gym.make("CartPoleRob-v0")
#    env = gym.make("CartPole-v0")
#    env = gym.make("CartPole-v1")
#    env = gym.make("Acrobot-v1")
#    env = gym.make("MountainCarRob-v0")
#    env = gym.make("FrozenLake-v0")
#    env = gym.make("FrozenLake8x8-v0")
#    env = gym.make("FrozenLake8x8rob-v0")
#    env = gym.make("FrozenLake16x16rob-v0")
    env = gym.make("TestRob3-v0")
    
    
    
    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obses_t, windowLen):
        deicticObses_t = []
        for i in range(np.shape(obses_t)[0] - windowLen):
            for j in range(np.shape(obses_t)[1] - windowLen):
                deicticObses_t.append(obses_t[i:i+windowLen,j:j+windowLen,:])
        return np.array(deicticObses_t)

    # get set of deictic alternatives
    # input: batch x n x n x channels
    # output: (batch x deictic) x dn x dn x channels
    def getDeictic(obses_t, actions, obses_tp1, weights, windowLen):
        deicticObses_t = []
        deicticActions = []
        deicticObses_tp1 = []
        deicticWeights = []
        for i in range(np.shape(obses_t)[0]):
            for j in range(np.shape(obses_t)[1] - windowLen):
                for k in range(np.shape(obses_t)[2] - windowLen):
                    deicticObses_t.append(obses_t[i,j:j+windowLen,k:k+windowLen,:])
                    deicticActions.append(actions[i])
                    deicticObses_tp1.append(obses_tp1[i,j:j+windowLen,k:k+windowLen,:])
                    deicticWeights.append(weights[i])
        return np.array(deicticObses_t), np.array(deicticActions), np.array(deicticObses_tp1), np.array(deicticWeights)
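    # Added note (not part of the original example): with windowLen = 3, the loops
    # above yield (n - 3) offsets per axis. Assuming an 8x8 observation, that is
    # 5 * 5 = 25 windows per observation from getDeicticObs, and batch_size * 25
    # windows from getDeictic -- which matches the matchShape defined further below.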

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
#        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong
#        hiddens=[256],  # used in pong
#        convs=[(8,4,1)], # used for non-deictic TestRob3-v0
        convs=[(4,3,1)], # used for deictic TestRob3-v0
        hiddens=[16],
        dueling=True
    )

    # parameters
    q_func=model
    lr=1e-3
#    max_timesteps=100000
#    max_timesteps=50000
    max_timesteps=20000
    buffer_size=50000
    exploration_fraction=0.1
#    exploration_fraction=0.3
    exploration_final_eps=0.02
#    exploration_final_eps=0.1
    train_freq=1
    batch_size=32
    print_freq=10
    checkpoint_freq=10000
    learning_starts=1000
    gamma=1.
    target_network_update_freq=500
    prioritized_replay=False
#    prioritized_replay=True
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
    prioritized_replay_eps=1e-6
    num_cpu=16
    
    deicticShape = (3,3,1)
    def make_obs_ph(name):
#        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(deicticShape, name=name)

    matchShape = (batch_size*25,)
    def make_match_ph(name):
        return U.BatchInput(matchShape, name=name)

    
    sess = U.make_session(num_cpu)
    sess.__enter__()

#    act, train, update_target, debug = build_graph.build_train(
#    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
#    getq, train, trainWOUpdate, debug = build_graph.build_train_deictic(
#    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min(
        make_obs_ph=make_obs_ph,
        make_match_ph=make_match_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()


    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    
#    with tempfile.TemporaryDirectory() as td:
    model_saved = False
#        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):
        
        # get action to take
#        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
#        qvalues = getq(np.array(obs)[None])
#        action = np.argmax(qvalues)
#        if np.random.rand() < exploration.value(t):
#            action = np.random.randint(env.action_space.n)
        
        deicticObs = getDeicticObs(obs,3)
        qvalues = getq(np.array(deicticObs))
        action = np.argmax(np.max(qvalues,0))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)
        
#        # temporarily take uniformly random actions all the time
#        action = np.random.randint(env.action_space.n)
        
        new_obs, rew, done, _ = env.step(action)
        
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs
        
        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            
            # Get batch
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            
            # Convert batch to deictic format
            obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(obses_t, actions, obses_tp1, weights, 3)
            
            obses_t_deic_fingerprints = [np.reshape(obses_t_deic[i],[9]) for i in range(np.shape(obses_t_deic)[0])]
            _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints,axis=0,return_index=True,return_inverse=True)
#            matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)]
            
#            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
#            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
#            debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
#            debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)
            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
            

    num2avg = 20
    rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg
    plt.plot(rListAvg)
#    plt.plot(episode_rewards)
    plt.show()

    sess
Beispiel #22
0
class Agent:
    def __init__(
            self,
            env: 'Environment',
            input_frame: ('int: the number of channels of input image'),
            input_dim: (
                'int: the width and height of pre-processed input image'),
            input_type: ('str: the type of input dimension'),
            num_frames: ('int: Total number of frames'),
            skipped_frame: ('int: The number of skipped frames'),
            eps_decay: ('float: Epsilon Decay_rate'),
            gamma: ('float: Discount Factor'),
            target_update_freq: ('int: Target Update Frequency (by frames)'),
            update_type: (
                'str: Update type for target network. Hard or Soft') = 'hard',
            soft_update_tau: ('float: Soft update ratio') = None,
            batch_size: ('int: Update batch size') = 32,
            buffer_size: ('int: Replay buffer size') = 1000000,
            alpha: (
                'float: Hyperparameter for how much prioritization is applied'
            ) = 0.5,
            beta: (
                'float: Hyperparameter for the annealing factor of importance sampling'
            ) = 0.5,
            epsilon_for_priority: (
                'float: Hyperparameter for adding a small increment to the priority'
            ) = 1e-6,
            update_start_buffer_size: (
                'int: Update starting buffer size') = 50000,
            learning_rate: ('float: Learning rate') = 0.0004,
            eps_min: ('float: Epsilon Min') = 0.1,
            eps_max: ('float: Epsilon Max') = 1.0,
            device_num: ('int: GPU device number') = 0,
            rand_seed: ('int: Random seed') = None,
            plot_option: ('str: Plotting option') = False,
            model_path: ('str: Model saving path') = './'):

        self.action_dim = env.action_space.n
        self.device = torch.device(
            f'cuda:{device_num}' if torch.cuda.is_available() else 'cpu')
        self.model_path = model_path

        self.env = env
        self.input_frames = input_frame
        self.input_dim = input_dim
        self.num_frames = num_frames
        self.skipped_frame = skipped_frame
        self.epsilon = eps_max
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.update_cnt = 0
        self.update_type = update_type
        self.tau = soft_update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.update_start = update_start_buffer_size
        self.seed = rand_seed
        self.plot_option = plot_option

        # hyper parameters for PER
        self.alpha = alpha
        self.beta = beta
        self.beta_step = (1.0 - beta) / num_frames
        self.epsilon_for_priority = epsilon_for_priority

        if input_type == '1-dim':
            self.q_current = QNetwork_1dim(self.input_dim,
                                           self.action_dim).to(self.device)
            self.q_target = QNetwork_1dim(self.input_dim,
                                          self.action_dim).to(self.device)
        else:
            self.q_current = QNetwork(
                (self.input_frames, self.input_dim, self.input_dim),
                self.action_dim).to(self.device)
            self.q_target = QNetwork(
                (self.input_frames, self.input_dim, self.input_dim),
                self.action_dim).to(self.device)
        self.q_target.load_state_dict(self.q_current.state_dict())
        self.q_target.eval()
        self.optimizer = optim.Adam(self.q_current.parameters(),
                                    lr=learning_rate)

        if input_type == '1-dim':
            self.memory = PrioritizedReplayBuffer(self.buffer_size,
                                                  self.input_dim,
                                                  self.batch_size, self.alpha,
                                                  input_type)
        else:
            self.memory = PrioritizedReplayBuffer(
                self.buffer_size,
                (self.input_frames, self.input_dim, self.input_dim),
                self.batch_size, self.alpha, input_type)

    def select_action(
        self, state:
        'Must be pre-processed in the same way while updating current Q network. See def _compute_loss'
    ):

        if np.random.random() < self.epsilon:
            return np.zeros(self.action_dim), self.env.action_space.sample()
        else:
            state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
            Qs = self.q_current(state)
            action = Qs.argmax()
            return Qs.detach().cpu().numpy(), action.detach().item()

    def processing_resize_and_gray(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)  # Pure
        # frame = cv2.cvtColor(frame[:177, 32:128, :], cv2.COLOR_RGB2GRAY) # Boxing
        # frame = cv2.cvtColor(frame[2:198, 7:-7, :], cv2.COLOR_RGB2GRAY) # Breakout
        frame = cv2.resize(frame,
                           dsize=(self.input_dim, self.input_dim)).reshape(
                               self.input_dim, self.input_dim).astype(np.uint8)
        return frame

    def get_state(self, state, action, skipped_frame=0):
        '''
        num_frames: how many frames to be merged
        input_size: height and width of the resized input image
        skipped_frame: how many frames to be skipped
        '''
        next_state = np.zeros(
            (self.input_frames, self.input_dim, self.input_dim))
        for i in range(len(state) - 1):
            next_state[i] = state[i + 1]

        rewards = 0
        dones = 0
        for j in range(skipped_frame):
            state, reward, done, _ = self.env.step(action)
            rewards += reward
            dones += int(done)
        state, reward, done, _ = self.env.step(action)
        next_state[-1] = self.processing_resize_and_gray(state)
        rewards += reward
        dones += int(done)
        return rewards, next_state, dones

    def get_state_1dim(self, state, action, skipped_frame=0):
        '''
        num_frames: how many frames to be merged
        input_size: height and width of the resized input image
        skipped_frame: how many frames to be skipped
        '''
        next_state = np.zeros((self.input_frames, self.input_dim))
        for i in range(len(state) - 1):
            next_state[i] = state[i + 1]

        rewards = 0
        dones = 0
        for _ in range(skipped_frame):
            state, reward, done, _ = self.env.step(action)
            rewards += reward
            dones += int(done)
        state, reward, done, _ = self.env.step(action)
        next_state[-1] = state
        rewards += reward
        dones += int(done)
        return rewards, next_state, dones

    def get_init_state(self):

        init_state = np.zeros(
            (self.input_frames, self.input_dim, self.input_dim))
        init_frame = self.env.reset()
        init_state[0] = self.processing_resize_and_gray(init_frame)

        for i in range(1, self.input_frames):
            action = self.env.action_space.sample()
            for j in range(self.skipped_frame):
                state, _, _, _ = self.env.step(action)
            state, _, _, _ = self.env.step(action)
            init_state[i] = self.processing_resize_and_gray(state)
        return init_state

    def get_init_state_1dim(self):

        init_state = np.zeros((self.input_frames, self.input_dim))
        init_frame = self.env.reset()
        init_state[0] = init_frame

        for i in range(1, self.input_frames):
            action = self.env.action_space.sample()
            for j in range(self.skipped_frame):
                state, _, _, _ = self.env.step(action)
            state, _, _, _ = self.env.step(action)
            init_state[i] = state
        return init_state

    def store(self, state, action, reward, next_state, done):
        self.memory.store(state, action, reward, next_state, done)

    def update_current_q_net(self):
        '''The method that differs between the Dueling and PER versions of the Agent class.'''
        batch = self.memory.batch_load(self.beta)
        weights = torch.FloatTensor(batch['weights'].reshape(-1, 1)).to(
            self.device)
        sample_wise_loss = self._compute_loss(
            batch)  # PER: shape of loss -> (batch, 1)
        loss = torch.mean(sample_wise_loss * weights)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # For PER: update priorities of the samples.
        sample_wise_loss = sample_wise_loss.detach().cpu().numpy()
        batch_priorities = sample_wise_loss + self.epsilon_for_priority
        self.memory.update_priorities(batch['indices'], batch_priorities)

        return loss.item()

    def target_soft_update(self):
        for target_param, current_param in zip(self.q_target.parameters(),
                                               self.q_current.parameters()):
            target_param.data.copy_(self.tau * current_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def target_hard_update(self):
        self.update_cnt = (self.update_cnt + 1) % self.target_update_freq
        if self.update_cnt == 0:
            self.q_target.load_state_dict(self.q_current.state_dict())

    def train(self):
        tic = time.time()
        losses = []
        scores = []
        epsilons = []
        avg_scores = [-1000]  # flat list so max(avg_scores) compares scalars

        score = 0

        print("Storing initial buffer..")
        # state = self.get_init_state()
        # state = self.get_init_state_1dim()
        state = self.env.reset()
        for frame_idx in range(1, self.update_start + 1):
            _, action = self.select_action(state)
            next_state, reward, done, _ = self.env.step(action)
            self.store(state, action, reward, next_state, done)
            state = next_state
            if done: state = self.env.reset()

        print("Done. Start learning..")
        history_store = []
        for frame_idx in range(1, self.num_frames + 1):
            Qs, action = self.select_action(state)
            next_state, reward, done, _ = self.env.step(action)
            self.store(state, action, reward, next_state, done)
            history_store.append([state, Qs, action, reward, next_state, done])
            loss = self.update_current_q_net()

            if self.update_type == 'hard': self.target_hard_update()
            elif self.update_type == 'soft': self.target_soft_update()

            score += reward
            losses.append(loss)

            if done:
                scores.append(score)
                if np.mean(scores[-10:]) > max(avg_scores):
                    torch.save(
                        self.q_current.state_dict(),
                        self.model_path + '{}_Score:{}.pt'.format(
                            frame_idx, np.mean(scores[-10:])))
                    training_time = round((time.time() - tic) / 3600, 1)
                    np.save(
                        self.model_path +
                        '{}_history_Score_{}_{}hrs.npy'.format(
                            frame_idx, score, training_time),
                        np.array(history_store))
                    print(
                        "          | Model saved. Recent scores: {}, Training time: {}hrs"
                        .format(scores[-10:], training_time),
                        ' /'.join(os.getcwd().split('/')[-3:]))
                avg_scores.append(np.mean(scores[-10:]))

                if self.plot_option == 'inline':
                    # score was already appended above; only track epsilon here
                    epsilons.append(self.epsilon)
                    self._plot(frame_idx, scores, losses, epsilons)
                elif self.plot_option == 'wandb':
                    Q_mean = np.mean(np.array(history_store)[:, 1])
                    wandb.log({
                        'Score': score,
                        'loss(10 frames avg)': np.mean(losses[-10:]),
                        'Q (mean)': Q_mean,
                        'Epsilon': self.epsilon,
                        'beta': self.beta
                    })
                    print(score, end='\r')
                else:
                    print(score, end='\r')

                score = 0
                state = self.env.reset()
                history_store = []
            else:
                state = next_state

            self._epsilon_step()

            # self.beta = min(self.beta+self.beta_step, 1.0) # for PER. beta is increased linearly up to 1.0
            fraction = min(frame_idx / self.num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

        print("Total training time: {}(hrs)".format(
            (time.time() - tic) / 3600))

    def _epsilon_step(self):
        ''' Epsilon decay control '''
        eps_decay_init = 1 / 1200000
        eps_decay = [
            eps_decay_init, eps_decay_init / 2.5, eps_decay_init / 3.5,
            eps_decay_init / 5.5
        ]

        if self.epsilon > 0.30:
            self.epsilon = max(self.epsilon - eps_decay[0], 0.1)
        elif self.epsilon > 0.27:
            self.epsilon = max(self.epsilon - eps_decay[1], 0.1)
        elif self.epsilon > 0.17:  # the original "> 1.7" can never be reached here; 0.17 is assumed
            self.epsilon = max(self.epsilon - eps_decay[2], 0.1)
        else:
            self.epsilon = max(self.epsilon - eps_decay[3], 0.1)

    def _compute_loss(self, batch: "Dictionary (S, A, R', S', Dones)"):
        # If normalization is used, it must be applied to 'state' and 'next_state' here. ex) state/255
        states = torch.FloatTensor(batch['states']).to(self.device)
        next_states = torch.FloatTensor(batch['next_states']).to(self.device)
        actions = torch.LongTensor(batch['actions'].reshape(-1,
                                                            1)).to(self.device)
        rewards = torch.FloatTensor(batch['rewards'].reshape(-1, 1)).to(
            self.device)
        dones = torch.FloatTensor(batch['dones'].reshape(-1,
                                                         1)).to(self.device)

        current_q = self.q_current(states).gather(1, actions)
        # The next line is the only difference from vanilla DQN.
        next_q = self.q_target(next_states).gather(
            1,
            self.q_current(next_states).argmax(axis=1, keepdim=True)).detach()
        mask = 1 - dones
        target = (rewards + (mask * self.gamma * next_q)).to(self.device)

        # For PER, the shape of loss is (batch, 1). Therefore, using "reduction='none'" option.
        sample_wise_loss = F.smooth_l1_loss(current_q,
                                            target,
                                            reduction="none")
        return sample_wise_loss

    def _plot(self, frame_idx, scores, losses, epsilons):
        clear_output(True)
        plt.figure(figsize=(20, 5), facecolor='w')
        plt.subplot(131)
        plt.title('frame %s. score: %s' % (frame_idx, np.mean(scores[-10:])))
        plt.plot(scores)
        plt.subplot(132)
        plt.title('loss')
        plt.plot(losses)
        plt.subplot(133)
        plt.title('epsilons')
        plt.plot(epsilons)
        plt.show()
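The Agent class above pairs a Double DQN target (_compute_loss) with PER importance-sampling weights (update_current_q_net). A compressed, self-contained sketch of those two steps, using throwaway linear networks and random tensors rather than the networks defined above:

import torch
import torch.nn as nn
import torch.nn.functional as F

batch, obs_dim, n_actions, gamma = 4, 8, 3, 0.99
q_current, q_target = nn.Linear(obs_dim, n_actions), nn.Linear(obs_dim, n_actions)

states      = torch.randn(batch, obs_dim)
next_states = torch.randn(batch, obs_dim)
actions     = torch.randint(n_actions, (batch, 1))
rewards     = torch.randn(batch, 1)
dones       = torch.zeros(batch, 1)
weights     = torch.ones(batch, 1)            # PER importance-sampling weights

current_q = q_current(states).gather(1, actions)
# Double DQN: the online net picks a', the target net evaluates it.
best_next = q_current(next_states).argmax(dim=1, keepdim=True)
next_q    = q_target(next_states).gather(1, best_next).detach()
target    = rewards + (1 - dones) * gamma * next_q

sample_wise_loss = F.smooth_l1_loss(current_q, target, reduction='none')
loss = (sample_wise_loss * weights).mean()    # what gets backpropagated
new_priorities = sample_wise_loss.detach().squeeze(1).numpy() + 1e-6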
Beispiel #23
0
class DQNAgent:
    def __init__(
            self,
            env,
            memory_size,
            batch_size,
            target_update=100,
            gamma=0.99,
            # replay parameters
            alpha=0.2,
            beta=0.6,
            prior_eps=1e-6,
            # Categorical DQN parameters
            v_min=0,
            v_max=200,
            atom_size=51,
            # N-step Learning
            n_step=3,
            start_train=32,
            save_weights=True,
            log=True,
            lr=0.001,
            seed=0,
            episodes=200):

        self.env = env

        obs_dim = self.env.observation_dim
        action_dim = self.env.action_dim

        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        self.lr = lr
        self.memory_size = memory_size
        self.seed = seed

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        # memory for 1-step Learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(obs_dim,
                                              memory_size,
                                              batch_size,
                                              alpha=alpha)

        # memory for N-step Learning
        self.use_n_step = True if n_step > 1 else False
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(obs_dim,
                                         memory_size,
                                         batch_size,
                                         n_step=n_step,
                                         gamma=gamma)

        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.atom_size).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(obs_dim, action_dim, self.atom_size,
                           self.support).to(self.device)
        self.dqn_target = Network(obs_dim, action_dim, self.atom_size,
                                  self.support).to(self.device)

        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=self.lr)

        # transition to store in memory
        self.transition = list()

        self.fig, (self.ax1, self.ax2) = plt.subplots(2, figsize=(10, 10))

        self.start_train = start_train

        self.save_weights = save_weights

        self.time = datetime.datetime.now().timetuple()
        self.path = f"weights/{self.time[2]}-{self.time[1]}-{self.time[0]}_{self.time[3]}-{self.time[4]}"

        self.log = log
        self.episode_cnt = 0
        self.episodes = episodes

        if self.save_weights is True:
            self.create_save_directory()

        plt.ion()

    def create_save_directory(self):
        try:
            os.mkdir(self.path)
        except OSError:
            print("Creation of the directory %s failed" % self.path)
        else:
            print("Successfully created the directory %s " % self.path)

    def select_action(self, state):
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
        selected_action = self.dqn(torch.FloatTensor(state).to(
            self.device)).argmax()
        selected_action = selected_action.detach().cpu().numpy()

        self.transition = [state, selected_action]

        return selected_action

    def step(self, action):
        """Take an action and return the response of the env."""
        next_state, reward, done = self.env.step(action)

        self.transition += [reward, next_state, done]

        # N-step transition
        if self.use_n_step:
            one_step_transition = self.memory_n.store(*self.transition)
        # 1-step transition
        else:
            one_step_transition = self.transition

        # add a single step transition
        if one_step_transition:
            self.memory.store(*one_step_transition)

        return next_state, reward, done

    def update_model(self):
        """Update the model by gradient descent."""
        # PER needs beta to calculate weights
        samples = self.memory.sample_batch(self.beta)
        weights = torch.FloatTensor(samples["weights"].reshape(-1, 1)).to(
            self.device)
        indices = samples["indices"]

        # 1-step Learning loss
        elementwise_loss = self._compute_dqn_loss(samples, self.gamma)

        # PER: importance sampling before average
        loss = torch.mean(elementwise_loss * weights)

        # N-step Learning loss
        # Combine the 1-step and n-step losses to reduce variance;
        # the original Rainbow uses the n-step loss only.
        if self.use_n_step:
            gamma = self.gamma**self.n_step
            samples = self.memory_n.sample_batch_from_idxs(indices)
            elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
            elementwise_loss += elementwise_loss_n_loss

            # PER: importance sampling before average
            loss = torch.mean(elementwise_loss * weights)

        self.optimizer.zero_grad()
        loss.backward()
        # print(loss)
        clip_grad_norm_(self.dqn.parameters(), 10.0)
        self.optimizer.step()

        # PER: update priorities
        loss_for_prior = elementwise_loss.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)

        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()

        return loss.item()

    def train(self, num_frames, plotting_interval=100):
        """Train the agent."""

        if self.log:
            pass
            # config = {'gamma': self.gamma, 'log_interval': plotting_interval, 'learning_rate': self.lr,
            #           'directory': self.path, 'type': 'dqn', 'replay_memory': self.memory_size, 'environment': 'normal', 'seed': self.seed}
            # wandb.init(project='is_os', entity='pydqn', config=config, notes=self.env.reward_function, reinit=True, tags=['report'])
            # wandb.watch(self.dqn)

        self.env.reset()
        state = self.env.get_state()
        won = False
        update_cnt = 0
        losses = []
        scores = []
        score = 0
        frame_cnt = 0
        self.episode_cnt = 0

        for frame_idx in range(1, num_frames + 1):
            frame_cnt += 1
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

            fraction = min(frame_cnt / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            # cap each episode at 500 frames
            if frame_cnt == 500:
                done = True

            # if episode ends
            if done:
                if reward > 0:
                    won = True
                self.env.reset()
                state = self.env.get_state()
                self.episode_cnt += 1
                scores.append(score)
                score = 0
                frame_cnt = 0

            # if training is ready
            if len(self.memory) >= self.batch_size:
                loss = self.update_model()
                losses.append(loss)
                update_cnt += 1

                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update()

            # plotting
            if frame_idx % plotting_interval == 0:
                self._plot(frame_idx, scores, losses)

            if frame_idx % 1000 == 0:
                torch.save(self.dqn.state_dict(),
                           f'{self.path}/{frame_idx}.tar')
                print(f"model saved at:\n {self.path}/{frame_idx}.tar")

        # wandb.run.summary['won'] = won
        self.env.close()

    def _compute_dqn_loss(self, samples, gamma):
        """Return categorical dqn loss."""
        device = self.device  # for shortening the following lines
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        action = torch.LongTensor(samples["acts"]).to(device)
        reward = torch.FloatTensor(samples["rews"].reshape(-1, 1)).to(device)
        done = torch.FloatTensor(samples["done"].reshape(-1, 1)).to(device)

        # Categorical DQN algorithm
        delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

        with torch.no_grad():
            # Double DQN
            next_action = self.dqn(next_state).argmax(1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = next_dist[range(self.batch_size), next_action]

            t_z = reward + (1 - done) * gamma * self.support
            t_z = t_z.clamp(min=self.v_min, max=self.v_max)
            b = (t_z - self.v_min) / delta_z
            l = b.floor().long()
            u = b.ceil().long()

            offset = (torch.linspace(
                0, (self.batch_size - 1) * self.atom_size,
                self.batch_size).long().unsqueeze(1).expand(
                    self.batch_size, self.atom_size).to(self.device))

            proj_dist = torch.zeros(next_dist.size(), device=self.device)
            proj_dist.view(-1).index_add_(0, (l + offset).view(-1),
                                          (next_dist *
                                           (u.float() - b)).view(-1))
            proj_dist.view(-1).index_add_(0, (u + offset).view(-1),
                                          (next_dist *
                                           (b - l.float())).view(-1))

        dist = self.dqn.dist(state)
        log_p = torch.log(dist[range(self.batch_size), action])
        elementwise_loss = -(proj_dist * log_p).sum(1)

        return elementwise_loss

    def _target_hard_update(self):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())

    def _plot(self, frame_cnt, scores, losses):
        self.ax1.cla()
        self.ax1.set_title(
            f'frames: {frame_cnt} score: {np.mean(scores[-10:])}')
        self.ax1.plot(scores[-999:], color='red')
        self.ax2.cla()
        self.ax2.set_title(f'loss: {np.mean(losses[-10:])}')
        self.ax2.plot(losses[-999:], color='blue')
        plt.show()
        plt.pause(0.1)

        # needed for wandb to not log nans
        # if frame_cnt < self.start_train + 11:
        #     loss = 0
        # else:
        #     loss = np.mean(losses[-10:])

        if self.log:
            pass
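The heart of _compute_dqn_loss above is the categorical (C51) projection: each support atom is pushed through the Bellman update and its probability mass is split between the two nearest atoms. A single-transition NumPy sketch of that projection (the constants simply mirror the v_min/v_max/atom_size defaults above; the uniform next_dist is illustrative):

import numpy as np

v_min, v_max, atom_size, gamma = 0.0, 200.0, 51, 0.99
support = np.linspace(v_min, v_max, atom_size)
delta_z = (v_max - v_min) / (atom_size - 1)

next_dist = np.full(atom_size, 1.0 / atom_size)   # target-net distribution for a'
reward, done = 1.0, 0.0

t_z = np.clip(reward + (1.0 - done) * gamma * support, v_min, v_max)
b = (t_z - v_min) / delta_z
l, u = np.floor(b).astype(int), np.ceil(b).astype(int)

proj = np.zeros(atom_size)
np.add.at(proj, l, next_dist * (u - b))           # mass toward the lower atom
np.add.at(proj, u, next_dist * (b - l))           # mass toward the upper atom
np.add.at(proj, l[l == u], next_dist[l == u])     # atoms that land exactly on a bin
assert np.isclose(proj.sum(), 1.0)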
Beispiel #24
0
class RL_AGENT_ONE():
    """
    RL agent class
    """
    def __init__(self, memory_size, batch_size, learn_start_time, learn_fre, lr, replay_iters, eps_T, eps_t_init,
        gamma, update_period, board, device, model_path, r_memory_Fname, o_model_name, model_load=False ):
        self.step_now = 0 # record the step
        self.reward_num = 0
        self.reward_accumulated = 0 # delay reward
        self.final_tem = 10 # just for now
        self.step_last_update = 0 # record the last update time 
        self.update_period = update_period # for the off policy
        self.learn_start_time = learn_start_time 
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.alpha = 0.6
        self.beta = 0.4
        self.replay_beta_iters = replay_iters
        self.replay_eps = 1e-6
        self.memory_min_num = 1000 # the minimum number of stored transitions before learning
        self.step_last_learn = 0 # record the last learn step
        self.learn_fre = learn_fre # step frequency to learn
        self.e_greedy = 1 # record the e_greedy value
        self.eps_T = eps_T # time constant of the eps schedule (e.g. ~800,000 steps)
        self.eps_t_init = eps_t_init # offset for the eps schedule
         
        self.device = device
        self.model_path = model_path
        self.mode_enjoy = model_load
        if model_load == False: 
            self.policy_net = DQN(board[0], board[1], action_num).to(device)
            self.target_net = DQN(board[0], board[1], action_num).to(device)
            self.optimizer = optim.Adagrad(self.policy_net.parameters(), lr=lr)
            self.loss_fn = nn.functional.mse_loss # use the MSE loss
            self.memory = PrioritizedReplayBuffer(memory_size, self.alpha)
            self.beta_schedule = LinearSchedule(self.replay_beta_iters, self.beta, 1.0)
        else:
            self.load(o_model_name) 
        #self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr) 
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.action_old = None
        self.dqn_direct_flag = False # show if the dqn action is done
        self.model_save_flag = False
    
    def reset(self):
        """ 
        reset the flag, state, reward for a new half or game
        """
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.dqn_direct_flag = False

    def load(self, old_model):
        """
        load the trained model
        par:
        |old_model:str, the name of the old model
        """
        model_path_t = self.model_path + 't' + old_model
        self.target_net = torch.load(model_path_t, map_location=self.device)
        self.target_net.eval()
        print('target net par', self.target_net.state_dict())

    def save(self):
        """
        save the trained model
        """
        t = time.strftime('%m%d%H%M%S')
        self.model_path_p = self.model_path + 'p' + t + '.pt'
        self.model_path_t = self.model_path + 't' + t + '.pt'
        print('target net par is', self.policy_net.state_dict())
        torch.save(self.policy_net, self.model_path_p)
        torch.save(self.target_net, self.model_path_t)

    def learn(self, env, step_now, obs_old, action, obs_new, reward, done):
        """
        Run one learning step for the agent.
        par:
        |step_now: int, the global step of training
        |env: class Environment (currently unused)
        |transition: action, obs_new, reward
        |obs_old/new: observation instances
        |done: bool, whether the game is over
        """
        """ check if we should update the policy net """
        if step_now - self.step_last_update == self.update_period:
            self.step_last_update = step_now
            self.target_net.load_state_dict(self.policy_net.state_dict())
                
        """ init the obs_new for init learn """
        state_new = self.feature_combine(obs_new) # get the feature state
        state_old = self.feature_combine(obs_old) # get the feature state
        transition_now = (state_old, action, \
            reward, state_new)

        """ augument reward data to the memory """
        if reward > 0:
            self.memory.add(*self.data_augment(transition_now), done)
        self.memory.add(state_old, action, \
            reward, state_new, done)

        """ select the batch memory to update the network """
        step_diff = step_now - self.step_last_learn
        if step_now > self.learn_start_time and \
                step_diff >= self.learn_fre and \
                    self.memory.__len__() > self.memory_min_num:
            self.step_last_learn = step_now # update the self.last learn
            batch_data = self.memory.sample(self.batch_size, \
                    beta=self.beta_schedule.value(step_now))
            s_o_set, actions, rewards, s_n_set, dones, weights, idx_set = batch_data
            loss_list = []
            batch_idx_list = []
            reward_not_zero_cnt = 0
            actions = [torch.tensor(a, device=self.device) \
                    for a in actions]

            """ cnt how many times learn for non reward """
            actions_new = [self.policy_net(s_n).detach().max(0)[1] \
                    for s_n in s_n_set]
            target_values = [self.gamma*self.target_net(s_n).gather(0, actions_new[idx]) \
                    for idx, s_n in enumerate(s_n_set)]
            target_values = [t_*(1 - d_) + r_ \
                  for t_, d_, r_ in zip(target_values, dones, rewards)] 
            policy_values = [self.policy_net(s).gather(0, a) \
                    for s, a in zip(s_o_set, actions)]
            loss = [self.loss_fn(p_v, t_v)+ self.replay_eps \
                    for p_v, t_v in zip(policy_values, target_values)]
            loss_back = sum(loss) / self.batch_size

            """ update the par """
            self.optimizer.zero_grad()
            loss_back.backward()
            self.optimizer.step()
            self.memory.update_priorities(idx_set, torch.tensor(loss).detach().numpy())

        """ check if we should save the model """
        if self.model_save_flag == True:
            self.save()

    def select_egreedy(self, q_value, step_now):
        """
        select the action by e-greedy policy
        arg:
        |q_value: the Q-values used for greedy action selection
        """
        self.e_greedy = np.exp((self.eps_t_init - step_now) / self.eps_T)
        if self.e_greedy < 0.3:
            self.e_greedy = 0.3

        """ if we are in enjoying mode """
        if self.mode_enjoy == True:
            print('q_value is', q_value)
            self.e_greedy = 0.3

        """ select the action by e-greedy """
        if np.random.random() > self.e_greedy:
            action = action_list[ \
                    np.where(q_value==np.max(q_value))[0][0] ]
        else:
            action = action_list[np.random.randint(action_num)]
        return action

    def feature_combine(self, obs):
        """ 
        This file extract features from the obs.layers and 
        combine them into a new feature layer
        Used feature layers:    
        """
        """ combine all the layers """
        feature_c = obs.copy()
        feature_c = feature_c.astype(np.float32)
        feature_c = torch.tensor(feature_c, dtype=torch.float32, device=self.device)
        size = feature_c.shape
        feature_c = feature_c.resize_(1, 1, size[0], size[1])
        return feature_c

    def data_augment(self, transition):
        """
        Flip the features to augment the experience and mitigate
        the sparse-reward problem.
        par:
        |transition: tuple (feature_o, action, reward, feature_n)
        """
        flip_ver_dim = 2
        feature_old = transition[0]
        action = transition[1]
        feature_new = transition[3]
        reward = transition[2]

        """ vertical flip """
        feature_o_aug = feature_old.flip([flip_ver_dim])
        feature_n_aug = feature_new.flip([flip_ver_dim])

        """ vertical :action flip """
        if action == 0:  action = 1
        elif action == 1: action = 0

        return feature_o_aug, action, reward, feature_n_aug

    def act(self, map, step_now):
        """ this func is interact with the competition func """
        dqn_action = -1 # reset
        state_old = self.feature_combine(map) # get the feature
        q_values = self.policy_net(state_old)
        action = self.select_egreedy( \
            q_values.cpu().detach().numpy(), step_now)# features to model

        return action

    def act_enjoy(self, map):
        """ this func is interact with the competition func """
        dqn_action = -1 # reset
        step_now = self.eps_T
        state_old = self.feature_combine(map) # get the feature
        q_values = self.target_net(state_old)
        action = self.select_egreedy( \
            q_values.cpu().detach().numpy(), step_now)# features to model

        return action
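select_egreedy above anneals epsilon with an exponential schedule, exp((eps_t_init - step_now) / eps_T), floored at 0.3. A tiny sketch of that curve with made-up constants:

import numpy as np

eps_t_init, eps_T = 0, 800000          # illustrative values only

def e_greedy(step_now):
    return max(np.exp((eps_t_init - step_now) / eps_T), 0.3)

for step in (0, 200000, 800000, 2000000):
    print(step, round(e_greedy(step), 3))   # 1.0, 0.779, 0.368, 0.3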
Beispiel #25
0
def main():

#    env = gym.make("CartPoleRob-v0")
#    env = gym.make("CartPole-v0")
#    env = gym.make("CartPole-v1")
#    env = gym.make("Acrobot-v1")
#    env = gym.make("MountainCarRob-v0")
#    env = gym.make("FrozenLake-v0")
#    env = gym.make("FrozenLake8x8-v0")
    env = gym.make("FrozenLake8x8nohole-v0")
    
#    robShape = (2,)
#    robShape = (3,)
#    robShape = (200,)
#    robShape = (16,)
    robShape = (64,)
    def make_obs_ph(name):
#        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(robShape, name=name)

#    # these params are specific to mountaincar
#    def getOneHotObs(obs):
#        obsFraction = (obs[0] + 1.2) / 1.8
#        idx1 = np.int32(np.trunc(obsFraction*100))
#        obsFraction = (obs[1] + 0.07) / 0.14
#        idx2 = np.int32(np.trunc(obsFraction*100))
#        ident = np.identity(100)
#        return np.r_[ident[idx1,:],ident[idx2,:]]

    # these params are specific to frozenlake
    def getOneHotObs(obs):
#        ident = np.identity(16)
        ident = np.identity(64)
        return ident[obs,:]

    model = models.mlp([32])
#    model = models.mlp([64])
#    model = models.mlp([64], layer_norm=True)
#    model = models.mlp([16, 16])

    # parameters
    q_func=model
    lr=1e-3
#    max_timesteps=100000
    max_timesteps=50000
#    max_timesteps=10000
    buffer_size=50000
    exploration_fraction=0.1
#    exploration_fraction=0.3
    exploration_final_eps=0.02
#    exploration_final_eps=0.1
    train_freq=1
    batch_size=32
    print_freq=10
    checkpoint_freq=10000
    learning_starts=1000
    gamma=1.0
    target_network_update_freq=500
#    prioritized_replay=False
    prioritized_replay=True
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
    prioritized_replay_eps=1e-6
    num_cpu=16

#    # try mountaincar w/ different input dimensions
#    inputDims = [50,2]
    
    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()


    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    obs = getOneHotObs(obs)
    
#    with tempfile.TemporaryDirectory() as td:
    model_saved = False
#        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        new_obs = getOneHotObs(new_obs)
        
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs
        
        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            obs = getOneHotObs(obs)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            
            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#        if done:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
#            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#                logger.record_tabular("steps", t)
#                logger.record_tabular("episodes", num_episodes)
#                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
#                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
#                logger.dump_tabular()
#        sess
            

    num2avg = 20
    rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg
    plt.plot(rListAvg)
#    plt.plot(episode_rewards)
    plt.show()

    sess
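Both the exploration schedule and the beta schedule above are linear interpolations. A minimal stand-in for what a baselines-style LinearSchedule computes (a sketch under that assumption, not the actual class imported by the example):

def linear_value(t, schedule_timesteps, initial_p=1.0, final_p=0.02):
    """Interpolate from initial_p to final_p over schedule_timesteps, then hold."""
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# Exploration as configured above: 10% of 50000 steps, decaying 1.0 -> 0.02.
print(linear_value(0, 5000), linear_value(2500, 5000), linear_value(10000, 5000))
# -> 1.0  0.51  0.02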
Beispiel #26
0
def train(
  env_name,
  file_name, 
  network_type,
  env_seed = None,
  seed = None,
  
  buffer_size = int(1e5),
  alpha = 0.6,
  batch_size = 32,
  reward_min = -1.0,
  reward_max = 1.0,
  reward_discount = 0.99,
  
  epsilon_start = 1.0,
  epsilon_end = 0.05,
  epsilon_decay_step = int(1e6),
  beta_start = 0.4,
  beta_end = 1.0,
  beta_decay_step = int(1e7),
  lrs = [5e-5, 5e-6],
  lr_cutoff_steps = [int(8e6)],
  
  total_steps = int(1e7),
  initial_buffer_size = int(1e5),
  target_network_update_step = 1000,
  training_step = 4,
  last_k_episodes = 100,
  print_frequency = 10,
  save_frequency = 0.01
  ):
  # Create folders.
  if not os.path.isdir(SAVE_DIR):
    os.makedirs(SAVE_DIR)
  if not os.path.isdir(CSV_DIR):
    os.makedirs(CSV_DIR)
  
  # Create environment.
  env = make_atari(env_name)
  if env_seed is not None:
    env.seed(env_seed)
  obs_shape = env.observation_space.shape
  num_action = env.action_space.n
  
  if seed is not None:
    np.random.seed(seed)
    tf.set_random_seed(seed)
  
  # Initialize step schedules.
  epsilon = LinearSchedule(start = epsilon_start, end = epsilon_end, decay_step = epsilon_decay_step)
  beta = LinearSchedule(start = beta_start, end = beta_end, decay_step = beta_decay_step)
  learning_rate = StaircaseSchedule(values = lrs, cutoff_steps = lr_cutoff_steps)
  
  # Initialize replay buffer.
  replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha = alpha)
  
  # Build model graph.
  model_graph = ModelGraph(obs_shape, num_action, network_type = network_type, gamma = reward_discount)
  
  # Initialize session and variables.
  sess = tf.InteractiveSession()
  model_graph.initialize_variables()
  model_graph.update_target_network()
  
  start_time = time.time()
  list_step = []
  list_episodic_reward = []
  list_mean_episodic_reward = []
  episodic_reward = 0
  highest_episodic_reward = None
  
  obs = env.reset()
  for step in range(1, total_steps):
    # Synchronize the target network periodically (target network <- main network).
    if step > initial_buffer_size and step % target_network_update_step == 0:
      model_graph.update_target_network()
    
    # Sample action with epsilon-greedy policy.
    action = model_graph.epsilon_act(np.expand_dims(obs, axis = 0), epsilon.get_value(step))[0]
    
    # Interact with the environment.
    obs_next, reward, done, _ = env.step(action)
    episodic_reward += reward
    if done:
      obs_next = env.reset()
      
      # Record episodic reward.
      list_step.append(step)
      list_episodic_reward.append(episodic_reward)
      mean_episodic_reward = np.round(np.mean(list_episodic_reward[-last_k_episodes:]), 2)
      list_mean_episodic_reward.append(mean_episodic_reward)
      if len(list_episodic_reward) % print_frequency == 0:
        print("Episode ", str(len(list_episodic_reward)), ": step = ", step, ", mean reward = ", mean_episodic_reward, ".", sep = "")
      
      # Save the network when the mean episodic reward breaks the record.
      if step >= initial_buffer_size and len(list_episodic_reward) >= last_k_episodes:
        if highest_episodic_reward is None or mean_episodic_reward > highest_episodic_reward:
          if np.random.uniform() < save_frequency:
            model_graph.save(SAVE_DIR + file_name)
            print("Save the network as mean episodic reward increases from ", highest_episodic_reward, " to ", mean_episodic_reward, ".", sep = "")
            highest_episodic_reward = mean_episodic_reward
      
      episodic_reward = 0
    # Store data.
    data = (obs, action, reward, done, obs_next)
    replay_buffer.append(data)
    # Update observation.
    obs = obs_next
    
    # Train the agent.
    if step > initial_buffer_size and step % training_step == 0:
      # Sample training data from the replay buffer.
      batch_index, batch_data, batch_weights = replay_buffer.sample(batch_size, beta.get_value(step))
      batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \
        [np.array([batch_data[j][i] for j in range(batch_size)]) for i in range(len(batch_data[0]))]
      
      # Clip the reward.
      batch_reward = np.clip(batch_reward, reward_min, reward_max)
      
      # One train step.
      td_error = model_graph.train(batch_obs, batch_action, batch_reward, batch_done, batch_obs_next, batch_weights, learning_rate.get_value(step))
      
      # Update priority for the sampled data.
      replay_buffer.update_priorities(batch_index, td_error)
  
  sess.close()
  tf.contrib.keras.backend.clear_session()
  
  total_time = int(time.time() - start_time)
  print("Training finished in ", total_time, " s.", sep = "")
  
  # Close the environment.
  env.close()
  
  # Store data in a csv file.
  record = pd.DataFrame({"Step": list_step, "Mean Episodic Reward": list_mean_episodic_reward})
  record.to_csv(CSV_DIR + file_name + ".csv", sep = ",", index = False)
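The nested comprehension that unpacks batch_data above transposes a list of (obs, action, reward, done, obs_next) tuples into five arrays. An equivalent and arguably clearer formulation, assuming batch_data really is such a list of tuples (as the replay_buffer.append(data) call suggests):

import numpy as np

# Stand-in for the sampled batch: a list of transition tuples.
batch_data = [
    (np.zeros((4, 84, 84)), 1, 1.0, False, np.ones((4, 84, 84))),
    (np.zeros((4, 84, 84)), 0, 0.0, True,  np.ones((4, 84, 84))),
]

batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = (
    np.array(column) for column in zip(*batch_data)
)
print(batch_obs.shape, batch_action, batch_reward, batch_done)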