Example 1
import theano
import theano.tensor as T
import lasagne
from agentnet.learning import a2c


def get_train_step(policy_seq, V_seq, weights, replay):
    # Train via actor-critic (see https://www.youtube.com/watch?v=KHZVXao4qXs)

    # debug: print tensor values whenever the compiled function runs
    policy_seq = T.printing.Print(">>>> policy_seq; ")(policy_seq)
    V_seq = T.printing.Print(">>>> V_seq; ")(V_seq)
    # note: the assert action.dim==2 line in this method was commented out
    elwise_mse_loss = a2c.get_elementwise_objective(
        policy_seq,
        V_seq[:, :, 0],
        replay.actions[0],
        replay.rewards,
        replay.is_alive,
        gamma_or_gammas=0.99)

    # compute mean over "alive" fragments
    loss = elwise_mse_loss.sum() / replay.is_alive.sum()

    # penalize near-zero action probabilities to keep the policy from collapsing
    reg = T.mean((1. / policy_seq).sum(axis=-1))
    loss += 0.01 * reg

    # Compute weight updates
    updates = lasagne.updates.rmsprop(loss, weights, learning_rate=0.001)
    # updates = lasagne.updates.adam(loss, weights, learning_rate=0.001)

    # compile train function

    train_step = theano.function([], loss, updates=updates)

    return train_step
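
For context, a minimal usage sketch follows. It assumes an agent whose policy head outputs action probabilities, a pool exposing an experience-replay environment, a resolver output layer, and an update_pool helper like the one defined in Example 4 below; all of these names are assumptions, not part of the snippet above.

import lasagne

# hypothetical wiring -- agent, pool, resolver and update_pool are assumed to be
# set up along the lines of the later examples on this page
replay = pool.experience_replay
_, _, _, _, (policy_seq, V_seq) = agent.get_sessions(replay, experience_replay=True)

weights = lasagne.layers.get_all_params(resolver, trainable=True)
train_step = get_train_step(policy_seq, V_seq, weights, replay)

for epoch in range(100):
    update_pool(replay, pool, n_steps=25)       # refresh the replay sessions
    print("loss: %.5f" % train_step())          # one RMSProp step on the fresh batch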
Example 2
import theano.tensor as T


def get_a2c_loss_symbolic(agent, pool, reward_koeff=1, gamma=0.99):

    # get the agent's policy logits and state values via experience replay;
    # scan is not unrolled here and automatic updates are propagated instead,
    # which speeds up compilation at the cost of some runtime speed
    replay = pool.experience_replay
    _, _, _, _, (logits_seq, V_seq) = agent.get_sessions(replay, experience_replay=True)
    
    
    # compute pi(a|s) and log pi(a|s) explicitly via logsoftmax;
    # Theano is not guaranteed to fuse log(softmax(...)) into a numerically stable logsoftmax on its own
    logits_flat = logits_seq.reshape([-1, logits_seq.shape[-1]])
    policy_seq = T.nnet.softmax(logits_flat).reshape(logits_seq.shape)
    logpolicy_seq = T.nnet.logsoftmax(logits_flat).reshape(logits_seq.shape)

    # get policy gradient
    from agentnet.learning import a2c
    elwise_actor_loss,elwise_critic_loss = a2c.get_elementwise_objective(policy=logpolicy_seq,
                                                                         treat_policy_as_logpolicy=True,
                                                                         state_values=V_seq[:,:,0],
                                                                         actions=replay.actions[0],
                                                                         rewards=replay.rewards*reward_koeff,
                                                                         is_alive=replay.is_alive,
                                                                         gamma_or_gammas=gamma,
                                                                         n_steps=None,
                                                                         return_separate=True)
        
    # the loss coefficients below are somewhat arbitrary; changing them usually
    # just makes learning faster or slower. The entropy and logit penalties
    # regularize the policy towards exploration.
    reg_logits = T.mean(logits_seq ** 2)
    reg_entropy = T.mean(T.sum(policy_seq * logpolicy_seq, axis=-1))

    # add up the loss components
    loss = 0.1 * elwise_actor_loss.mean() + \
           0.25 * elwise_critic_loss.mean() + \
           1e-3 * reg_entropy + \
           1e-3 * reg_logits

    return loss
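
A short sketch of turning this symbolic loss into a compiled training step, assuming agent, pool and a weights list (e.g. from lasagne.layers.get_all_params) are defined as in the neighbouring examples:

import theano
import lasagne

# assumed setup: agent, pool and weights come from the surrounding examples
loss = get_a2c_loss_symbolic(agent, pool, reward_koeff=1e-3, gamma=0.99)

updates = lasagne.updates.adam(loss, weights, learning_rate=1e-4)
rng_updates = agent.get_automatic_updates()  # random-state updates, as in Example 3

train_step = theano.function([], loss, updates=rng_updates + updates)
print("loss: %.5f" % train_step())  # one gradient step on the current replay sessions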
Example 3
    def make_train_fun(
            self,
            agent,
            sequence_length=25,  # how many steps to make before updating weights
            observation_shape=(1, 64, 64),  # same as env.observation_space.shape
            reward_scale=1e-3,  # rewards are multiplied by this; useful if raw rewards are large
            gamma=0.99,  # discount factor for TD targets
    ):
        """Compiles a function to train for one step"""

        # make replay environment
        observations = T.tensor(theano.config.floatX,
                                broadcastable=(False,) * (2 + len(observation_shape)),
                                name="observations[b,t,color,width,height]")

        actions = T.imatrix("actions[b,t]")
        rewards, is_alive = T.matrices("rewards[b,t]", "is_alive[b,t]")
        prev_memory = [l.input_var for l in agent.agent_states.values()]

        replay = SessionBatchEnvironment(observations, [observation_shape],
                                         actions=actions,
                                         rewards=rewards,
                                         is_alive=is_alive)

        # replay sessions
        _, _, _, _, (logits_seq, V_seq) = agent.get_sessions(
            replay,
            session_length=sequence_length,
            experience_replay=True,
            initial_hidden=prev_memory,
            unroll_scan=False,  # speeds up compilation ~10x, slows down training ~20% (still 4x faster than TF :P)
        )
        rng_updates = agent.get_automatic_updates()  # updates of random states (passed to the compiled function)

        # compute pi(a|s) and log pi(a|s) explicitly via logsoftmax;
        # Theano is not guaranteed to fuse log(softmax(...)) into a numerically stable logsoftmax on its own
        logits_flat = logits_seq.reshape([-1, logits_seq.shape[-1]])
        policy_seq = T.nnet.softmax(logits_flat).reshape(logits_seq.shape)
        logpolicy_seq = T.nnet.logsoftmax(logits_flat).reshape(logits_seq.shape)

        # get policy gradient
        elwise_actor_loss, elwise_critic_loss = a2c.get_elementwise_objective(
            policy=logpolicy_seq,
            treat_policy_as_logpolicy=True,
            state_values=V_seq[:, :, 0],
            actions=replay.actions[0],
            rewards=replay.rewards * reward_scale,
            is_alive=replay.is_alive,
            gamma_or_gammas=gamma,
            n_steps=None,
            return_separate=True)

        # the loss coefficients below are somewhat arbitrary; changing them usually
        # just makes learning faster or slower. The entropy and logit penalties
        # regularize the policy towards exploration.
        reg_logits = T.mean(logits_seq ** 2)
        reg_entropy = T.mean(T.sum(policy_seq * logpolicy_seq, axis=-1))
        loss = (0.1 * elwise_actor_loss.mean() +
                0.25 * elwise_critic_loss.mean() +
                1e-3 * reg_entropy +
                1e-2 * reg_logits)

        # Compute weight updates, clip by norm
        grads = T.grad(loss, self.weights)
        grads = lasagne.updates.total_norm_constraint(grads, 10)

        updates = lasagne.updates.adam(grads, self.weights, 1e-4)

        # compile train function
        inputs = [observations, actions, rewards, is_alive] + prev_memory
        return theano.function(inputs,
                               updates=rng_updates + updates,
                               allow_input_downcast=True)
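
A usage sketch for the compiled function, assuming trainer is an instance of the class this method belongs to and pool is an environment pool whose interact returns the same 6-tuple as in the other examples; the names are illustrative only.

# hypothetical call site -- trainer, agent and pool are assumed to exist
train_step = trainer.make_train_fun(agent,
                                    sequence_length=25,
                                    observation_shape=(1, 64, 64))

prev_memory = list(pool.prev_memory_states)  # one array per agent memory layer
obs, actions, rewards, _, is_alive, _ = pool.interact(n_steps=25)

# obs is [batch, time, 1, 64, 64]; actions, rewards and is_alive are [batch, time]
train_step(obs, actions, rewards, is_alive, *prev_memory)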
Example 4
def test_space_invaders(
    game_title='SpaceInvaders-v0',
    n_parallel_games=3,
    replay_seq_len=2,
):
    """
    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """

    atari = gym.make(game_title)
    atari.reset()

    # Game Parameters
    n_actions = atari.action_space.n
    observation_shape = (None, ) + atari.observation_space.shape
    del atari
    # ##### Agent observations

    # image observation at current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # reshape to [batch, color, x, y] so that convolutional layers work correctly
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # Agent memory states
    window_size = 3

    # prev state input
    prev_window = InputLayer(
        (None, window_size) + tuple(observation_reshape.output_shape[1:]),
        name="previous window state")

    # our window
    window = WindowAugmentation(observation_reshape,
                                prev_window,
                                name="new window state")

    memory_dict = {window: prev_window}

    # ##### Neural network body
    # you may use any other lasagne layers, including convolutions, batch_norms, maxout, etc

    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(window,
                                 lambda a: a.max(axis=1),
                                 output_shape=(None, ) +
                                 window.output_shape[2:])

    # a simple lasagne network (try replacing with any other lasagne network and see what works best)
    nn = DenseLayer(window_max, num_units=50, name='dense0')

    # Agent policy and action picking
    q_eval = DenseLayer(nn,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    # extra heads used by a2c in this test: action probabilities and state value
    policy_eval = DenseLayer(nn,
                             num_units=n_actions,
                             nonlinearity=lasagne.nonlinearities.softmax,
                             name="a2c action probas")
    state_value_eval = DenseLayer(nn,
                                  num_units=1,
                                  nonlinearity=None,
                                  name="a2c state values")
    # resolver
    resolver = ProbabilisticResolver(policy_eval, name="resolver")

    # agent
    agent = Agent(observation_layer, memory_dict,
                  (q_eval, policy_eval, state_value_eval), resolver)

    # Since it's a single lasagne network, one can get its weights, outputs, etc
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # Agent step function
    # # Create and manage a pool of atari sessions to play with

    pool = EnvPool(agent, game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(50)

    # # experience replay pool
    # Create an environment with all default parameters
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """ a function that creates new sessions and ads them into the pool
        throwing the old ones away entirely for simplicity"""

        preceding_memory_states = list(pool.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact(
            n_steps=n_steps)

        # load them into experience replay environment
        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

    # load the first sessions
    update_pool(env, pool, replay_seq_len)

    # A more sophisticated way of training is to store a large pool of sessions and train on random batches of them.
    # ### Training via experience replay

    # get agent's Q-values, policy, etc obtained via experience replay
    _env_states, _observations, _memories, _imagined_actions, estimators = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        experience_replay=True,
    )
    (q_values_sequence, policy_sequence, value_sequence) = estimators

    # Evaluating loss function

    scaled_reward_seq = env.rewards
    # for SpaceInvaders, leaving rewards unscaled works well enough

    elwise_mse_loss = 0.

    #1-step algos
    for algo in qlearning, sarsa:
        elwise_mse_loss += algo.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
        )
    #qlearning_n_step
    for n in (1, 3, replay_seq_len - 1, replay_seq_len, replay_seq_len + 1,
              None):
        elwise_mse_loss += qlearning.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
            n_steps=n)

    #a2c n_step

    elwise_mse_loss += a2c.get_elementwise_objective(policy_sequence,
                                                     value_sequence[:, :, 0],
                                                     env.actions[0],
                                                     scaled_reward_seq,
                                                     env.is_alive,
                                                     gamma_or_gammas=0.99,
                                                     n_steps=3)

    # compute mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # regularize network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-4

    loss = mse_loss + reg_l2

    # Compute weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # # Compile train and evaluation functions

    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward],
                                updates=updates)
    evaluation_fun = theano.function(
        [], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # # Training loop

    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        loss, avg_reward = train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i,loss %.5f, rewards: %.5f " %
              (epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))