def main():
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)
#        return U.BatchInput(env.observation_space.shape, name)

#    env = gym.make("CartPole-v0")
#    env = gym.make("CartPole-v1")
#    env = gym.make("Acrobot-v1")
#    env = gym.make("MountainCar-v0")

    env = gym.make("TestRob-v0")

    #    model = models.mlp([32])
    model = models.mlp([64])
    #    model = models.mlp([16, 16])

    # parameters
    q_func = model
    lr = 1e-3
    max_timesteps = 100000
    #    max_timesteps=10000
    buffer_size = 50000
    exploration_fraction = 0.1
    #    exploration_fraction=0.3
    exploration_final_eps = 0.02
    train_freq = 1
    batch_size = 32
    print_freq = 10
    checkpoint_freq = 10000
    learning_starts = 1000
    gamma = 1.0
    target_network_update_freq = 500
    #    prioritized_replay=False
    prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    #    # try mountaincar w/ different input dimensions
    #    inputDims = [50,2]

    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    #    with tempfile.TemporaryDirectory() as td:
    model_saved = False
    #        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size,
                                                  beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                              weights)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:

            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)

        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #        if done:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))))


#            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#                logger.record_tabular("steps", t)
#                logger.record_tabular("episodes", num_episodes)
#                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
#                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
#                logger.dump_tabular()
#        sess

    plt.plot(episode_rewards)
    plt.show()

# Example 2
def main():

    #    env = gym.make("CartPoleRob-v0")
    #    env = gym.make("CartPole-v0")
    #    env = gym.make("CartPole-v1")
    #    env = gym.make("Acrobot-v1")
    #    env = gym.make("MountainCarRob-v0")
    #    env = gym.make("FrozenLake-v0")
    #    env = gym.make("FrozenLake8x8-v0")
    #    env = gym.make("FrozenLake8x8rob-v0")
    #    env = gym.make("FrozenLake16x16rob-v0")
    env = gym.make("TestRob3-v0")

    # input: batch x n x n x 1 tensor of observations
    # output: 4 x batch matrix of (agent row, agent col, ghost row, ghost col);
    #         a value of 3 means the agent/ghost is not inside that patch
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: num_patches x dn x dn x channels
    def getDeicticObs(obses_t, windowLen):
        deicticObses_t = []
        for i in range(np.shape(obses_t)[0] - windowLen + 1):
            for j in range(np.shape(obses_t)[1] - windowLen + 1):
                deicticObses_t.append(obses_t[i:i + windowLen,
                                              j:j + windowLen, :])
        return np.array(deicticObses_t)
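
    # For example, with an 8x8x1 observation (as TestRob3-v0 appears to use
    # here) and windowLen = 3, getDeicticObs returns a (36, 3, 3, 1) array of
    # overlapping patches: (8 - 3 + 1)**2 = 36, matching num_deictic_patches.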

    # get set of deictic alternatives
    # input: batch x n x n x channels
    # output: (batch x deictic) x dn x dn x channels
    def getDeictic(obses_t, actions, obses_tp1, weights, windowLen):
        deicticObses_t = []
        deicticActions = []
        deicticObses_tp1 = []
        deicticWeights = []
        for i in range(np.shape(obses_t)[0]):
            for j in range(np.shape(obses_t)[1] - windowLen + 1):
                for k in range(np.shape(obses_t)[2] - windowLen + 1):
                    deicticObses_t.append(obses_t[i, j:j + windowLen,
                                                  k:k + windowLen, :])
                    deicticActions.append(actions[i])
                    deicticObses_tp1.append(obses_tp1[i, j:j + windowLen,
                                                      k:k + windowLen, :])
                    deicticWeights.append(weights[i])

        return np.array(deicticObses_t), np.array(deicticActions), np.array(
            deicticObses_tp1), np.array(deicticWeights)

    # Get deictic patch groupings
    # input: obses_deic -> N x dn x dn x channels stack of deictic patches
    # output: group_matching -> assignment of each row in obses_deic to a group
    #         of pixel-identical patches
#    def getDeicticGroups(obses_deic, actions_deic, max_num_groups):

    def getDeicticGroups(obses_deic, max_num_groups):

        # create groups of equal obs/actions
        shape = np.shape(obses_deic)
        obses_deic_flat = np.reshape(obses_deic,
                                     [shape[0], shape[1] * shape[2]])
        _, group_matching, group_counts = np.unique(obses_deic_flat,
                                                    axis=0,
                                                    return_inverse=True,
                                                    return_counts=True)

        #        obses_actions_deic_flat = np.c_[obses_deic_flat,actions_deic]
        #        _, group_matching, group_counts = np.unique(obses_actions_deic_flat,axis=0,return_inverse=True,return_counts=True)

        #        # take max_num_groups of most frequent groups
        #        group_indices = np.float32(np.r_[np.array([group_counts]),np.array([range(np.shape(group_counts)[0])])])
        #        group_indices[0] = group_indices[0] + np.random.random(np.shape(group_indices)[1])*0.1 # add small random values to randomize sort order for equal numbers
        #        group_indices_sorted = group_indices[:,group_indices[0,:].argsort()]
        #        group_indices_to_keep = np.int32(group_indices_sorted[1,-max_num_groups:])
        #
        #        # Replace group numbers with new numbers in 0:max_num_groups
        #        # All elts with group=max_num_groups have no group.
        #        new_group_matching = np.ones(np.shape(group_matching)[0])*max_num_groups
        #        for i in range(np.shape(group_indices_to_keep)[0]):
        #            idx = np.nonzero(group_matching == group_indices_to_keep[i])
        #            new_group_matching[idx] = i
        #
        #        # Get final list of groups. Get observations, actions corresponding to each group
        #        groups,idx = np.unique(new_group_matching,return_index=True)
        #        groups_idx = np.r_[np.array([groups]),np.array([idx])]
        #        groups_idx_sorted = groups_idx[:,groups_idx[0].argsort()]
        #        groups = groups_idx_sorted[0]
        #        idx = np.int32(groups_idx_sorted[1,:-1])
        #        group_obs = obses_deic_flat[idx]
        #        group_actions = actions_deic[idx]
        #
        #        # reshape output observations
        #        obsshape = np.shape(group_obs)
        #        group_obs = np.reshape(group_obs,(obsshape[0],np.int32(np.sqrt(obsshape[1])),np.int32(np.sqrt(obsshape[1])),1))

        # Get final list of groups. Get observations, actions corresponding to each group
        groups, idx = np.unique(group_matching, return_index=True)
        group_obs = obses_deic_flat[idx]

        # reshape output observations
        obsshape = np.shape(group_obs)
        group_obs = np.reshape(group_obs,
                               (obsshape[0], shape[1], shape[2], shape[3]))

        #        return new_group_matching, group_obs, group_actions
        #        return group_matching, group_obs
        return group_matching

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong
        #        hiddens=[256],  # used in pong
        #        convs=[(8,4,1)], # used for non-deictic TestRob3-v0
        #        convs=[(8,3,1)], # used for deictic TestRob3-v0
        convs=[(16, 3, 1)],  # used for deictic TestRob3-v0
        #        convs=[(4,3,1)], # used for deictic TestRob3-v0
        #        convs=[(16,3,1)], # used for deictic TestRob3-v0
        #        convs=[(8,2,1)], # used for deictic TestRob3-v0
        hiddens=[16],
        dueling=True)

    #    model = models.mlp([6])

    # parameters
    q_func = model
    lr = 1e-3
    #    lr=1e-4
    #    max_timesteps=100000
    #    max_timesteps=50000
    max_timesteps = 20000
    buffer_size = 50000
    #    exploration_fraction=0.1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    #    exploration_final_eps=0.005
    #    exploration_final_eps=0.1
    print_freq = 10
    checkpoint_freq = 10000
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500
    prioritized_replay = False
    #    prioritized_replay=True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    #    batch_size=32
    #    train_freq=1
    batch_size = 64
    train_freq = 2
    #    batch_size=128
    #    train_freq=4
    #    batch_size=256
    #    train_freq=4
    #    batch_size=512
    #    train_freq=8
    #    batch_size=1024
    #    train_freq=8
    #    batch_size=2048
    #    train_freq=8
    #    batch_size=4096
    #    train_freq=8

    max_num_groups = 600

    # deicticShape must be square.
    # These two parameters need to be consistent w/ each other: for the 8x8
    # observations used here, num_deictic_patches must equal
    # (8 - deicticShape[0] + 1)**2, e.g. a 3x3 window gives 36 patches.
    #    deicticShape = (2,2,1)
    #    num_deictic_patches=36
    deicticShape = (3, 3, 1)
    num_deictic_patches = 36
    #    deicticShape = (4,4,1)
    #    num_deictic_patches=25
    #    deicticShape = (5,5,1)
    #    num_deictic_patches=16
    #    deicticShape = (6,6,1)
    #    num_deictic_patches=9
    #    deicticShape = (7,7,1)
    #    num_deictic_patches=4
    #    deicticShape = (8,8,1)
    #    num_deictic_patches=1

    num_actions = 4
    tabularQ = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])
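
    # The table above is 4 x 4 x 4 x 4 x num_actions for deicticShape = (3, 3, 1):
    # indices 0-2 are the agent/ghost (row, col) positions inside a patch, and
    # index 3 (the default in convertState) means "not in this patch". Entries
    # start at 100 and are only ever lowered by the min-update in the loop below.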
    OHEnc = np.identity(max_num_groups)

    def make_obs_ph(name):
        #        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(deicticShape, name=name)

    matchShape = (batch_size * 25, )

    def make_match_ph(name):
        return U.BatchInput(matchShape, name=name)

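    # Tabular analogue of the deictic min-update (a summary inferred from the
    # code below, not documentation from the original source):
    #   q_tp1_target: per-sample backed-up value, rewards + gamma * max over
    #       next-state patches and actions, zeroed when the episode is done
    #   desc_2_state: batch x max_num_groups indicator of which descriptor
    #       groups occur in each sample
    #   target_min_per_D: per-descriptor minimum of the targets over all
    #       samples containing that descriptor
    #   targets: target_min_per_D broadcast back to every deictic patch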
    def parallelUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic,
                       group_matching, dones, q_tp1, batch_size,
                       num_deictic_patches, max_num_groups):
        q_tp1_target = rewards + gamma * np.max(
            np.reshape(np.max(q_tp1, 1), [batch_size, num_deictic_patches]), 1)
        q_tp1_target = (1 - dones) * q_tp1_target

        group_matching_onehot = OHEnc[group_matching]
        desc_2_state = np.max(
            np.reshape(group_matching_onehot,
                       [batch_size, num_deictic_patches, max_num_groups]), 1)

        max_target = np.max(q_tp1_target)
        target_min_per_D = np.min(
            desc_2_state * np.tile(np.reshape(q_tp1_target, [batch_size, 1]),
                                   [1, max_num_groups]) +
            (1 - desc_2_state) * max_target, 0)

        # Note: the dot-product version below produced inconsistent,
        # unpredictable results for unknown reasons, so the explicit tile/sum
        # version below it is used instead.
        #        targets1 = np.dot(group_matching_onehot,target_min_per_D)
        targets = np.sum(
            group_matching_onehot *
            np.tile(np.reshape(target_min_per_D, [1, max_num_groups]),
                    [batch_size * num_deictic_patches, 1]), 1)

        D_2_DI = group_matching_onehot

        return q_tp1_target, desc_2_state, target_min_per_D, D_2_DI, targets

    sess = U.make_session(num_cpu)
    sess.__enter__()

    #    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min(
    #    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min_streamlined(
    #    getq, trainWOUpdate = build_graph.build_train_deictic_min_streamlined(
    getq, train, trainWOUpdate, update_target = build_graph.build_train_deictic_min_streamlined(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        batch_size=batch_size,
        num_deictic_patches=num_deictic_patches,
        max_num_groups=max_num_groups,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        double_q=False)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    #    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()

    #    with tempfile.TemporaryDirectory() as td:
    model_saved = False
    #        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        # get action to take
        #        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        #        qvalues = getq(np.array(obs)[None])
        #        action = np.argmax(qvalues)
        #        if np.random.rand() < exploration.value(t):
        #            action = np.random.randint(env.action_space.n)

        deicticObs = getDeicticObs(obs, deicticShape[0])
        #        qvalues = getq(np.array(deicticObs))
        stateCurr = convertState(deicticObs)
        qvalues = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2],
                           stateCurr[3], :]
        action = np.argmax(np.max(qvalues, 0))
        selPatch = np.argmax(np.max(qvalues, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

#        # temporarily take uniformly random actions all the time
#        action = np.random.randint(env.action_space.n)
#        env.render()

        new_obs, rew, done, _ = env.step(action)

        # display state, action, nextstate (debug block: with max_timesteps =
        # 20000 the condition below never triggers unless the run is extended)
        if t > 20000:
            toDisplay = np.reshape(new_obs, (8, 8))
            toDisplay[np.int32(np.floor_divide(selPatch,
                                               np.sqrt(num_deictic_patches))),
                      np.int32(np.remainder(selPatch,
                                            np.sqrt(num_deictic_patches)))] = 50
            print(
                "Current/next state. 50 denotes the upper left corner of the deictic patch."
            )
            print(str(toDisplay))

#        env.render()

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
            if t > 20000:
                print("q-values:")
                print(str(qvalues))
                print("*** Episode over! ***\n\n")

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size,
                                                  beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Convert batch to deictic format
            obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(
                obses_t, actions, obses_tp1, weights, deicticShape[0])
            group_matching = getDeicticGroups(obses_t_deic, max_num_groups)

            stateCurr = convertState(obses_t_deic)
            stateNext = convertState(obses_tp1_deic)
            q_tp1 = tabularQ[stateNext[0], stateNext[1], stateNext[2],
                             stateNext[3], :]
            #            q_tp1_target_parallel, desc_2_state_parallel, target_min_per_D_parallel, D_2_DI_parallel, targets_parallel = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, group_matching, dones, q_tp1)
            q_tp1_target, desc_2_state, target_min_per_D, D_2_DI, targets = parallelUpdate(
                obses_t_deic, actions_deic, rewards, obses_tp1_deic,
                group_matching, dones, q_tp1, batch_size, num_deictic_patches,
                max_num_groups)
            targets_simple = np.reshape(
                np.tile(np.reshape(q_tp1_target, [batch_size, 1]),
                        [1, num_deictic_patches]),
                batch_size * num_deictic_patches)

            tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],
                     actions_deic] = np.minimum(
                         targets_simple,
                         tabularQ[stateCurr[0], stateCurr[1], stateCurr[2],
                                  stateCurr[3], actions_deic])

#            print("Num unique descriptors in batch: " + str(np.shape(np.unique(group_matching))[0]))

#

#            for i in range(np.shape(obses_t_deic_small)[0]):
#                if i in agent_pos[0]:
#
#                    ax = agent_pos[np.nonzero(agent_pos[0] == i)[0][0]]
#                    ax

#            if prioritized_replay:
#                new_priorities = np.abs(td_errors) + prioritized_replay_eps
#                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:

            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)

        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))))
            print("best patch:\n" +
                  str(np.squeeze(deicticObs[np.argmax(np.max(qvalues, 1))])))
            print("worst patch:\n" +
                  str(np.squeeze(deicticObs[np.argmin(np.max(qvalues, 1))])))


#            if t > learning_starts:
#                print("max td_error: " + str(np.sort(td_error)[-10:]))

    num2avg = 20
    rListAvg = np.convolve(episode_rewards, np.ones(num2avg)) / num2avg
    plt.plot(rListAvg)
    #    plt.plot(episode_rewards)
    plt.show()

# Example 3
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 40000
    buffer_size = 50000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 64
    train_freq = 2

    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low level
    # and a foveated view.
    # input: n x n x 1
    # output: dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        deicticObsThis = np.zeros(
            (windowLen, windowLen, 4)
        )  # channels 0/1: agent/ball in zoomin window; channels 2/3: agent/ball in zoomout window
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):
                    # THE VERSION BELOW USES A FIXED VIEW
                    #                    deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                                 [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                                 [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [[(k in patch[1:3, 1:3]),
                                                    (k in patch[1:3, 3:5]),
                                                    (k in patch[1:3, 5:7])],
                                                   [(k in patch[3:5, 1:3]),
                                                    (k in patch[3:5, 3:5]),
                                                    (k in patch[3:5, 5:7])],
                                                   [(k in patch[5:7, 1:3]),
                                                    (k in patch[5:7, 3:5]),
                                                    (k in patch[5:7, 5:7])]]
# THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
#                    deicticObsThis[:,:,k+1] = [[(k in patch[0:3,0:3]), (k in patch[0:3,3:6]), (k in patch[0:3,6:9])],
#                                 [(k in patch[3:6,0:3]), (k in patch[3:6,3:6]), (k in patch[3:6,6:9])],
#                                 [(k in patch[6:9,0:3]), (k in patch[6:9,3:6]), (k in patch[6:9,6:9])]]
                deicticObs.append(
                    deicticObsThis.copy()
                )  # CAREFUL WITH APPENDING REFERENCES VS APPENDING COPIES!!! THIS WAS A BUG BEFORE I CORRECTED IT...

        return np.array(deicticObs)
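
    # With an 8x8x1 BallCatch observation (as assumed by the 36-patch
    # convertState below) and windowLen = 3, this returns a (36, 3, 3, 4) array:
    # 36 overlapping patches with zoom-in agent/ball channels (0, 1) and
    # zoom-out agent/ball channels (2, 3).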

    # input: batch x nxnx1 tensor of observations
    # output: 8 x batch matrix of deictic observations
    def convertState(observations):

        # Reshape to batch x flatimage x channel.
        # Channel1 = zoomin agent, channel2 = zoomin ball
        # Channel3 = zoomout agent, channel4 = zoomout ball
        obs = np.zeros((36, 9, 4))
        for i in range(4):
            obs[:, :, i] = np.reshape(observations[:, :, :, i], [36, 9])

        # state_numeric: 4 x batch.
        # row0: pos of agent in zoomin, row1: pos of ball in zoomin
        # row2: pos of agent in zoomout, row3: pos of ball in zoomout
        shape = np.shape(obs)
        state_numeric = 9 * np.ones(
            (4, shape[0])
        )  # 9 indicates agent/ball does not appear at this zoom in this glance
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2] == i)[0]
            state_numeric[i, pos[0][idx]] = pos[1][idx]
#            state_numeric[i,pos[0][pos[2] == i]] = pos[1][pos[2] == i]

        return np.int32(state_numeric)

    dimSize = deicticShape[0] * deicticShape[1] + 1
    #    tabularQ = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ1 = 100 * np.ones(
        [dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ2 = 100 * np.ones(
        [dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ3 = 100 * np.ones(
        [dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ4 = 100 * np.ones(
        [dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ5 = 100 * np.ones(
        [dimSize, dimSize, dimSize, dimSize, num_actions])

    obs = env.reset()
    #    OHEnc = np.identity(max_num_groups)

    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)

        #        # do a couple of spot checks to verify that obsDeictic is correct
        #        num2check = 17
        #        print(str(obsDeictic[num2check,:,:,0] + obsDeictic[num2check,:,:,1]))
        #        print(str(obsDeictic[num2check,:,:,2] + obsDeictic[num2check,:,:,3]))

        qCurr = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2],
                          stateCurr[3], :]
        #        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]

        # select action
        # add a small amount of per-element noise to break ties randomly
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

#        env.render()
#        print("action: " + str(action))

        # take action
        new_obs, rew, done, _ = env.step(action)

        #        if done == 1:
        #            print("action: " + str(action) + ", patch: " + str(selPatch) + ", reward: " + str(rew))
        #            action

        # debug display of current/next state (with the current loop range this
        # never triggers, since t < max_timesteps; extend the loop to enable it)
        if t > max_timesteps * 1.05:
            print("obs:\n" + str(np.squeeze(obs)))
            print("qCurr:\n" + str(qCurr))
            print("action: " + str(action) + ", patch: " + str(selPatch))
            print("close:\n" + str(obsDeictic[selPatch, :, :, 0] +
                                   obsDeictic[selPatch, :, :, 1]))
            print("far:\n" + str(obsDeictic[selPatch, :, :, 2] +
                                 obsDeictic[selPatch, :, :, 3]))

        # get next q-values
        stateNext = convertState(getDeicticObs(new_obs))
        qNext5 = tabularQ5[stateNext[0], stateNext[1], stateNext[2],
                           stateNext[3], :]
        #        qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:]

        # perform learning update
        #        qNextmaxa = np.max(qNext5,1)
        qNextmaxa = np.max(qNext5)
        targets = rew + (1 - done) * gamma * qNextmaxa

        #        max_negative_td_error = np.max(np.abs(targets - tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]) * np.int32(targets < tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]))
        #        if max_negative_td_error > 5:
        #            max_negative_td_error
        #        print("max_td_error: " + str(max_negative_td_error))
        #        print("curr tabularQ:\n" + str(tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]))
        #        print("targets:\n" + str(targets))
        #        tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = np.minimum(targets, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action])

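        # Cascaded update (a reading of the code, not stated in the original):
        # each level k+1 accepts the new Bellman target only when it is below
        # level k's current estimate for this (state, action); otherwise it
        # keeps its own value. Successive tables therefore hold progressively
        # more conservative estimates, and tabularQ5 is the one used to act.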
        target2_mask = targets < tabularQ1[stateCurr[0], stateCurr[1],
                                           stateCurr[2], stateCurr[3], action]
        target3_mask = targets < tabularQ2[stateCurr[0], stateCurr[1],
                                           stateCurr[2], stateCurr[3], action]
        target4_mask = targets < tabularQ3[stateCurr[0], stateCurr[1],
                                           stateCurr[2], stateCurr[3], action]
        target5_mask = targets < tabularQ4[stateCurr[0], stateCurr[1],
                                           stateCurr[2], stateCurr[3], action]
        targets1 = targets
        targets2 = target2_mask * targets + (1 - target2_mask) * tabularQ2[
            stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        targets3 = target3_mask * targets + (1 - target3_mask) * tabularQ3[
            stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        targets4 = target4_mask * targets + (1 - target4_mask) * tabularQ4[
            stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        targets5 = target5_mask * targets + (1 - target5_mask) * tabularQ5[
            stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]


        tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets1
        tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets2
        tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets3
        tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets4
        tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets5

        #        # Store transition in the replay buffer.
        #        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            #            print("************************* Episode done! **************************")
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))))

        obs = new_obs

        # end-of-training debug display (with the current loop range this never
        # triggers; extend the loop past max_timesteps to enable it)
        if t > max_timesteps * 1.1:
            #            np.set_printoptions(precision=1)
            #            np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
            np.set_printoptions(formatter={'float_kind': lambda x: "%.1f" % x})

            #            qCurr1 = tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            #            qCurr2 = tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            #            qCurr3 = tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            #            qCurr4 = tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            #            qCurr5 = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            #            todisplay = np.c_[np.max(qCurr1,1), np.max(qCurr2,1), np.max(qCurr3,1), np.max(qCurr4,1), np.max(qCurr5,1), obsDeicticReshape]
            #            todisplay = np.c_[qCurr5,np.transpose(stateCurr)]

            print("obs:\n" + str(np.squeeze(obs)))

            #            todisplay = np.c_[np.max(qCurr5,1),np.transpose(stateCurr)]
            #            print("q-values:\n" + str(todisplay))
            #
            #            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
            #            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
            #            print("action: " + str(action) + ", patch: " + str(selPatch))


#                print("obs:\n" + str(np.squeeze(obs)))
#                print("patch:\n" + str(np.reshape(obsDeictic[selPatch],(3,3))))
#                print("action: " + str(action) + ", patch: " + str(selPatch))
#                t

# Example 4
def learn(env,
          q_func,
          lr=1e-2,
          max_timesteps=1000000,
          buffer_size=50000,
          exploration_fraction=1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of training. If you do not wish to restore the best version at
        the end of training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial
        value to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
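
    Examples
    -------
    A minimal usage sketch (assuming a baselines-style model builder such as
    `models.mlp` and a Gym environment, as in the examples above; adapt the
    names to your setup):

    >>> env = gym.make("CartPole-v0")
    >>> model = models.mlp([64])
    >>> act = learn(env, q_func=model, lr=1e-3, max_timesteps=100000,
    ...             exploration_fraction=0.1, exploration_final_eps=0.02,
    ...             print_freq=10)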
    """
    # Create all the functions necessary to train the model

    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    #exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
    #                            initial_p=0.7,
    #                            final_p=0.15)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False
        if tf.train.latest_checkpoint(td) is not None:
            load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
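                # e.g. with exploration.value(t) = 0.1 and env.action_space.n = 2
                # this gives -log(1 - 0.1 + 0.1 / 2) = -log(0.95), roughly 0.051.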
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                #logger.record_tabular("replay buffer size",  replay_buffer.__len__())
                logger.dump_tabular()

            #if done and num_episodes % 100 == 1:
            #    filehandler = open("cartpole_MDP_replay_buffer.obj","wb")
            #    pickle.dump(replay_buffer,filehandler)
            #    filehandler.close()
            #    print('MDP model samples saved',replay_buffer.__len__())

            #    file = open("cartpole_MDP_replay_buffer.obj",'rb')
            #    reloaded_replay_buffer = pickle.load(file)
            #    file.close()
            #    print('MDP model samples loaded',reloaded_replay_buffer.__len__())

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            load_state(model_file)

    #file = open("cartpole_MDP_replay_buffer.obj",'rb')
    #reloaded_replay_buffer = pickle.load(file)
    #file.close()
    #reloaded_replay_buffer.__len__()
    with open("cartpole_MDP_replay_buffer.obj", "wb") as filehandler:
        pickle.dump(replay_buffer, filehandler)
    print('MDP model samples saved', len(replay_buffer))

    with open("cartpole_MDP_replay_buffer.obj", 'rb') as file:
        reloaded_replay_buffer = pickle.load(file)
    print('MDP model samples loaded', len(reloaded_replay_buffer))
    return act

# Example 5
def main():

    env = envstandalone.MultiGhostEvade()
#    env = envstandalone.GhostEvade()
#    env = envstandalone.BallCatch()
    
    max_timesteps=40000
    learning_starts=1000
    buffer_size=50000
#    exploration_fraction=0.2
    exploration_fraction=0.4
    exploration_final_eps=0.02
    print_freq=10
    gamma=.98
#    target_network_update_freq=500
#    target_network_update_freq=100
#    target_network_update_freq=10
    target_network_update_freq=1
    learning_alpha = 0.2
    
    batch_size=32
    train_freq=1

    obsShape = (8,8,1)
#    obsShape = (8,8,2)
#    deicticShape = (3,3,2)
#    deicticShape = (3,3,4)
#    deicticShape = (4,4,2)
#    deicticShape = (4,4,4)
    deicticShape = (8,8,2)
#    num_deictic_patches = 36
#    num_deictic_patches = 25
    num_deictic_patches = 1

#    num_actions = 4
#    num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu=16
    num_cascade = 5
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)


    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
#    model = models.cnn_to_mlp_2pathways(
#        convs=[(16,3,1)],
        convs=[(32,3,1)],
#        convs=[(32,4,1)],
#        convs=[(16,4,1)],
        hiddens=[16],
        dueling=True
    )
    
    # MLP version
#    model = models.mlp([8, 16])
#    model = models.mlp([16, 16])
#    model = models.mlp([16, 32])
#    model = models.mlp([16, 16])
#    model = models.mlp([32, 32])

    q_func=model
    lr=0.001
    
    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)
    
#    def make_obsDeic_ph(name):
#        return U.BatchInput(deicticShape, name=name)
        
    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)
#        return U.BatchInput([num_cascade,num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq_DQN(
            make_obs_ph=make_obs_ph,
            q_func=q_func,
            num_actions=num_actions
            )
        
    targetTrain = build_targetTrain_DQN(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr)
    )

    get_2channelobs = build_get_2channelobs(make_obs_ph=make_obs_ph)
    
#    getq = build_getq(
#            make_obsDeic_ph=make_obsDeic_ph,
#            q_func=q_func,
#            num_actions=num_actions,
#            num_cascade=num_cascade,
#            scope="deepq",
#            qscope="q_func"
#            )
#    
#    getqTarget = build_getq(
#            make_obsDeic_ph=make_obsDeic_ph,
#            q_func=q_func,
#            num_actions=num_actions,
#            num_cascade=num_cascade,
#            scope="deepq",
#            qscope="q_func_target"
#            )
#
#    update_target = build_update_target(scope="deepq", 
#                                        qscope="q_func",
#                                        qscopeTarget="q_func_target")
#                      
#    targetTrain = build_targetTrain(
#        make_obsDeic_ph=make_obsDeic_ph,
#        make_target_ph=make_target_ph,
#        q_func=q_func,
#        num_actions=env.action_space.n,
#        num_cascade=num_cascade,
#        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
#        scope="deepq", 
#        qscope="q_func"
#    )
#    
#    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
##    getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
    
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):
        
#        obs2channel = get_2channelobs([obs])
        
        # CNN version
        qCurr = getq(np.array([obs]))
#        qCurr = getq(np.array(obs2channel))
        
#        # MLP version
#        qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise,1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            actions = np.int32(np.reshape(actions,[batch_size,]))
            
#            # Put observations in deictic form
#            obses_t_deic = getDeic(obses_t)
#            obses_tp1_deic = getDeic(obses_tp1)
#            obses_t_deic = getDeic(obses_t)[:,:,:,0:2]
#            obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2]
#            
#            # Reshape everything to (1152,) form
#            donesTiled = np.repeat(dones,num_deictic_patches)
#            rewardsTiled = np.repeat(rewards,num_deictic_patches)
#            actionsTiled = np.repeat(actions,num_deictic_patches)
            
            # Get curr, next values: CNN version
#            qNextTarget = getqTarget(obses_tp1_deic)
#            qNext = getq(obses_tp1_deic)
#            qCurr = getq(obses_t_deic)
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

#            # Get curr, next values: MLP version
#            qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
#            qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext,1) # standard
#            actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q
#            qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]
            
#            # This version takes the max over all glimpses
#            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
#            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
#            targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax
            targets = rewards + (1-dones) * gamma * qNextmax

#            # Take min over targets in same group
#            obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
#            unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
#            for i in range(np.shape(uniqueCounts)[0]):
#                targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])
            
            
#            qCurrTargets = np.copy(qCurr)
#            qCurrTargets[:,np.int32(actions)] = targets
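            # Build per-action targets: the Bellman target replaces the Q-value
            # only for the action actually taken in each transition; the other
            # actions keep their current estimates.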
            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:,i] = myActions * targets + (1 - myActions) * qCurr[:,i]
            
#            # Copy into cascade with pruning.
#            qCurrTargets[range(batch_size*num_deictic_patches),0,actionsTiled] = targets
#            for i in range(num_cascade-1):
#                mask = targets < qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled]
#                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
#                    mask*targets + \
#                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]
            
            # CNN version
            td_error_out = targetTrain(
                    obses_t,
#                    obses_t_deic,
                    qCurrTargets
                    )
            
#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
#                    qCurrTargets
#                    )
                
#        # Update target network periodically.
#        if t > learning_starts and t % target_network_update_freq == 0:
#            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,4)
    #    num_deictic_patches=36

    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #        convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        #        convs=[(32,3,1)],
        hiddens=[16],
        #        hiddens=[64],
        #        dueling=True
        dueling=False)

    q_func = model
    #    lr=1e-3
    lr = 0.001

    def make_obs_ph(name):
        #        return U.BatchInput(deicticShape, name=name)
        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    #    update_target()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current q-values: neural network version
        qCurr = getq(np.array([obs]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        #        # debug
        #        if t > 5000:
        #            print("obs:\n" + str(np.squeeze(obs)))
        #            print("qCurr:\n" + str(qCurr))
        #            print("action: " + str(action) + ", patch: " + str(selPatch))
        #            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
        #            action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            actions = np.int32(np.reshape(actions, [
                batch_size,
            ]))

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # Get targets
            qNextmax = np.max(qNext, 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (
                    1 - myActions) * qCurr[:, i]

            # Update values: neural network version
            td_error_out, obses_out, targets_out = targetTrain(
                obses_t, qCurrTargets)

            td_error_pre = qCurr[range(batch_size), actions] - targets

            #            print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # neural network version
            qCurr = getq(obses_t)

            td_error_post = qCurr[range(batch_size), actions] - targets


#            print("td error post-update: " + str(np.linalg.norm(td_error_post)))

# bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys
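    # Illustrative sketch (comments only, not executed): np.packbits turns each boolean
    # row into bytes, and the base-256 sum above combines those bytes into one integer key.
    # For example, a 9-element row [1,0,0,0,0,0,0,0,1] packs to bytes [128, 128], giving
    # key = 128*(256**0) + 128*(256**1) = 32896. Rows longer than 64 bits would overflow
    # the uint64 accumulator, which is the hash-collision risk warned about above.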

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        #        return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys])
        return np.array([
            q_func[x] if x in q_func else 10 * np.ones(num_states)
            for x in keys
        ])

    def trainTabular(vectorKey, qCurrTargets, weights):
        keys = getTabularKeys(vectorKey)
        alpha = 0.2
        for i in range(len(keys)):
            if keys[i] in q_func:
                #                q_func[keys[i]] = (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
                q_func[keys[i]] = q_func[keys[i]] + alpha * weights[i, :] * (
                    qCurrTargets[i] - q_func[keys[i]]
                )  # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func[keys[i]] = qCurrTargets[i]
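    # Worked example (assuming a weight of 1): with alpha = 0.2, a stored value of 10 and a
    # target of 0, one call moves the entry to 10 + 0.2*(0 - 10) = 8.0; unseen keys are
    # simply initialized to the target.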

    # Standard DQN parameters
    max_timesteps = 40000
    learning_starts = 1000
    #    learning_starts=10
    #    buffer_size=50000
    buffer_size = 10000
    #    buffer_size=1000
    #    buffer_size=320
    #    buffer_size=32
    #    buffer_size=8
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    target_network_update_freq = 1
    batch_size = 32
    #    batch_size=1
    train_freq = 1
    #    train_freq=2
    num_cpu = 16
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay = True
    #    prioritized_replay=False
    #    prioritized_replay_alpha=1.0
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    #    prioritized_replay_beta_iters=None
    prioritized_replay_beta_iters = 20000
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Deictic state/action parameters
    # IMPORTANT: first two elts of deicticShape must be odd
    deicticShape = (3, 3, 2)
    deicticActionShape = (3, 3, 4)
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches
    num_actions_discrete = 2

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0,env.num_blocks)
        stateDeictic = obs[1]  # holding

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        actionsPickDescriptors = np.concatenate(
            [moveDescriptors,
             np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.concatenate(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors]
        actionDescriptors = np.reshape(actionDescriptors, [
            -1, deicticActionShape[0] * deicticActionShape[1] *
            deicticActionShape[2]
        ]) == 1
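        # Layout of actionDescriptors: the first <num_patches> rows are pick actions
        # (patch descriptor in the first channel block, zeros in the second) and the next
        # <num_patches> rows are place actions (zeros, then the patch descriptor); each row
        # is flattened to a boolean vector so it can serve as a tabular key.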

        # Get q-values
        qCurr = getTabular(actionDescriptors)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, stateDeictic])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew,
                          new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actions, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)


                # experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                # (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                #                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
            actionsPickDescriptorsNext1 = np.concatenate(
                [moveDescriptorsNext1,
                 np.zeros(np.shape(moveDescriptorsNext1))], axis=3)
            actionsPlaceDescriptorsNext1 = np.concatenate(
                [np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1],
                axis=3)
            actionDescriptorsNext1 = np.stack(
                [actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1],
                axis=0)
            actionDescriptorsNextFlat1 = np.reshape(
                actionDescriptorsNext1,
                [batch_size * num_patches * num_actions_discrete, -1]) == 1

            qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            qNext1 = np.reshape(
                qNextFlat1,
                [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax1 = np.max(
                np.max(qNext1[range(batch_size), :, :, states_tp1], 2), 1)
            targets1 = rewards + (1 - dones) * gamma * qNextmax1

            qCurrTarget1 = getTabular(actions)
            td_errors = qCurrTarget1[range(batch_size), states_t] - targets1
            qCurrTarget1[range(batch_size), states_t] = targets1
            #            trainTabular(actions, qCurrTarget1)
            trainTabular(actions, qCurrTarget1,
                         np.transpose(np.tile(weights, [num_states, 1])))

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", beta: " +
                  str(beta) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    buffer_size = 50000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 64
    train_freq = 2

    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)
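    # For example, with the 3x3 window used here, an 8x8 observation would yield
    # (8-3+1)**2 = 36 overlapping patches, consistent with num_deictic_patches = 36 above.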

    # input: batch x nxnx1 tensor of observations
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)
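    # Each patch is summarized by the (row, col) of the agent marker (value 10) and the
    # ghost marker (value 20) within the window; the default value of 3 (one past a 3x3
    # window) appears to encode "marker not present in this patch", which is why the
    # tabular arrays below are sized deicticShape[0]+1 along each coordinate.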

    tabularQ1 = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])
    tabularQ2 = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])
    tabularQ3 = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])
    tabularQ4 = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])
    tabularQ5 = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])

    obs = env.reset()
    #    OHEnc = np.identity(max_num_groups)

    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)
        #        qCurr = tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
        qCurr = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2],
                          stateCurr[3], :]

        # select action
        action = np.argmax(np.max(qCurr, 0))
        selPatch = np.argmax(np.max(qCurr, 1))
        if np.random.rand() < exploration.value(t):
            #            print("Random action!")
            action = np.random.randint(env.action_space.n)

        # if t > max_timesteps * 0.75:
        #     print("obs:\n" + str(np.squeeze(obs)))
        #     print("patch:\n" + str(np.reshape(obsDeictic[selPatch],(3,3))))
        #     print("action: " + str(action) + ", patch: " + str(selPatch))

        # take action
        new_obs, rew, done, _ = env.step(action)

        # get next q-values
        stateNext = convertState(getDeicticObs(new_obs))
        qNext5 = tabularQ5[stateNext[0], stateNext[1], stateNext[2],
                           stateNext[3], :]

        # same-patch next state (this seems to be better)
        qNextmaxa = np.max(qNext5, 1)

        #        # any-patch next state (this seems to be worse)
        #        qNextmaxa = np.repeat(np.max(qNext5),num_deictic_patches)

        targets = rew + (1 - done) * gamma * qNextmaxa

        #        max_negative_td_error = np.max(np.abs(targets - tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]) * np.int32(targets < tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]))
        #        if max_negative_td_error > 5:
        #            max_negative_td_error
        #        print("max_td_error: " + str(max_negative_td_error))
        #        tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = np.minimum(targets, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action])

        target2_mask = targets < tabularQ1[stateCurr[0], stateCurr[1],
                                           stateCurr[2], stateCurr[3], action]
        target3_mask = targets < tabularQ2[stateCurr[0], stateCurr[1],
                                           stateCurr[2], stateCurr[3], action]
        target4_mask = targets < tabularQ3[stateCurr[0], stateCurr[1],
                                           stateCurr[2], stateCurr[3], action]
        target5_mask = targets < tabularQ4[stateCurr[0], stateCurr[1],
                                           stateCurr[2], stateCurr[3], action]
        targets1 = targets
        targets2 = target2_mask * targets + (1 - target2_mask) * tabularQ2[
            stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        targets3 = target3_mask * targets + (1 - target3_mask) * tabularQ3[
            stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        targets4 = target4_mask * targets + (1 - target4_mask) * tabularQ4[
            stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        targets5 = target5_mask * targets + (1 - target5_mask) * tabularQ5[
            stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]


        tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets1
        tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets2
        tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets3
        tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets4
        tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets5

        #        # Store transition in the replay buffer.
        #        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            #            print("************************* Episode done! **************************")
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) +
                  ", max q at curr state: " + str(np.max(qCurr)))


            # stop at the end of training
            # if t > max_timesteps * 0.75:
            #     np.set_printoptions(precision=1)
            #
            #     obsDeicticReshape = np.reshape(obsDeictic,[36,9])
            #     qCurr1 = tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            #     qCurr2 = tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            #     qCurr3 = tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            #     qCurr4 = tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            #     qCurr5 = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            #     todisplay = np.c_[np.max(qCurr1,1), np.max(qCurr2,1), np.max(qCurr3,1), np.max(qCurr4,1), np.max(qCurr5,1), obsDeicticReshape]
            #     print("q-values:\n" + str(todisplay))
            #
            #     print("obs:\n" + str(np.squeeze(obs)))
            #     print("patch:\n" + str(np.reshape(obsDeictic[selPatch],(3,3))))
            #     print("action: " + str(action) + ", patch: " + str(selPatch))
            #     t

        # *************************************
        # to do: set a breakpoint when there is a decrease in value and study that situation...
        # I noticed the deictic representations are weird when 10 and 20 are vertically separated by one empty row...
        # env.step came back with rew=1 and done=True; that shouldn't happen!
        # *************************************

        obs = new_obs

    t  # no-op; leftover debugging anchor
def main():
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})
    
    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_tabular = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey,1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i])
        return obsKeys
    
    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
#        return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys])
        return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys])
    
    def trainTabular(vectorKey,qCurrTargets,weights):
        keys = getTabularKeys(vectorKey)
        alpha=0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
#                q_func[keys[i]] = (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]


    # Standard DQN parameters
#    max_timesteps=20000
    max_timesteps=30000
#    max_timesteps=2000
#    learning_starts=1000
    learning_starts=10
#    buffer_size=50000
#    buffer_size=10000
#    buffer_size=1000
#    buffer_size=320
#    buffer_size=32
#    buffer_size=8
    buffer_size=1
#    exploration_fraction=0.2
    exploration_fraction=0.3
#    exploration_final_eps=0.02
    exploration_final_eps=0.1
    print_freq=1
#    gamma=.98
    gamma=.9
    target_network_update_freq=1
#    batch_size=32
    batch_size=1
    train_freq=1
#    train_freq=2
    num_cpu = 16
#    lr=0.001
    lr=0.0003
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

#    prioritized_replay=True
    prioritized_replay=False
#    prioritized_replay_alpha=1.0
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
#    prioritized_replay_beta_iters=20000
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1
    
    # Deictic state/action parameters
    deicticShape = (3,3,2) # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3,3,2)
    num_cascade = 5
#    num_states = env.num_blocks + 1 # one more state than blocks to account for not holding anything
    num_states = 2 # either holding or not
    num_patches = env.maxSide**2
    num_actions = 2*num_patches
    num_actions_discrete = 2
#    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
#    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected
    
    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
#    q_func = models.cnn_to_mlp_2pathways(
#        convs=[(16,3,1), (32,3,1)],
#        hiddens=[48],
        convs=[(32,3,1)],
        hiddens=[48],
#        convs=[(48,3,1)],
#        hiddens=[48],
        dueling=True
    )

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
#        return U.BatchInput([num_actions], name=name)
#        return U.BatchInput([num_cascade,num_states], name=name)
#        return U.BatchInput([num_states], name=name)
        return U.BatchInput([1], name=name)
#        return U.BatchInput(1, name=name)

    def make_weight_ph(name):
#        return U.BatchInput([num_states], name=name)
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=deicticShape)

    if valueFunctionType == 'DQN':
        getqNotHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=num_cascade,
                scope="deepq",
                qscope="q_func_notholding"
                )
        getqHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=num_cascade,
                scope="deepq",
                qscope="q_func_holding"
                )
    
        targetTrainNotHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=num_cascade,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_notholding",
            grad_norm_clipping=1.
        )

        targetTrainHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=num_cascade,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_holding",
            grad_norm_clipping=1.
        )

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: 1 if holding a block, 0 otherwise
        stateDeictic = np.int32(obs[1]>0) # holding

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
        moveDescriptors = np.int32(moveDescriptorsRaw>0)
        moveDescriptors = moveDescriptors*2-1

        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]

        if valueFunctionType == "TABULAR":
            actionDescriptorsFlat = np.reshape(actionDescriptors,[-1,deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1
            qCurr = getTabular(actionDescriptorsFlat)
        else:
            qCurrNotHolding = getqNotHolding(actionDescriptors)
            qCurrHolding = getqHolding(actionDescriptors)
            qCurr = np.concatenate([qCurrNotHolding,qCurrHolding],axis=1)
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly

        # select action at random
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:,stateDeictic])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
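        # With RANDOM_UNIQUE (below), duplicate action descriptors are collapsed via
        # np.unique so that both the greedy pick and the epsilon-random pick are made over
        # unique descriptors; a concrete action with the chosen descriptor is then sampled
        # uniformly from among the duplicates.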
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx,stateDeictic])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv==actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error...")

        # display state at the end
        if t > max_timesteps-200:
            print(str(obs[0][:,:,0]))
            print(str(obs[1]))
            print("action: " + str(action))

        # take action
        new_obs, rew, done, _ = env.step(action)
        
        # display state at the end
        if (t > max_timesteps-200) and done:
            print("done *********************** done")
            
        replay_buffer.add(stateDeictic, actionDescriptors[action,:], rew, new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta=beta_schedule.value(t)
                states_t, actions, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            states_tp1 = np.int32(states_tp1>0)
            
            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext1 = np.int32(moveDescriptorsNext1>0)
            moveDescriptorsNext1 = moveDescriptorsNext1*2-1

            actionsPickDescriptorsNext1 = np.stack([moveDescriptorsNext1, np.zeros(np.shape(moveDescriptorsNext1))],axis=3)
            actionsPlaceDescriptorsNext1 = np.stack([np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1],axis=3)
            actionDescriptorsNext1 = np.stack([actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1], axis=0)
            actionDescriptorsNext1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,deicticActionShape[0],deicticActionShape[1],deicticActionShape[2]])
            
            if valueFunctionType == "TABULAR":
                actionDescriptorsNextFlat1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,-1]) == 1
                qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            else:
                qNextNotHolding = getqNotHolding(actionDescriptorsNext1)
                qNextHolding = getqHolding(actionDescriptorsNext1)
                qNextFlat1 = np.concatenate([qNextNotHolding,qNextHolding],axis=1)
            
            qNext1 = np.reshape(qNextFlat1,[batch_size,num_patches,num_actions_discrete,num_states])
            qNextmax1 = np.max(np.max(qNext1[range(batch_size),:,:,states_tp1],2),1)
            targets1 = rewards + (1-dones) * gamma * qNextmax1

            if valueFunctionType == "TABULAR":
                actionsFlat = np.reshape(actions,[batch_size,-1]) == 1
                qCurrTarget1 = getTabular(actionsFlat)
            else:
                qCurrTargetNotHolding = getqNotHolding(actions)
                qCurrTargetHolding = getqHolding(actions)
                qCurrTarget1 = np.concatenate([qCurrTargetNotHolding,qCurrTargetHolding],axis=1)
#                qCurrTarget1 = getq(actions)

            td_errors = qCurrTarget1[range(batch_size),states_t] - targets1
            qCurrTarget1[range(batch_size),states_t] = targets1

            if valueFunctionType == "TABULAR":
                trainTabular(actionsFlat, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (TABULAR)
            else:
#                targetTrain(actions, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (DQN)
                targetTrainNotHolding(actions, np.reshape(qCurrTarget1[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1]))
                targetTrainHolding(actions, np.reshape(qCurrTarget1[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1]))
#                targetTrainNotHolding(actions, qCurrTarget1[:,0], np.transpose(np.tile(weights,[num_states,1]))) # (DQN)
#                targetTrainHolding(actions, qCurrTarget1[:,1], np.transpose(np.tile(weights,[num_states,1]))) # (DQN)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", beta: " + str(beta) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
        
    # display value function
    obs = env.reset()
    moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
    moveDescriptors = np.int32(moveDescriptorsRaw>0)
    moveDescriptors = moveDescriptors*2-1

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
#    qPick = getq(actionsPickDescriptors)
    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    qPickHolding = getqHolding(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)
    
#    qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[8,8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[8,8])))

#    qPlace = getq(actionsPlaceDescriptors)
    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)    
#    qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[8,8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[8,8])))
def main():

    # ********* Commonly used options. *************
    buffer_size = 1000
    batch_size = 32
    #    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
    prioritized_replay = True
    #    prioritized_replay=False
    # ********* *********************** *************

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    env = gym.make("FrozenLake-v0")
    #    env = gym.make("FrozenLake8x8-v0")
    obs_space = np.int32(
        [np.sqrt(env.observation_space.n),
         np.sqrt(env.observation_space.n)])

    # Dictionary-based value function
    q_func_tabular = {}
    defaultQValue = np.ones(env.action_space.n)

    # Given an integer, return the corresponding boolean array
    def getBoolBits(state):
        return np.unpackbits(np.uint8(state), axis=1) == 1
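    # Worked example (not executed): state 5 -> np.uint8 gives 0b00000101, and
    # np.unpackbits (big-endian bit order) yields [0,0,0,0,0,1,0,1], which the == 1
    # comparison turns into the boolean key row.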

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([
            q_func_tabular[x] if x in q_func_tabular else defaultQValue
            for x in keys
        ])

#    def trainTabular(vectorKey,qCurrTargets,weights):

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 0.1
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                q_func_tabular[keys[i]] = (1 - alpha) * q_func_tabular[
                    keys[i]] + alpha * qCurrTargets[i]
#                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]

#    max_timesteps=100000

    max_timesteps = 30000
    exploration_fraction = 0.3
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Set up replay buffer
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    q_func = models.cnn_to_mlp(convs=[(16, 4, 1)], hiddens=[32], dueling=True)

    def make_obs_ph(name):
        return U.BatchInput(obs_space, name=name)

    def make_target_ph(name):
        return U.BatchInput([env.action_space.n], name=name)

    def make_weight_ph(name):
        return U.BatchInput([env.action_space.n], name=name)

    if valueFunctionType == 'DQN':
        getq = build_getq(make_obs_ph=make_obs_ph,
                          q_func=q_func,
                          num_actions=env.action_space.n,
                          scope="deepq")

        targetTrain = build_targetTrain(
            make_obs_ph=make_obs_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func",
            grad_norm_clipping=1.)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    state = env.reset()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    for t in range(max_timesteps):

        if valueFunctionType == "TABULAR":
            qCurr = getTabular(getBoolBits([[state]]))
        else:
            qCurr = getq(
                np.reshape(
                    np.eye(env.observation_space.n)[state, :],
                    [1, obs_space[0], obs_space[1]]))

        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly

        # select action at random
        action = np.argmax(qCurrNoise)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        nextState, rew, done, _ = env.step(action)

        #        replay_buffer.add(state, action, rew, nextState, float(done))
        replay_buffer.add(np.copy(state), np.copy(action), np.copy(rew),
                          np.copy(nextState), np.copy(float(done)))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actions, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actions, rewards, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            if valueFunctionType == "TABULAR":
                qNext = getTabular(
                    getBoolBits(np.reshape(states_tp1, [batch_size, 1])))
            else:
                qNext = getq(
                    np.reshape(
                        np.eye(env.observation_space.n)[states_tp1, :],
                        [batch_size, obs_space[0], obs_space[1]]))

            qNextmax = np.max(qNext, axis=1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            if valueFunctionType == "TABULAR":
                qCurrTarget = getTabular(
                    getBoolBits(np.reshape(states_t, [batch_size, 1])))
            else:
                qCurrTarget = getq(
                    np.reshape(
                        np.eye(env.observation_space.n)[states_t, :],
                        [batch_size, obs_space[0], obs_space[1]]))

            td_error = qCurrTarget[range(batch_size), actions] - targets
            qCurrTarget[range(batch_size), actions] = targets

            if valueFunctionType == "TABULAR":
                trainTabular(
                    getBoolBits(np.reshape(states_t, [batch_size, 1])),
                    qCurrTarget)
            else:
                targetTrain(
                    np.reshape(
                        np.eye(env.observation_space.n)[states_t, :],
                        [batch_size, obs_space[0], obs_space[1]]), qCurrTarget,
                    np.tile(np.reshape(weights, [batch_size, 1]),
                            env.action_space.n))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)


        # qNext = getTabular(getBoolBits(nextState))
        #
        # # Calculate TD target
        # qNextmax = np.max(qNext)
        # target = rew + (1-done) * gamma * qNextmax
        #
        # # Update value function
        # qCurrTarget = qCurr
        # qCurrTarget[0][action] = target
        # trainTabular(getBoolBits(state),qCurrTarget)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            nextState = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        state = np.copy(nextState)
class PDQFDLearner(ActorLearner):
    def __init__(self, network_creator, environment_creator, args,
                 sim_coordinator):
        super(PDQFDLearner, self).__init__(network_creator,
                                           environment_creator, args)
        self.sim_coordinator = sim_coordinator
        self.evaluate = args.evaluate
        self.eva_env = None
        self.game = args.game
        self.double_q = args.double_q
        self.continuous_target_update = args.continuous_target_update
        self.stochastic = args.stochastic
        #self.exp_epsilon = LinearSchedule(args.max_global_steps,
        #                           initial_p=args.exp_epsilon,
        #                           final_p=0.0)
        #self.exp_epsilon = PiecewiseSchedule([(0, args.exp_epsilon), (round(args.max_global_steps/3), 0.3), (round(2*args.max_global_steps/3), 0.01)], outside_value=0.001)
        exp_eps_spec = eval(args.exp_eps_segments)  # evaluate the segment spec once
        self.exp_epsilon = PiecewiseSchedule(exp_eps_spec[0],
                                             outside_value=exp_eps_spec[1])
        self.initial_random_steps = args.initial_random_steps
        self.n_emulators = self.n_emulator_runners * self.n_emulators_per_emulator_runner

        # Replay buffer
        self.use_exp_replay = args.use_exp_replay
        self.n_trajectories = round(1.0 * int(args.batch_size) / self.n_steps)
        self.replay_buffer_size = args.replay_buffer_size
        if self.use_exp_replay:
            # Create replay buffer
            self.prioritized = args.prioritized
            if self.prioritized:
                self.prioritized_alpha = args.prioritized_alpha
                self.prioritized_beta0 = args.prioritized_beta0
                self.prioritized_eps = args.prioritized_eps
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.replay_buffer_size,
                    self.state_shape,
                    self.prioritized_alpha,
                    self.n_trajectories,
                    self.n_steps,
                    n_emus=self.n_emulators)
                self.beta_schedule = LinearSchedule(
                    self.max_global_steps,
                    initial_p=self.prioritized_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.replay_buffer_size,
                                                  self.state_shape,
                                                  self.n_trajectories,
                                                  self.n_steps,
                                                  n_emus=self.n_emulators)

        # Buffers to keep track of the last n_steps visited states
        masked_state = np.ones((self.n_emulators, 1) +
                               self.state_shape) * MASK_VALUE
        self.states_buffer = deque([masked_state for _ in range(self.n_steps)],
                                   self.n_steps)

        self.summaries_op = tf.summary.merge_all()
        self.counter = 0

    # The input is a tuple where each element is an array of shape (n_trajectories, n_steps) + s_shape
    # The output array has shape (n_steps * n_trajectories, n_steps) + s_shape. That is, for each trajectory,
    # each time step t in the input array is transformed into a sequence of 0 to t steps from the input array
    # and t to n masked steps.
    def prepare_input_batch(self, states):
        def masks(batch_len):
            return np.ones((batch_len, self.n_steps) +
                           self.state_shape) * MASK_VALUE

        s_in_batch = []
        for k in range(len(states)):
            s_in_traj = masks(self.n_steps)
            for i in range(self.n_steps):
                t = np.arange(min(self.n_steps, i + 1))
                s_in_traj[i, self.n_steps - 1 - t] = states[k, [i - t], :]
            s_in_batch.append(s_in_traj)

        return np.vstack(s_in_batch)
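    # Worked example (not executed): with n_steps = 3, a trajectory [s0, s1, s2] is
    # expanded into the three input rows [MASK, MASK, s0], [MASK, s0, s1] and
    # [s0, s1, s2], so the network always sees a fixed-length window ending at the
    # current step.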

    @staticmethod
    def choose_next_actions(network, num_actions, states, session, eps,
                            stochastic):
        network_output_q = session.run(network.output_layer_q,
                                       feed_dict={network.input_ph: states})

        deterministic_actions = np.argmax(network_output_q, axis=1)
        if stochastic:
            batch_size = network_output_q.shape[0]
            random_actions = np.random.randint(low=0,
                                               high=num_actions,
                                               size=batch_size)
            choose_random = np.random.uniform(
                low=0.0, high=1.0, size=batch_size) < eps
            stochastic_actions = np.where(choose_random, random_actions,
                                          deterministic_actions)
            action_indices = stochastic_actions
        else:
            action_indices = deterministic_actions

        return action_indices

    def __choose_next_actions(self):
        eps = self.exp_epsilon.value(self.global_step)
        states = np.concatenate(self.states_buffer, axis=1)
        return PDQFDLearner.choose_next_actions(self.network, self.num_actions,
                                                states, self.session, eps,
                                                self.stochastic)

    @staticmethod
    def get_target_maxq_values(target_network,
                               next_states,
                               session,
                               double_q=True,
                               learning_network=None):
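        # Double-Q estimate: the learning network chooses the argmax action for each next
        # state and the target network evaluates it, which reduces the overestimation bias
        # of taking the max over the target network's own values; with double_q=False we
        # simply take the target network's max.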
        if double_q:
            [target_network_q, learning_network_q] = session.run(
                [
                    target_network.output_layer_q,
                    learning_network.output_layer_q
                ],
                feed_dict={
                    target_network.input_ph: next_states,
                    learning_network.input_ph: next_states
                })
            idx_best_action_from_learning_network = np.argmax(
                learning_network_q, axis=1)
            maxq_values = target_network_q[
                range(target_network_q.shape[0]),
                idx_best_action_from_learning_network]
        else:
            # learning_network is not needed when double_q is False
            target_network_q = session.run(
                target_network.output_layer_q,
                feed_dict={target_network.input_ph: next_states})
            maxq_values = target_network_q.max(axis=-1)

        return maxq_values

    def __get_target_maxq_values(self, next_states):
        return PDQFDLearner.get_target_maxq_values(
            self.target_network,
            next_states,
            self.session,
            double_q=self.double_q,
            learning_network=self.network)

    def update_target(self):
        if self.continuous_target_update:
            self.session.run(self.target_network.continuous_sync_nets)
        elif self.global_step % self.target_update_freq == 0:
            params = self.network.get_params(self.session)
            feed_dict = {}
            for i in range(len(self.target_network.params)):
                feed_dict[self.target_network.params_ph[i]] = params[i]
            self.target_network.set_params(feed_dict, self.session)

    def estimate_returns(self, next_state_maxq, rewards, dones):
        estimated_return = next_state_maxq
        done_masks = 1.0 - dones.astype(np.float32)
        y = np.zeros_like(rewards)
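        # Backward n-step recursion: y_t = r_t + gamma * (1 - done_t) * y_{t+1},
        # bootstrapped at the end of the window with the target network's max-Q.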
        for t in reversed(range(self.n_steps)):
            estimated_return = (rewards[:, t] +
                                self.gamma * estimated_return * done_masks[:, t])
            y[:, t] = estimated_return
        return y

    def train_from_experience(self):
        if self.prioritized:
            experience = self.replay_buffer.sample_nstep(
                self.beta_schedule.value(self.global_step))
        else:
            experience = self.replay_buffer.sample_nstep()

        (s_t, a, r, s_tp1, dones, imp_weights, idxes) = experience
        next_state_maxq = self.__get_target_maxq_values(s_tp1)
        targets = self.estimate_returns(next_state_maxq, r, dones)

        # Run the train step and obtain the TD errors
        a = np.reshape(a, -1)
        targets = np.reshape(targets, -1)
        lr = self.get_lr()
        feed_dict = {
            self.network.input_ph: self.prepare_input_batch(s_t),
            self.network.target_ph: targets,
            self.network.importance_weights_ph: imp_weights,
            self.network.selected_action_ph: a,
            self.learning_rate: lr
        }

        _, td_errors, summaries = self.session.run(
            [self.train_step, self.network.td_error, self.summaries_op],
            feed_dict=feed_dict)

        self.summary_writer.add_summary(summaries, self.global_step)
        self.summary_writer.flush()

        self.counter += 1

        if self.prioritized:
            new_priorities = np.abs(td_errors) + self.prioritized_eps
            self.replay_buffer.update_priorities(idxes, new_priorities)

    def collect_experience(self):
        var = self.shared_variables
        for t in range(self.n_steps):
            # Add the current state to the state buffer (we keep the last n_steps states, which are needed to select actions)
            self.states_buffer.append(
                np.reshape(var["s"],
                           (self.n_emulators, 1) + self.state_shape).copy())

            # Select next action based on sequence of states in buffer and pass on to simulators via shared_variables
            var["a"][:] = self.__choose_next_actions()

            # Start updating all environments with next_actions
            self.sim_coordinator.update_environments()
            self.sim_coordinator.wait_updated()
            # Done updating all environments, have new states, rewards and dones in shared_variables

            r = self.rescale_reward(var["r"], type="none")

            # Statistics
            #self.rewards_per_step.append(var['r'].copy())
            self.acc_reward += var['r']
            self.acc_steps += self.one_step

            for emu in range(self.n_emulators):
                self.replay_buffer.add(self.states_buffer[-1][emu].ravel(),
                                       var["a"][emu], r[emu], var["s"][emu],
                                       var["done"][emu], emu)

            for emu in np.where(var["done"])[0]:
                # Reset states buffer for those emulators whose episode has ended
                for i in range(self.n_steps):
                    if self.states_buffer[i][emu, 0, 0] == MASK_VALUE:
                        continue
                    self.states_buffer[i][emu, :, :] = MASK_VALUE

                # Statistics
                #self.n_episodes += 1

                self.n_dones += 1
                self.rewards_per_episode.append(self.acc_reward[emu])
                self.acc_reward[emu] = 0
                self.episode_length.append(self.acc_steps[emu])
                self.acc_steps[emu] = 0

        self.global_step += self.n_emulators * self.n_steps

        if self.global_step % (100 * self.n_steps) == 0:
            if len(self.rewards_per_episode) == 0:
                logger.debug("{} global steps".format(self.global_step))
                return

            #total_reward = np.sum(np.concatenate(self.rewards_per_step))
            #self.rewards_per_step = []

            n_episodes = max(self.n_emulators, self.n_emulators + self.n_dones)
            #n_episodes = self.n_episodes

            #avg_reward_per_episode = total_reward / n_episodes
            avg_reward_per_episode = np.mean(self.rewards_per_episode)
            self.rewards_per_episode = []

            #avg_episode_length = self.global_step / n_episodes
            avg_episode_length = np.mean(self.episode_length)
            self.episode_length = []

            #self.n_episodes = self.n_emulators
            self.n_dones = 0

            logger.debug("{} global steps, "
                         "Avg. reward/episode: {:.2f}, "
                         "Avg. episode length: {:.2f}, "
                         "Epsilon: {:.2f}".format(
                             self.global_step, avg_reward_per_episode,
                             avg_episode_length,
                             self.exp_epsilon.value(self.global_step)))

            stats_summary = tf.Summary(value=[
                tf.Summary.Value(tag='avg_reward_before_churn',
                                 simple_value=avg_reward_per_episode),
                tf.Summary.Value(tag='avg_episode_length',
                                 simple_value=avg_episode_length),
            ])
            self.summary_writer.add_summary(stats_summary, self.global_step)
            self.summary_writer.flush()

    # TODO: to be fixed
    def evaluate_agent(self, msg):
        if self.evaluate:
            assert False, "Evaluate function needs to be fixed"
            if self.eva_env is None:
                self.eva_env = self.environment_creator.create_environment(-1)
            _succ_epi = evaluate(self.eva_env,
                                 self.session,
                                 self.network.output_layer_q,
                                 self.network.input_ph,
                                 self.n_steps,
                                 self.state_shape,
                                 visualize=False,
                                 v_func=self.network.value)
            logger.debug("{}: {:.2f}%".format(msg, _succ_epi))
            perf_summary = tf.Summary(value=[
                tf.Summary.Value(tag="Performance", simple_value=_succ_epi)
            ])
            self.summary_writer.add_summary(perf_summary, self.global_step)
            self.summary_writer.flush()

    def train(self):
        """
        Main actor learner loop for parallel deep Q learning with demonstrations.
        """
        print("STARTING TRAINING")
        # Initialize networks
        self.global_step = self.init_network()
        self.update_target()
        logger.info("Synchronized learning and target networks")

        logger.debug("Resuming training from emulators at Step {}".format(
            self.global_step))
        #self.n_episodes = self.n_emulators
        self.n_dones = 0
        self.rewards_per_step = []
        self.rewards_per_episode = []
        self.acc_reward = np.zeros((self.n_emulators))
        self.episode_length = []
        self.acc_steps = np.zeros((self.n_emulators))
        self.one_step = np.ones((self.n_emulators))

        self.sim_coordinator.update_environments()
        self.sim_coordinator.wait_updated()
        self.shared_variables = self.sim_coordinator.get_shared_variables()

        logger.debug("Shared variables accessible through simulators.")
        logger.debug("Collecting experience and training.")

        while self.global_step < self.max_global_steps:
            self.collect_experience()

            if self.global_step > self.initial_random_steps:
                self.train_from_experience()

                self.update_target()

                self.save_vars()

        self.evaluate_agent("End - Average reward over 100 episodes")

        self.cleanup()

    def cleanup(self):
        super(PDQFDLearner, self).cleanup()
        if self.n_emulators_per_emulator_runner > 0:
            self.sim_coordinator.stop()
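
# The backward recursion in estimate_returns above is worth seeing in isolation.
# Below is a minimal, standalone NumPy sketch (not part of the learner; the
# numbers are hypothetical) that reproduces the same n-step backup:
import numpy as np

def nstep_returns(next_state_maxq, rewards, dones, gamma):
    """Same backward recursion as PDQFDLearner.estimate_returns."""
    ret = next_state_maxq.astype(np.float32)      # bootstrap with max-Q after the window
    done_masks = 1.0 - dones.astype(np.float32)   # stop value flow across episode ends
    y = np.zeros_like(rewards, dtype=np.float32)
    for t in reversed(range(rewards.shape[1])):
        ret = rewards[:, t] + gamma * ret * done_masks[:, t]
        y[:, t] = ret
    return y

# One emulator, a 3-step window, hypothetical values:
# y_2 = 1 + 0.9*2 = 2.8, y_1 = 0.9*2.8 = 2.52, y_0 = 1 + 0.9*2.52 = 3.268
print(nstep_returns(np.array([2.0]), np.array([[1.0, 0.0, 1.0]]),
                    np.array([[0.0, 0.0, 0.0]]), gamma=0.9))
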
def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    buffer_size = 50000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 64
    train_freq = 2

    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)

    # input: batch x nxnx1 tensor of observations
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)

    tabularQ = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])

    obs = env.reset()

    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)
        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2],
                         stateCurr[3], :]

        # select action
        action = np.argmax(np.max(qCurr, 0))
        selPatch = np.argmax(np.max(qCurr, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        # get next q-values
        stateNext = convertState(getDeicticObs(new_obs))
        qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2],
                         stateNext[3], :]

        # perform learning update
        qNextmaxa = np.max(qNext)
        targets = rew + (1 - done) * gamma * qNextmaxa

        tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets
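        # The update above is the standard tabular rule
        #   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * (1 - done) * max Q(s', a')),
        # applied in parallel to the deictic state of every patch, with the
        # bootstrap taken as the max over all next-step patches and actions.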

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            #            print("************************* Episode done! **************************")
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) +
                  ", max q at curr state: " + str(np.max(qCurr)))

        obs = new_obs
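
# The getDeicticObs helper above just slides a fixed window over the observation.
# A standalone sketch of the same idea on a hypothetical 4x4 single-channel grid
# (assumption: a 3x3 window, as in deicticShape above):
import numpy as np

def sliding_patches(obs, window):
    """Extract every window x window patch, row-major, like getDeicticObs."""
    patches = []
    for i in range(obs.shape[0] - window + 1):
        for j in range(obs.shape[1] - window + 1):
            patches.append(obs[i:i + window, j:j + window, :])
    return np.array(patches)

example_obs = np.arange(16).reshape(4, 4, 1)
print(sliding_patches(example_obs, 3).shape)   # (4, 3, 3, 1): four 3x3 patches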
Example 13
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps):

    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})

    # Create environment and set stride parameters for this problem instance.
    # Most of the time, these two stride parameters will be equal. However,
    # one might use a smaller stride for initial placement and a larger stride
    # for action specification in order to speed things up. Unfortunately, this
    # could cause the problem to be infeasible: no grasp might work for a given
    # initial setup.
    env = envstandalone.PuckArrange()
    env.initStride = initEnvStride # stride for initial puck placement
    env.stride = envStride # stride for action specification
    
    # Standard q-learning parameters
    reuseModels = None
    max_timesteps = inputmaxtimesteps
    exploration_fraction = 0.5
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 60
    buffer_size = 1000
    batch_size = 32
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # Set parameters related to shape of the patch and the number of patches
    descriptorShape = (env.blockSize*3,env.blockSize*3,2)
#    descriptorShapeSmall = (10,10,2)
#    descriptorShapeSmall = (15,15,2)
    descriptorShapeSmall = (20,20,2)
    num_states = 2 # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2*num_patches*env.num_orientations

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
#    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
#                                 initial_p=exploration_final_eps,
#                                 final_p=exploration_final_eps)

    # Set parameters for prioritized replay. You can turn this off simply by
    # setting prioritized_replay below to False.
    prioritized_replay = True
#    prioritized_replay = False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Create neural network
    q_func = models.cnn_to_mlp(
        convs=[(16,3,1)],
        hiddens=[32],
        dueling=True
    )

    # Build tensorflow functions
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)
    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)
    def make_target_ph(name):
        return U.BatchInput([1], name=name)
    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall, stride=env.stride)
    getMoveActionDescriptorsRot = build_getMoveActionDescriptorsRot(
        make_obs_ph=make_obs_ph, actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall, stride=env.stride)
    
    getqNotHolding = build_getq(
            make_actionDeic_ph=make_actionDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="q_func_notholding",
            reuse=reuseModels
            )
    getqHolding = build_getq(
            make_actionDeic_ph=make_actionDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="q_func_holding",
            reuse=reuseModels
            )

    targetTrainNotHolding = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_notholding",
        grad_norm_clipping=1.,
        reuse=reuseModels
    )

    targetTrainHolding = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_holding",
        grad_norm_clipping=1.,
        reuse=reuseModels
    )
    
    # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy.
    lrState = 0.1
    V = np.zeros([2,])
    
    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    # Initialize things
    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    
    # Load neural network model if one was specified.
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)
        fileInV = fileIn + 'V.npy'
        V = np.load(fileInV)

    # Iterate over time steps
    for t in range(max_timesteps):
        
        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
#        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = getMoveActionDescriptorsRot([obs[0]])
        
        moveDescriptors = moveDescriptors*2-1
        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]

        # Get qCurr. Pick and place are evaluated separately to accommodate larger batches.
        qCurrNotHoldingPick = getqNotHolding(actionsPickDescriptors)
        qCurrHoldingPick = getqHolding(actionsPickDescriptors)
        qCurrNotHoldingPlace = getqNotHolding(actionsPlaceDescriptors)
        qCurrHoldingPlace = getqHolding(actionsPlaceDescriptors)
        qCurr = np.concatenate(
            [np.r_[qCurrNotHoldingPick, qCurrNotHoldingPlace],
             np.r_[qCurrHoldingPick, qCurrHoldingPlace]], axis=1)

        # Update tabular state-value function using V(s) = max_a Q(s,a)
        thisStateValues = np.max(qCurr[:,obs[1]])
        V[obs[1]] = (1-lrState) * V[obs[1]] + lrState * thisStateValues

        # Select e-greedy action to execute
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:,obs[1]])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # Execute action
        new_obs, rew, done, _ = env.step(action)        
        replay_buffer.add(cp.copy(obs[1]), np.copy(actionDescriptors[action,:]), cp.copy(rew), cp.copy(new_obs[1]), cp.copy(float(done)))

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                beta=beta_schedule.value(t)
                states_t, actionPatches, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actionPatches, rewards, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Calculate target
            targets = rewards + (1-dones) * gamma * V[states_tp1]
            
            # Get current q-values and calculate td error and q-value targets
            qCurrTargetNotHolding = getqNotHolding(actionPatches)
            qCurrTargetHolding = getqHolding(actionPatches)
            qCurrTarget = np.concatenate([qCurrTargetNotHolding,qCurrTargetHolding],axis=1)
            td_error = qCurrTarget[range(batch_size),states_t] - targets
            qCurrTarget[range(batch_size),states_t] = targets

            # Train
            targetTrainNotHolding(actionPatches, np.reshape(qCurrTarget[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1]))
            targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1]))

            # Update replay priorities using td_error
            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_50ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 50 episode reward: " + str(mean_50ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
#            print("time to do training: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = np.copy(new_obs)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)
        fileOutV = fileOut + 'V'
        print("fileOutV: " + fileOutV)
        np.save(fileOutV,V)

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptorsRot([obs[0]])
    moveDescriptors = moveDescriptors*2-1
#    gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0]))
    gridSize = len(env.moveCenters)
    
    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    qPickHolding = getqHolding(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)
    
    print("Value function for pick action for rot0 in hold-0 state:")
    print(str(np.reshape(qPick[:gridSize**2,0],[gridSize,gridSize])))
    print("Value function for pick action for rot1 in hold-0 state:")
    print(str(np.reshape(qPick[gridSize**2:2*gridSize**2,0],[gridSize,gridSize])))
    print("Value function for pick action for rot2 in hold-0 state:")
    print(str(np.reshape(qPick[2*gridSize**2:3*gridSize**2,0],[gridSize,gridSize])))
    print("Value function for pick action for rot3 in hold-0 state:")
    print(str(np.reshape(qPick[3*gridSize**2:4*gridSize**2,0],[gridSize,gridSize])))
        
#    print("Value function for pick action in hold-nothing state:")
#    print(str(np.reshape(qPick[:,0],[gridSize,gridSize])))

    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)
    
    print("Value function for place action for rot0 in hold-1 state:")
    print(str(np.reshape(qPlace[:gridSize**2,1],[gridSize,gridSize])))
    print("Value function for place action for rot1 in hold-1 state:")
    print(str(np.reshape(qPlace[gridSize**2:2*gridSize**2,1],[gridSize,gridSize])))
    print("Value function for place action for rot2 in hold-1 state:")
    print(str(np.reshape(qPlace[2*gridSize**2:3*gridSize**2,1],[gridSize,gridSize])))
    print("Value function for place action for rot3 in hold-1 state:")
    print(str(np.reshape(qPlace[3*gridSize**2:4*gridSize**2,1],[gridSize,gridSize])))    

#    print("Value function for place action in hold-1 state:")
#    print(str(np.reshape(qPlace[:,1],[gridSize,gridSize])))
    
    plt.subplot(2,5,1)
    plt.imshow(np.tile(env.state[0],[1,1,3]),interpolation=None)
    plt.subplot(2,5,2)
    plt.imshow(np.reshape(qPick[:gridSize**2,0],[gridSize,gridSize]),vmin=5,vmax=12)
    plt.subplot(2,5,3)
    plt.imshow(np.reshape(qPick[gridSize**2:2*gridSize**2,0],[gridSize,gridSize]),vmin=5,vmax=12)
    plt.subplot(2,5,4)
    plt.imshow(np.reshape(qPick[2*gridSize**2:3*gridSize**2,0],[gridSize,gridSize]),vmin=5,vmax=12)
    plt.subplot(2,5,5)
    plt.imshow(np.reshape(qPick[3*gridSize**2:4*gridSize**2,0],[gridSize,gridSize]),vmin=5,vmax=12)
    plt.subplot(2,5,7)
    plt.imshow(np.reshape(qPlace[:gridSize**2,1],[gridSize,gridSize]),vmin=5,vmax=12)
    plt.subplot(2,5,8)
    plt.imshow(np.reshape(qPlace[gridSize**2:2*gridSize**2,1],[gridSize,gridSize]),vmin=5,vmax=12)
    plt.subplot(2,5,9)
    plt.imshow(np.reshape(qPlace[2*gridSize**2:3*gridSize**2,1],[gridSize,gridSize]),vmin=5,vmax=12)
    plt.subplot(2,5,10)
    plt.imshow(np.reshape(qPlace[3*gridSize**2:4*gridSize**2,1],[gridSize,gridSize]),vmin=5,vmax=12)
    plt.show()
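
# The example above keeps only a two-entry state-value table V (not holding / holding).
# A minimal standalone sketch of the two updates it uses, with hypothetical numbers:
import numpy as np

lrState = 0.1
gamma = 0.9
V = np.zeros(2)                       # V[0]: not holding, V[1]: holding

# V(s) <- (1 - lr) * V(s) + lr * max_a Q(s, a), using the current max Q-value
q_curr_max = 7.5                      # hypothetical max_a Q(s, a) for the current state
state, next_state, rew, done = 0, 1, 0.0, 0.0
V[state] = (1 - lrState) * V[state] + lrState * q_curr_max

# TD target used to train the pick/place Q networks: r + (1 - done) * gamma * V(s')
target = rew + (1 - done) * gamma * V[next_state]
print(V, target)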
Example 14
class Agent:
    def __init__(self, dimO, dimA):
        dimA = list(dimA)
        dimO = list(dimO)

        nets = ddpg_nets_dm

        tau = FLAGS.tau
        discount = FLAGS.discount
        pl2norm = FLAGS.pl2norm
        l2norm = FLAGS.l2norm
        plearning_rate = FLAGS.prate
        learning_rate = FLAGS.rate
        outheta = FLAGS.outheta
        ousigma = FLAGS.ousigma

        # init replay memory

        if FLAGS.use_per:
            self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, alpha=FLAGS.alpha)
            self.beta_schedule = LinearSchedule(FLAGS.beta_iters,
                                                initial_p=FLAGS.beta0,
                                                final_p=1.0)
        else:
            self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA)

        # start tf session
        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=FLAGS.thread,
            log_device_placement=False,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True)))

        # create tf computational graph
        #
        self.theta_p = nets.theta_p(dimO, dimA, FLAGS.l1size, FLAGS.l2size)
        self.theta_q = nets.theta_q(dimO, dimA, FLAGS.l1size, FLAGS.l2size)
        self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau)
        self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau)

        obs = tf.placeholder(tf.float32, [None] + dimO, "obs")
        act_test = nets.policy(obs, self.theta_p)

        # explore
        noise_init = tf.zeros([1] + dimA)
        noise_var = tf.Variable(noise_init)
        self.ou_reset = noise_var.assign(noise_init)
        noise = noise_var.assign_sub((outheta) * noise_var - tf.random_normal(dimA, stddev=ousigma))
        act_expl = act_test + noise

        # test
        q = nets.qfunction(obs, act_test, self.theta_q)
        # training

        # q optimization
        act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train")
        rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew")
        obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2")
        term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2")

        # experience replay
        per_weight = tf.placeholder(tf.float32, [None], "per_weight")

        # policy loss
        act_train_policy = nets.policy(obs, self.theta_p)
        q_train_policy = nets.qfunction(obs, act_train_policy, self.theta_q)
        meanq = tf.reduce_mean(q_train_policy, 0)
        wd_p = tf.add_n([pl2norm * tf.nn.l2_loss(var) for var in self.theta_p])  # weight decay
        loss_p = -meanq + wd_p
        # policy optimization
        optim_p = tf.train.AdamOptimizer(learning_rate=plearning_rate, epsilon=1e-4)
        grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p)
        optimize_p = optim_p.apply_gradients(grads_and_vars_p)
        with tf.control_dependencies([optimize_p]):
            train_p = tf.group(update_pt)

        # q
        q_train = nets.qfunction(obs, act_train, self.theta_q)

        # q targets
        act2 = nets.policy(obs2, theta=self.theta_pt)
        q2 = nets.qfunction(obs2, act2, theta=self.theta_qt)
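        # TD target: r for terminal transitions, otherwise r + discount * Q'(s2, pi'(s2)),
        # with pi' and Q' taken from the exponential-moving-average target networks.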
        q_target = tf.stop_gradient(tf.where(term2, rew, rew + discount * q2))
        # q_target = tf.stop_gradient(rew + discount * q2)
        # q loss
        td_error = q_train - q_target

        if FLAGS.use_per:
            ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weight), 0)
        else:
            ms_td_error = tf.reduce_mean(tf.square(td_error), 0)

        wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var) for var in self.theta_q])  # weight decay
        loss_q = ms_td_error + wd_q
        # q optimization
        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4)
        grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)
        with tf.control_dependencies([optimize_q]):
            train_q = tf.group(update_qt)


        summary_path = os.path.join(model_path, 'board', FLAGS.exp_id)
        summary_writer = tf.summary.FileWriter(summary_path, self.sess.graph)

        if FLAGS.summary:
            tf.summary.scalar('Qvalue', tf.reduce_mean(q_train))
            tf.summary.scalar('loss', ms_td_error)
            tf.summary.scalar('reward', tf.reduce_mean(rew))
        merged = tf.summary.merge_all()

        # tf functions
        with self.sess.as_default():
            self._act_test = Fun(obs, act_test)
            self._act_expl = Fun(obs, act_expl)
            self._reset = Fun([], self.ou_reset)
            self._train = Fun([obs, act_train, rew, obs2, term2, per_weight],
                              [train_p, train_q, loss_q, td_error, q, q_target],
                              merged, summary_writer)

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(model_path + "/tf")
        if not FLAGS.force and ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.global_variables_initializer())
        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)

    def reset(self, obs):
        self._reset()
        self.observation = obs  # initial observation

    def act(self, test=False):
        obs = np.expand_dims(self.observation, axis=0)
        action = self._act_test(obs) if test else self._act_expl(obs)
        action = np.clip(action, -1, 1)
        self.action = np.atleast_1d(np.squeeze(action, axis=0))  # TODO: remove this hack
        return self.action

    def observe(self, rew, term, obs2, test=False):

        obs1 = self.observation
        self.observation = obs2

        # train
        if not test:
            self.t = self.t + 1

            if FLAGS.use_per:
                self.rm.add(obs1, self.action, rew, obs2, float(term))
            else:
                self.rm.enqueue(obs1, term, self.action, rew)

            if self.t > FLAGS.warmup:
                for i in range(FLAGS.iter):
                    loss = self.train()

    def train(self):
        if FLAGS.use_per:
            experience = self.rm.sample(FLAGS.bsize, beta=self.beta_schedule.value(self.t))
            (obs, act, rew, ob2, term2, weights, batch_idxes) = experience
        else:
            obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize)
            # Without prioritized replay there are no importance weights; use uniform
            # ones so the same training function can be fed in both cases.
            weights = np.ones_like(rew)

        _, _, loss, td_error, _, _ = self._train(obs, act, rew, ob2, term2, weights,
                                                 log=FLAGS.summary,
                                                 global_step=self.t)
        return loss

    def __del__(self):
        self.sess.close()
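
# For reference, the exploration noise above follows an Ornstein-Uhlenbeck-style
# process: the assign_sub line is equivalent to
#   noise <- (1 - outheta) * noise + Normal(0, ousigma).
# A minimal NumPy sketch of the same recursion (theta and sigma values here are
# hypothetical, not the FLAGS defaults):
import numpy as np

def ou_noise_sequence(steps, dim, theta=0.15, sigma=0.2, seed=0):
    """Temporally correlated exploration noise: n <- (1 - theta) * n + N(0, sigma)."""
    rng = np.random.default_rng(seed)
    noise = np.zeros(dim)
    out = []
    for _ in range(steps):
        noise = (1.0 - theta) * noise + rng.normal(0.0, sigma, size=dim)
        out.append(noise.copy())
    return np.array(out)

print(ou_noise_sequence(5, 2))   # five steps of correlated noise for a 2-D action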