Example #1
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 4

    deicticShape = (3, 3, 4)
    num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low level
    # and a foveated view.
    # input: n x n x 1
    # output: (num patches) x windowLen x windowLen x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        deicticObsThis = np.zeros(
            (windowLen, windowLen, 4)
        )  # channel 0: agent (zoom-in); channel 1: ball (zoom-in); channel 2: agent (zoom-out); channel 3: ball (zoom-out)
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):
                    # THE VERSION BELOW USES A FIXED VIEW
                    #                    deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                                 [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                                 [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    #                    deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])],
                    #                                 [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])],
                    #                                 [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [[(k in patch[0:3, 0:3]),
                                                    (k in patch[0:3, 3:6]),
                                                    (k in patch[0:3, 6:9])],
                                                   [(k in patch[3:6, 0:3]),
                                                    (k in patch[3:6, 3:6]),
                                                    (k in patch[3:6, 6:9])],
                                                   [(k in patch[6:9, 0:3]),
                                                    (k in patch[6:9, 3:6]),
                                                    (k in patch[6:9, 6:9])]]
                deicticObs.append(
                    deicticObsThis.copy()
                )  # append a copy, not a reference (appending the reference itself was a bug in an earlier version)

        return np.array(deicticObs)

    # input: (num patches) x windowLen x windowLen x 4 stack of deictic observations
    # output: 4 x (num patches) matrix of numeric patch codes (9 = object absent)
    def convertState(observations):

        # Reshape to batch x flatimage x channel.
        # Channel1 = zoomin agent, channel2 = zoomin ball
        # Channel3 = zoomout agent, channel4 = zoomout ball
        obs = np.zeros((36, 9, 4))
        for i in range(4):
            obs[:, :, i] = np.reshape(observations[:, :, :, i], [36, 9])

        # state_numeric: 4 x batch.
        # row0: pos of agent in zoomin, row1: pos of ball in zoomin
        # row2: pos of agent in zoomout, row3: pos of ball in zoomout
        shape = np.shape(obs)
        state_numeric = 9 * np.ones(
            (4, shape[0])
        )  # 9 indicates agent/ball does not appear at this zoom in this glance
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2] == i)[0]
            state_numeric[i, pos[0][idx]] = pos[1][idx]
#            state_numeric[i,pos[0][pos[2] == i]] = pos[1][pos[2] == i]

        return np.int32(state_numeric)

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return (np.array(state_numeric_batch))

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: batch x n x n x 1; output: batch x (num patches) x windowLen x windowLen x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return (np.array(deicticObsBatch))

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #        convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        #        convs=[(32,3,1)],
        hiddens=[16],
        #        hiddens=[64],
        #        dueling=True
        dueling=False)

    q_func = model
    lr = 1e-3

    def make_obs_ph(name):
        return U.BatchInput(deicticShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_cascaded(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_cascade=num_cascade,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    #    update_target()

    dimSize = deicticShape[0] * deicticShape[1] + 1
    tabularQ = 1 * np.ones(
        (dimSize, dimSize, dimSize, dimSize, num_cascade, num_actions))

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)

        #        # Get current q-values: tabular version
        #        stateCurr = convertState(obsDeictic)
        #        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],-1,:]

        # Get current q-values: neural network version
        qCurr = getq(np.array(obsDeictic))[:, -1, :]

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        #        # debug
        #        if t > 5000:
        #            print("obs:\n" + str(np.squeeze(obs)))
        #            print("qCurr:\n" + str(qCurr))
        #            print("action: " + str(action) + ", patch: " + str(selPatch))
        #            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
        #            action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            obs_resize_to_network = [
                batch_size * num_deictic_patches, deicticShape[0],
                deicticShape[1], deicticShape[2]
            ]
            q_resize_from_network = [
                batch_size, num_deictic_patches, num_cascade, num_actions
            ]
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            obses_t_deic = getDeicticObsBatch(obses_t)
            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            #            # Get curr, next values: tabular version
            #            stateNext = convertStateBatch(obses_tp1_deic)
            #            qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],-1,:]
            #            stateCurr = convertStateBatch(obses_t_deic)
            #            qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,:]

            # Get curr, next values: neural network version
            qNext = np.reshape(
                getq(np.reshape(obses_tp1_deic, obs_resize_to_network)),
                q_resize_from_network)[:, :, -1, :]
            qCurr = np.reshape(
                getq(np.reshape(obses_t_deic, obs_resize_to_network)),
                q_resize_from_network)

            # Get "raw" targets (no masking for cascade levels)
            qNextmax = np.max(np.max(qNext, 2), 1)
            targetsRaw = rewards + (1 - dones) * gamma * qNextmax
            targetsTiled = np.tile(np.reshape(targetsRaw, [batch_size, 1, 1]),
                                   [1, num_deictic_patches, num_cascade])

            # Get qCurrActionSelect
            actionsTiled = np.tile(np.reshape(actions, [batch_size, 1, 1]),
                                   [1, num_deictic_patches, num_cascade])
            qCurrActionSelect = np.zeros(
                (batch_size, num_deictic_patches, num_cascade))
            for i in range(num_actions):
                qCurrActionSelect += (actionsTiled == i) * qCurr[:, :, :, i]

            # Get targets masked for cascade level
            targetMask = targetsTiled < qCurrActionSelect
            targets = np.zeros((batch_size, num_deictic_patches, num_cascade))
            targets[:, :, 0] = targetsTiled[:, :, 0]
            targets[:, :, 1] = targetMask[:, :, 0] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 0]) * qCurrActionSelect[:, :, 1]
            targets[:, :, 2] = targetMask[:, :, 1] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 1]) * qCurrActionSelect[:, :, 2]
            targets[:, :, 3] = targetMask[:, :, 2] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 2]) * qCurrActionSelect[:, :, 3]
            targets[:, :, 4] = targetMask[:, :, 3] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 3]) * qCurrActionSelect[:, :, 4]

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actionsTiled == i
                qCurrTargets[:, :, :, i] = myActions * targets + (
                    1 - myActions) * qCurr[:, :, :, i]


#            # Update values: tabular version
#            tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,actionsTiled[:,:,0]] = \
#                (1 - learning_alpha) * tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,actionsTiled[:,:,0]] \
#                + learning_alpha * targets

            # Update values: neural network version
            targets_resize_to_network = [
                batch_size * num_deictic_patches, num_cascade, num_actions
            ]
            td_error_out, obses_out, targets_out = targetTrain(
                np.reshape(obses_t_deic, obs_resize_to_network),
                np.reshape(qCurrTargets, targets_resize_to_network))

            td_error_pre = qCurrActionSelect - targets
            #            print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            #            # tabular version
            #            qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,:]

            # neural network version
            qCurr = np.reshape(
                getq(np.reshape(obses_t_deic, obs_resize_to_network)),
                q_resize_from_network)

            qCurrActionSelect_post = np.zeros(
                (batch_size, num_deictic_patches, num_cascade))
            for i in range(num_actions):
                qCurrActionSelect_post += (actionsTiled == i) * qCurr[:, :, :,
                                                                      i]

            td_error_post = qCurrActionSelect_post - targets
            #            print("td error post-update: " + str(np.linalg.norm(td_error_post)))

            if -1 in rewards:
                pass  # debug hook: set a breakpoint here to inspect negative-reward transitions

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
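
A minimal standalone sketch of the cascade pruning used in the training step above (this is not part of the original example; the array names and toy numbers are illustrative). Level 0 always takes the raw Bellman target, and each deeper level accepts the target only where it would lower the value the previous level currently assigns to the chosen action:

import numpy as np

num_cascade = 5
targets_raw = np.array([1.0, 4.0, 2.5])            # one Bellman target per (sample, patch) row
q_selected = np.array([[3.0, 2.0, 2.0, 2.0, 2.0],  # current Q of the taken action at each cascade level
                       [3.0, 3.5, 3.5, 3.5, 3.5],
                       [3.0, 1.0, 1.0, 1.0, 1.0]])

targets = np.empty_like(q_selected)
targets[:, 0] = targets_raw                        # level 0: unpruned Bellman target
for level in range(num_cascade - 1):
    mask = targets_raw < q_selected[:, level]      # pass the target down only where it lowers the estimate
    targets[:, level + 1] = np.where(mask, targets_raw, q_selected[:, level + 1])

print(targets)   # rows whose target exceeds the current estimate keep their old deeper-level values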
Example #2
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,1)
    #    deicticShape = (3,3,2)
    #    deicticShape = (4,4,1)
    #    deicticShape = (4,4,2)
    deicticShape = (4, 4, 3)
    #    deicticShape = (3,3,4)
    num_deictic_patches = 25

    #    num_actions = 4
    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low level
    # and a foveated view.
    # input: n x n x 1
    # output: (num patches) x windowLen x windowLen x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        deicticObsThis = np.zeros(
            (windowLen, windowLen, 4)
        )  # channel 0: agent (zoom-in); channel 1: ball (zoom-in); channel 2: agent (zoom-out); channel 3: ball (zoom-out)
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):
                    # THE VERSION BELOW USES A FIXED VIEW
                    #                    deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                                 [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                                 [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    #                    deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])],
                    #                                 [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])],
                    #                                 [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [[(k in patch[0:3, 0:3]),
                                                    (k in patch[0:3, 3:6]),
                                                    (k in patch[0:3, 6:9])],
                                                   [(k in patch[3:6, 0:3]),
                                                    (k in patch[3:6, 3:6]),
                                                    (k in patch[3:6, 6:9])],
                                                   [(k in patch[6:9, 0:3]),
                                                    (k in patch[6:9, 3:6]),
                                                    (k in patch[6:9, 6:9])]]
                deicticObs.append(
                    deicticObsThis.copy()
                )  # append a copy, not a reference (appending the reference itself was a bug in an earlier version)

        return np.array(deicticObs)

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: batch x n x n x 1; output: (batch * num patches) x windowLen x windowLen x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        shape = np.shape(deicticObsBatch)
        return (np.reshape(
            np.array(deicticObsBatch),
            [shape[0] * shape[1], shape[2], shape[3], shape[4]]))

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
#    model = models.cnn_to_mlp(
#        convs=[(16,4,1)],
#        hiddens=[16],
#        dueling=True
#    )

    # MLP version
    model = models.mlp([16, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        # CNN version
        #        return U.BatchInput(deicticShape, name=name)

        # MLP version
        return U.BatchInput(
            [deicticShape[0] * deicticShape[1] * deicticShape[2]], name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade)

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        #        obsDeictic = getDeicticObs(obs)
        obsDeictic = getDeic([obs])
        #        obsDeictic, patchesTiledStacked2 = getDeic([obs])

        #        # CNN version
        #        qCurr = getq(np.array(obsDeictic))

        # MLP version
        qCurr = getq(
            np.reshape(
                obsDeictic,
                [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))
        selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
            #            obses_t_deic = getDeicticObsBatch(obses_t)
            #            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            # Reshape everything to (batch_size * num_deictic_patches,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            #            # Get curr, next values: CNN version
            #            qNext = getq(obses_tp1_deic)
            #            qCurr = getq(obses_t_deic)

            # Get curr, next values: MLP version
            qNext = getq(
                np.reshape(
                    obses_tp1_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))
            qCurr = getq(
                np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            #            targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade])

            qCurrTargets = np.copy(qCurr)

            #            # Copy into cascade without pruning
            #            for i in range(num_cascade):
            #                qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets

            # Copy into cascade with pruning.
            qCurrTargets[range(batch_size * num_deictic_patches), 0,
                         actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size *
                                                    num_deictic_patches), i,
                                              actionsTiled]
                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]


#            # CNN version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    obses_t_deic,
#                    qCurrTargets
#                    )

            # MLP version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]),
                qCurrTargets)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
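
A small self-contained sketch of the np.repeat bookkeeping used in the training step above (illustrative shapes and numbers, not from the original code). Because getq returns one row per (sample, patch) pair, the per-sample rewards and done flags are repeated num_deictic_patches times so that they line up with the flattened per-patch layout before the Bellman targets are computed:

import numpy as np

batch_size, num_deictic_patches, gamma = 2, 3, 0.98

rewards = np.array([0.0, 1.0])
dones = np.array([0.0, 1.0])
# Per-patch max next-state Q-values, flattened as (batch_size * num_deictic_patches,),
# with all patches of sample 0 first, then all patches of sample 1.
qNextmax = np.array([0.5, 0.7, 0.6, 0.2, 0.9, 0.1])

rewardsTiled = np.repeat(rewards, num_deictic_patches)   # [0, 0, 0, 1, 1, 1]
donesTiled = np.repeat(dones, num_deictic_patches)       # [0, 0, 0, 1, 1, 1]
targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax
print(targets)   # patches of the terminal sample get the bare reward; the others get reward + gamma * max Q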
Example #3
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    #    batch_size=2
    train_freq = 4
    #    train_freq=8

    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low level
    # and a foveated view.
    # input: n x n x 1
    # output: (num patches) x windowLen x windowLen x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        deicticObsThis = np.zeros(
            (windowLen, windowLen, 4)
        )  # channel 0: agent (zoom-in); channel 1: ball (zoom-in); channel 2: agent (zoom-out); channel 3: ball (zoom-out)
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):
                    # THE VERSION BELOW USES A FIXED VIEW
                    #                    deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                                 [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                                 [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    #                    deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])],
                    #                                 [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])],
                    #                                 [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [[(k in patch[0:3, 0:3]),
                                                    (k in patch[0:3, 3:6]),
                                                    (k in patch[0:3, 6:9])],
                                                   [(k in patch[3:6, 0:3]),
                                                    (k in patch[3:6, 3:6]),
                                                    (k in patch[3:6, 6:9])],
                                                   [(k in patch[6:9, 0:3]),
                                                    (k in patch[6:9, 3:6]),
                                                    (k in patch[6:9, 6:9])]]
                deicticObs.append(
                    deicticObsThis.copy()
                )  # append a copy, not a reference (appending the reference itself was a bug in an earlier version)

        return np.array(deicticObs)

    # input: (num patches) x windowLen x windowLen x 4 stack of deictic observations
    # output: 4 x (num patches) matrix of numeric patch codes (9 = object absent)
    def convertState(observations):

        # Reshape to batch x flatimage x channel.
        # Channel1 = zoomin agent, channel2 = zoomin ball
        # Channel3 = zoomout agent, channel4 = zoomout ball
        obs = np.zeros((36, 9, 4))
        for i in range(4):
            obs[:, :, i] = np.reshape(observations[:, :, :, i], [36, 9])

        # state_numeric: 4 x batch.
        # row0: pos of agent in zoomin, row1: pos of ball in zoomin
        # row2: pos of agent in zoomout, row3: pos of ball in zoomout
        shape = np.shape(obs)
        state_numeric = 9 * np.ones(
            (4, shape[0])
        )  # 9 indicates agent/ball does not appear at this zoom in this glance
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2] == i)[0]
            state_numeric[i, pos[0][idx]] = pos[1][idx]


#            state_numeric[i,pos[0][pos[2] == i]] = pos[1][pos[2] == i]

        return np.int32(state_numeric)

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return (np.array(state_numeric_batch))

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: batch x n x n x 1; output: batch x (num patches) x windowLen x windowLen x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return (np.array(deicticObsBatch))

    dimSize = deicticShape[0] * deicticShape[1] + 1
    #    tabularQ = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ1 = 100 * np.ones(
        [dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ2 = 100 * np.ones(
        [dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ3 = 100 * np.ones(
        [dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ4 = 100 * np.ones(
        [dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ5 = 100 * np.ones(
        [dimSize, dimSize, dimSize, dimSize, num_actions])

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()
    #    OHEnc = np.identity(max_num_groups)

    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)
        #        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
        qCurr = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2],
                          stateCurr[3], :]

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        item = (obs, action, rew, new_obs, float(done))  # unused; the transition is added to the buffer directly below
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # debug
        if t > max_timesteps * 1.1:
            print("obs:\n" + str(np.squeeze(obs)))
            print("qCurr:\n" + str(qCurr))
            print("action: " + str(action) + ", patch: " + str(selPatch))
            print("close:\n" + str(obsDeictic[selPatch, :, :, 0] +
                                   obsDeictic[selPatch, :, :, 1]))
            print("far:\n" + str(obsDeictic[selPatch, :, :, 2] +
                                 obsDeictic[selPatch, :, :, 3]))
            action  # no-op; retained as a breakpoint anchor for debugging

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            stateCurr = convertStateBatch(getDeicticObsBatch(obses_t))
            stateNext = convertStateBatch(getDeicticObsBatch(obses_tp1))
            #            qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],:]
            qNext = tabularQ5[stateNext[:, 0, :], stateNext[:, 1, :],
                              stateNext[:, 2, :], stateNext[:, 3, :], :]
            qNextmax = np.max(np.max(qNext, 2), 1)
            targets = rewards + (1 - dones) * gamma * qNextmax
            targetsTiled = np.tile(np.reshape(targets, [batch_size, 1]),
                                   [1, num_deictic_patches])
            actionsTiled = np.tile(np.reshape(actions, [batch_size, 1]),
                                   [1, num_deictic_patches])

            #            tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] = np.minimum(targetsTiled, tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled])

            target2_mask = targetsTiled < tabularQ1[stateCurr[:, 0, :],
                                                    stateCurr[:, 1, :],
                                                    stateCurr[:, 2, :],
                                                    stateCurr[:, 3, :],
                                                    actionsTiled]
            target3_mask = targetsTiled < tabularQ2[stateCurr[:, 0, :],
                                                    stateCurr[:, 1, :],
                                                    stateCurr[:, 2, :],
                                                    stateCurr[:, 3, :],
                                                    actionsTiled]
            target4_mask = targetsTiled < tabularQ3[stateCurr[:, 0, :],
                                                    stateCurr[:, 1, :],
                                                    stateCurr[:, 2, :],
                                                    stateCurr[:, 3, :],
                                                    actionsTiled]
            target5_mask = targetsTiled < tabularQ4[stateCurr[:, 0, :],
                                                    stateCurr[:, 1, :],
                                                    stateCurr[:, 2, :],
                                                    stateCurr[:, 3, :],
                                                    actionsTiled]
            targets1 = targetsTiled
            targets2 = target2_mask * targetsTiled + (
                1 - target2_mask
            ) * tabularQ2[stateCurr[:, 0, :], stateCurr[:, 1, :],
                          stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]
            targets3 = target3_mask * targetsTiled + (
                1 - target3_mask
            ) * tabularQ3[stateCurr[:, 0, :], stateCurr[:, 1, :],
                          stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]
            targets4 = target4_mask * targetsTiled + (
                1 - target4_mask
            ) * tabularQ4[stateCurr[:, 0, :], stateCurr[:, 1, :],
                          stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]
            targets5 = target5_mask * targetsTiled + (
                1 - target5_mask
            ) * tabularQ5[stateCurr[:, 0, :], stateCurr[:, 1, :],
                          stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]

            tabularQ1[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] = \
                (1 - learning_alpha) * tabularQ1[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] \
                + learning_alpha * targets1
            tabularQ2[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] = \
                (1 - learning_alpha) * tabularQ2[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] \
                + learning_alpha * targets2
            tabularQ3[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] = \
                (1 - learning_alpha) * tabularQ3[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] \
                + learning_alpha * targets3
            tabularQ4[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] = \
                (1 - learning_alpha) * tabularQ4[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] \
                + learning_alpha * targets4
            tabularQ5[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] = \
                (1 - learning_alpha) * tabularQ5[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] \
                + learning_alpha * targets5

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))))

        obs = new_obs
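
For reference, a tiny standalone sketch of the smoothed tabular update applied to each of the five tables above (toy numbers, not taken from a real run). Every selected entry moves a fraction learning_alpha of the way toward its (possibly pruned) target, so repeated visits gradually pull the optimistic initial value of 100 down toward the target:

import numpy as np

learning_alpha = 0.2
q_old = np.array([100.0, 100.0, 100.0])   # optimistic initial values, as in the tables above
targets = np.array([0.0, 0.98, -1.0])     # hypothetical pruned targets for three visited entries

q_new = (1 - learning_alpha) * q_old + learning_alpha * targets
print(q_new)                              # [80.0, 80.196, 79.8]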
Example #4
def main():


    env = envstandalone.BallCatch()
    
    max_timesteps=20000
    buffer_size=50000
    exploration_fraction=0.2
    exploration_final_eps=0.02
    print_freq=10
    learning_starts=1000
    gamma=.98
    target_network_update_freq=500
    learning_alpha = 0.2
    
    batch_size=32
    train_freq=2

    deicticShape = (3,3,1)
    num_deictic_patches=36

    num_actions = 3
    episode_rewards = [0.0]

#    replay_buffer = ReplayBuffer(buffer_size)
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low level
    # and a foveated view.
    # input: n x n x 1
    # output: (num patches) x windowLen x windowLen x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros((obsShape[0]+2*windowLen,obsShape[1]+2*windowLen))
        obsPadded[windowLen:windowLen+obsShape[0],windowLen:windowLen+obsShape[1]] = obs[:,:,0]
        deicticObsThis = np.zeros((windowLen,windowLen,4)) # channel 0: agent (zoom-in); channel 1: ball (zoom-in); channel 2: agent (zoom-out); channel 3: ball (zoom-out)
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:,:,0] = obs[i:i+windowLen,j:j+windowLen,0] == 1 # agent zoomin
                deicticObsThis[:,:,1] = obs[i:i+windowLen,j:j+windowLen,0] == 2 # ball zoomin
                patch = obsPadded[i:i+3*windowLen,j:j+3*windowLen]
                for k in range(1,3):
# THE VERSION BELOW USES A FIXED VIEW
#                    deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])], 
#                                 [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
#                                 [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]
# THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])], 
                                 [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])], 
                                 [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]
# THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
#                    deicticObsThis[:,:,k+1] = [[(k in patch[0:3,0:3]), (k in patch[0:3,3:6]), (k in patch[0:3,6:9])], 
#                                 [(k in patch[3:6,0:3]), (k in patch[3:6,3:6]), (k in patch[3:6,6:9])], 
#                                 [(k in patch[6:9,0:3]), (k in patch[6:9,3:6]), (k in patch[6:9,6:9])]]
                deicticObs.append(deicticObsThis.copy()) # append a copy, not a reference (appending the reference itself was a bug in an earlier version)

        return np.array(deicticObs)


    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: batch x n x n x 1; output: batch x (num patches) x windowLen x windowLen x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return(np.array(deicticObsBatch))


    # input: (num patches) x windowLen x windowLen x 4 stack of deictic observations
    # output: (num patches) x 4 matrix of numeric patch codes (9 = object absent)
    def convertState(observations):
        
        # Reshape to batch x flatimage x channel.
        # Channel1 = zoomin agent, channel2 = zoomin ball
        # Channel3 = zoomout agent, channel4 = zoomout ball
        obs = np.zeros((36,9,4))
        for i in range(4):
            obs[:,:,i] = np.reshape(observations[:,:,:,i],[36,9])

        # state_numeric: (num patches) x 4.
        # col0: pos of agent in zoomin, col1: pos of ball in zoomin
        # col2: pos of agent in zoomout, col3: pos of ball in zoomout
        shape = np.shape(obs)
#        state_numeric = 9*np.ones((4,shape[0])) # 9 indicates agent/ball does not appear at this zoom in this glance
        state_numeric = 9*np.ones((shape[0],4)) # 9 indicates agent/ball does not appear at this zoom in this glance
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2]==i)[0]
#            state_numeric[i,pos[0][idx]] = pos[1][idx]
            state_numeric[pos[0][idx],i] = pos[1][idx]
        
        return np.int32(state_numeric)


    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return(np.array(state_numeric_batch))


    dimSize = deicticShape[0]*deicticShape[1] + 1
    tabularQ = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
#    tabularQ1 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
#    tabularQ2 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
#    tabularQ3 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
#    tabularQ4 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
#    tabularQ5 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    
    obs = env.reset()
#    OHEnc = np.identity(max_num_groups)


    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)

#        # do a couple of spot checks to verify that obsDeictic is correct
#        num2check = 17
#        print(str(obsDeictic[num2check,:,:,0] + obsDeictic[num2check,:,:,1]))
#        print(str(obsDeictic[num2check,:,:,2] + obsDeictic[num2check,:,:,3]))

#        qCurr = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
        
        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise,0))
        selPatch = np.argmax(np.max(qCurrNoise,1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)


#        env.render()
#        print("action: " + str(action))

        # take action
        new_obs, rew, done, _ = env.step(action)

#        replay_buffer.add(obs, action, rew, new_obs, float(done))
        
#        if done == 1:
#            print("action: " + str(action) + ", patch: " + str(selPatch) + ", reward: " + str(rew))
#            action
            
        if t > max_timesteps * 1.05:
            print("obs:\n" + str(np.squeeze(obs)))
            print("qCurr:\n" + str(qCurr))
            print("action: " + str(action) + ", patch: " + str(selPatch))
            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
            action  # no-op; retained as a breakpoint anchor for debugging
        
#        if t > learning_starts and t % train_freq == 0:
#            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            
#        obses_t = np.reshape(obs,[1,8,8,1])
#        obses_tp1 = np.reshape(new_obs,[1,8,8,1])
#        stateCurr = convertStateBatch(getDeicticObsBatch(obses_t))
#        stateNext = convertStateBatch(getDeicticObsBatch(obses_tp1))
#        qNext = tabularQ[stateNext[:,:,0], stateNext[:,:,1], stateNext[:,:,2], stateNext[:,:,3],:]
#        qNextmax = np.max(np.max(qNext,2),1)
#        targets = rew + (1-done) * gamma * qNextmax
#        batch_size = 1
#        targets = np.tile(np.reshape(targets,[batch_size,1]),[1,num_deictic_patches])
#        tabularQ[stateCurr[:,:,0], stateCurr[:,:,1], stateCurr[:,:,2], stateCurr[:,:,3],action] = np.minimum(targets, tabularQ[stateCurr[:,:,0], stateCurr[:,:,1], stateCurr[:,:,2], stateCurr[:,:,3],action])
            
        stateNext = convertState(getDeicticObs(new_obs))
        qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:]
        qNextmax = np.max(qNext)
        targets = rew + (1-done) * gamma * qNextmax
        tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = np.minimum(targets, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action])
        
            
#        # get next q-values
#        stateNext = convertState(getDeicticObs(new_obs))
#        qNext5 = tabularQ5[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:]
#        qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:]
        
#        # perform learning update
#        qNextmax = np.max(qNext)
#        targets = rew + (1-done) * gamma * qNextmax
        
#        max_negative_td_error = np.max(np.abs(targets - tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]) * np.int32(targets < tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]))
#        if max_negative_td_error > 5:
#            max_negative_td_error
#        print("max_td_error: " + str(max_negative_td_error))
#        print("curr tabularQ:\n" + str(tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]))
#        print("targets:\n" + str(targets))        
#        tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = np.minimum(targets, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action])
        
#        target2_mask = targets < tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        target3_mask = targets < tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        target4_mask = targets < tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        target5_mask = targets < tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        targets1 = targets
#        targets2 = target2_mask * targets + (1 - target2_mask) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        targets3 = target3_mask * targets + (1 - target3_mask) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        targets4 = target4_mask * targets + (1 - target4_mask) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        targets5 = target5_mask * targets + (1 - target5_mask) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        
#        tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
#            (1 - learning_alpha) * tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
#            + learning_alpha * targets1
#        tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
#            (1 - learning_alpha) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
#            + learning_alpha * targets2
#        tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
#            (1 - learning_alpha) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
#            + learning_alpha * targets3
#        tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
#            (1 - learning_alpha) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
#            + learning_alpha * targets4
#        tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
#            (1 - learning_alpha) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
#            + learning_alpha * targets5


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
#            print("************************* Episode done! **************************")
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
        
        obs = new_obs
        
        # stop at the end of training
        if t > max_timesteps * 1.1:
#            np.set_printoptions(precision=1)
#            np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
            np.set_printoptions(formatter={'float_kind':lambda x: "%.1f" % x})
            
#            qCurr1 = tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
#            qCurr2 = tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
#            qCurr3 = tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
#            qCurr4 = tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
#            qCurr5 = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
#            todisplay = np.c_[np.max(qCurr1,1), np.max(qCurr2,1), np.max(qCurr3,1), np.max(qCurr4,1), np.max(qCurr5,1), obsDeicticReshape]
#            todisplay = np.c_[qCurr5,np.transpose(stateCurr)]

            print("obs:\n" + str(np.squeeze(obs)))

#            todisplay = np.c_[np.max(qCurr5,1),np.transpose(stateCurr)]
#            print("q-values:\n" + str(todisplay))
#
#            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
#            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
#            print("action: " + str(action) + ", patch: " + str(selPatch))
            action  # no-op; retained as a breakpoint anchor for debugging

#                print("obs:\n" + str(np.squeeze(obs)))
#                print("patch:\n" + str(np.reshape(obsDeictic[selPatch],(3,3))))
#                print("action: " + str(action) + ", patch: " + str(selPatch))
#                t
            
            

    t  # no-op; retained as a final breakpoint anchor
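
A minimal sketch of the monotone update this example uses in place of the cascade (illustrative values only, not from the original run). With an optimistic table, np.minimum lets an entry move only downward toward the Bellman target, never back up:

import numpy as np

gamma = 0.98
q_table = 100.0 * np.ones(3)                     # optimistic initialization, as in the example
rew, done, action = 1.0, 0.0, 2
q_next_max = 100.0                               # max next-state value under the same table

target = rew + (1 - done) * gamma * q_next_max   # 99.0
q_table[action] = np.minimum(target, q_table[action])
print(q_table)                                   # [100., 100., 99.]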
Example #5
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,4)
    #    num_deictic_patches=36

    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #        convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        #        convs=[(32,3,1)],
        hiddens=[16],
        #        hiddens=[64],
        #        dueling=True
        dueling=False)

    q_func = model
    #    lr=1e-3
    lr = 0.001

    def make_obs_ph(name):
        #        return U.BatchInput(deicticShape, name=name)
        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    #    update_target()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current q-values: neural network version
        qCurr = getq(np.array([obs]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        #        # debug
        #        if t > 5000:
        #            print("obs:\n" + str(np.squeeze(obs)))
        #            print("qCurr:\n" + str(qCurr))
        #            print("action: " + str(action) + ", patch: " + str(selPatch))
        #            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
        #            action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            actions = np.int32(np.reshape(actions, [
                batch_size,
            ]))

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # Get targets
            qNextmax = np.max(qNext, 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (
                    1 - myActions) * qCurr[:, i]

            # Update values: neural network version
            td_error_out, obses_out, targets_out = targetTrain(
                obses_t, qCurrTargets)

            td_error_pre = qCurr[range(batch_size), actions] - targets

            #            print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # neural network version
            qCurr = getq(obses_t)

            td_error_post = qCurr[range(batch_size), actions] - targets


#            print("td error post-update: " + str(np.linalg.norm(td_error_post)))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
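
The per-action loop that builds qCurrTargets above can also be written as a single fancy-indexed assignment; the following is a short standalone sketch under assumed shapes (the variable values are illustrative, not from the original code):

import numpy as np

batch_size, num_actions, gamma = 4, 3, 0.98
rng = np.random.default_rng(0)
qCurr = rng.random((batch_size, num_actions))
qNext = rng.random((batch_size, num_actions))
actions = np.array([0, 2, 1, 0])
rewards = np.array([0.0, 1.0, 0.0, -1.0])
dones = np.array([0.0, 0.0, 1.0, 0.0])

targets = rewards + (1 - dones) * gamma * np.max(qNext, axis=1)
qCurrTargets = qCurr.copy()
qCurrTargets[np.arange(batch_size), actions] = targets   # only the taken action's entry changes
print(qCurrTargets)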