Example #1
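All four examples below are listed without their import preamble. A hedged reconstruction of the imports they appear to rely on is sketched here; envstandalone, models, build_graph, the build_* graph builders, and the Monte-Carlo-capable ReplayBuffer are assumptions about project-local modules, while the tf_util helpers (U) and LinearSchedule follow the older OpenAI Baselines layout these scripts are written against.

import time

import numpy as np
import tensorflow as tf

import baselines.common.tf_util as U                   # make_session, initialize, BatchInput
from baselines.common.schedules import LinearSchedule

import envstandalone                                    # assumed: project-local toy environments
import models                                           # assumed: local variant of baselines.deepq.models (mlp, cnn_to_mlp, ...)
import build_graph                                      # assumed: local graph builders (build_train_nodouble, ...)
from build_graph import (build_getq, build_getq_DQN, build_targetTrain,
                         build_targetTrain_DQN, build_update_target,
                         build_getDeic_Foc, build_getDeic_FocCoarse,
                         build_get_2channelobs)
from replay_buffer import ReplayBuffer                  # assumed: local buffer with update_montecarlo()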
def main():

    env = envstandalone.MultiGhostEvade()
#    env = envstandalone.GhostEvade()
#    env = envstandalone.BallCatch()
    
    max_timesteps=40000
#    max_timesteps=80000
    learning_starts=1000
#    buffer_size=50000
    buffer_size=1000
#    exploration_fraction=0.2
    exploration_fraction=0.4
    exploration_final_eps=0.02
    print_freq=10
    gamma=.98
#    target_network_update_freq=500
#    target_network_update_freq=100
#    target_network_update_freq=10
    target_network_update_freq=1
    learning_alpha = 0.2
    
#    batch_size=32
#    batch_size=64
    batch_size=512
#    batch_size=1024
    train_freq=1

    obsShape = (8,8,1)
#    deicticShape = (3,3,2)
#    deicticShape = (3,3,4)
#    deicticShape = (4,4,2)
#    deicticShape = (4,4,4)
    deicticShape = (5,5,2)
#    deicticShape = (6,6,2)
#    deicticShape = (8,8,2)
#    num_deictic_patches = 36
#    num_deictic_patches = 25
    num_deictic_patches = 16
#    num_deictic_patches = 9
#    num_deictic_patches = 1

#    num_actions = 4
#    num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu=16
    num_cascade = 5
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)


#    # CNN version
#    # conv model parameters: (num_outputs, kernel_size, stride)
#    model = models.cnn_to_mlp(
###    model = models.cnn_to_mlp_2pathways(
###        convs=[(16,3,1)],
#        convs=[(32,3,1)],
###        convs=[(32,4,1)],
###        convs=[(16,4,1)],
##        hiddens=[16],
#        hiddens=[32],
#        dueling=True
#    )
    
    # MLP version
#    model = models.mlp([8, 16])
#    model = models.mlp([16, 16])
#    model = models.mlp([16, 32])
#    model = models.mlp([16, 16])
#    model = models.mlp([32, 32])
#    model = models.mlp([32])
    model = models.mlp([])

    q_func=model
#    lr=0.01
    lr=0.001
#    lr=0.0005
    
    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)
    
    def make_obsDeic_ph(name):

#        # CNN version
#        return U.BatchInput(deicticShape, name=name)
        
        # MLP version
        return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name)

    def make_target_ph(name):
#        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade,num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(
            make_obsDeic_ph=make_obsDeic_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=num_cascade,
            scope="deepq",
            qscope="q_func"
            )
    
    getqTarget = build_getq(
            make_obsDeic_ph=make_obsDeic_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=num_cascade,
            scope="deepq",
            qscope="q_func_target"
            )

    update_target = build_update_target(scope="deepq", 
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")
                      
    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
#        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func",
        grad_norm_clipping=1.
#        grad_norm_clipping=0.1
    )
    
    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
#    getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
    
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeic([obs])

##       CNN version
#        qCurr = getq(np.array(obsDeictic))
        
        # MLP version
        qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))


        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE
#        action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # MONTE CARLO VERSION
        # update rewards to actual monte carlo experiences
        if done:
            replay_buffer.update_montecarlo(gamma)
            
        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
#            obses_t_deic = getDeic(obses_t)[:,:,:,0:2]
#            obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2]
            
            # Tile rewards, dones, and actions so there is one entry per deictic patch: shape (batch_size*num_deictic_patches,)
            donesTiled = np.repeat(dones,num_deictic_patches)
            rewardsTiled = np.repeat(rewards,num_deictic_patches)
            actionsTiled = np.repeat(actions,num_deictic_patches)
            
#            # Get curr, next values: CNN version: NO ROTATION-AUGMENTATION 
#            qNextTarget = getqTarget(obses_tp1_deic)
#            qNext = getq(obses_tp1_deic)
#            qCurr = getq(obses_t_deic)
            
            # Get curr, next values: MLP version
            qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

#            # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS
#            obses_t_deicRot1 = np.rot90(obses_t_deic,k=3,axes=(1,2))
#            obses_t_deicRot2 = np.rot90(obses_t_deic,k=2,axes=(1,2))
#            obses_t_deicRot3 = np.rot90(obses_t_deic,k=1,axes=(1,2))
#            obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3]
#            obses_tp1_deicRot1 = np.rot90(obses_tp1_deic,k=3,axes=(1,2))
#            obses_tp1_deicRot2 = np.rot90(obses_tp1_deic,k=2,axes=(1,2))
#            obses_tp1_deicRot3 = np.rot90(obses_tp1_deic,k=1,axes=(1,2))
#            obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1, obses_tp1_deicRot2, obses_tp1_deicRot3]
#            qCurr = getq(np.array(obses_t_deic))
#            qNext = getq(np.array(obses_tp1_deic))
#            actionsTiled = np.r_[actionsTiled, actionsTiled+1, actionsTiled+2, actionsTiled+3]
#            actionsTiled = actionsTiled - 4 * (actionsTiled>3)
#            rewardsTiled = np.r_[rewardsTiled,rewardsTiled,rewardsTiled,rewardsTiled]
#            donesTiled = np.r_[donesTiled,donesTiled,donesTiled,donesTiled]            
            
            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:,-1,:],1) # standard
#            actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q
#            qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]
            
#            # This version takes the max over all glimpses
#            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
#            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # BELLMAN VERSION (disabled; the Monte Carlo targets below are the ones used)
#            targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax

            # MONTE CARLO VERSION
            targets = rewardsTiled

#            # Take min over targets in same group
#            obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
#            unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
#            for i in range(np.shape(uniqueCounts)[0]):
#                targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])
            
            
            qCurrTargets = np.copy(qCurr)
            
            # Copy into cascade with pruning.
            expLen = np.shape(qCurr)[0]
            qCurrTargets[range(expLen),0,actionsTiled] = targets
            for i in range(num_cascade-1):
                mask = targets < qCurrTargets[range(expLen),i,actionsTiled]
                qCurrTargets[range(expLen),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(expLen),i+1,actionsTiled]
            
#            # CNN version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    obses_t_deic,
#                    qCurrTargets
#                    )
            
            # MLP version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                    np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
                    qCurrTargets
                    )
                
        # Update target network periodically.
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
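The deictic shape settings above hang together as a sliding-window count: an 8x8 observation admits (8-5+1)^2 = 16 overlapping 5x5 windows, which is where num_deictic_patches = 16 comes from, and the MLP branch flattens each window before calling getq. A minimal numpy sketch of that bookkeeping (the real build_getDeic_Foc is a TensorFlow op and also builds a second channel, which is not reproduced here):

import numpy as np

def extract_patches(obs, k):
    """Slide a k x k window over an H x W grid with stride 1."""
    h, w = obs.shape
    patches = [obs[r:r + k, c:c + k]
               for r in range(h - k + 1)
               for c in range(w - k + 1)]
    return np.stack(patches)              # (num_patches, k, k)

obs = np.arange(64).reshape(8, 8)         # stand-in for the (8, 8, 1) observation
patches = extract_patches(obs, 5)
print(patches.shape)                      # (16, 5, 5)  ->  num_deictic_patches = 16

# MLP branch: flatten each patch, as in
# np.reshape(obsDeictic, [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]])
flat = patches.reshape(len(patches), -1)
print(flat.shape)                         # (16, 25)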
Example #2
def main():

    #    env = envstandalone.BallCatch()
    env = envstandalone.MultiGhostEvade()
    #    env = envstandalone.GhostEvade()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1000
    #    exploration_fraction=0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,4)
    #    num_deictic_patches=36

    num_actions = env.action_space.n
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #        convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        #        convs=[(32,3,1)],
        hiddens=[16],
        #        hiddens=[64],
        #        dueling=True
        dueling=False)

    q_func = model
    #    lr=1e-3
    lr = 0.001

    def make_obs_ph(name):
        #        return U.BatchInput(deicticShape, name=name)
        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    #    update_target()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current q-values: neural network version
        qCurr = getq(np.array([obs]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        #        # debug
        #        if t > 5000:
        #            print("obs:\n" + str(np.squeeze(obs)))
        #            print("qCurr:\n" + str(qCurr))
        #            print("action: " + str(action) + ", patch: " + str(selPatch))
        #            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
        #            action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            actions = np.int32(np.reshape(actions, [
                batch_size,
            ]))

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # Get targets
            qNextmax = np.max(qNext, 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (
                    1 - myActions) * qCurr[:, i]

            # Update values: neural network version
            td_error_out, obses_out, targets_out = targetTrain(
                obses_t, qCurrTargets)

            td_error_pre = qCurr[range(batch_size), actions] - targets

            #            print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # neural network version
            qCurr = getq(obses_t)

            td_error_post = qCurr[range(batch_size), actions] - targets


#            print("td error post-update: " + str(np.linalg.norm(td_error_post)))

# bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
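The training step in Example #2 is a plain DQN update without a target network: for each sampled transition the taken action gets a one-step TD target while the other actions keep their current estimates, and targetTrain regresses the network onto that matrix. A vectorized numpy sketch of the target construction with made-up values (the per-action masking loop above computes the same thing):

import numpy as np

gamma = 0.98
batch_size, num_actions = 4, 3
rng = np.random.default_rng(0)
qCurr = rng.normal(size=(batch_size, num_actions))
qNext = rng.normal(size=(batch_size, num_actions))
rewards = rng.normal(size=batch_size)
dones = np.array([0., 0., 1., 0.])        # episode-termination flags
actions = np.array([0, 2, 1, 1])

# one-step TD target for the taken action; other actions keep their current value
targets = rewards + (1 - dones) * gamma * np.max(qNext, 1)
qCurrTargets = np.copy(qCurr)
qCurrTargets[np.arange(batch_size), actions] = targets
print(qCurrTargets)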
Example #3
def main():

    env = envstandalone.MultiGhostEvade()
#    env = envstandalone.GhostEvade()
#    env = envstandalone.BallCatch()
    
#    max_timesteps=40000
    max_timesteps=80000
    learning_starts=1000
    buffer_size=50000
#    exploration_fraction=0.2
    exploration_fraction=0.4
    exploration_final_eps=0.02
    print_freq=10
    gamma=.98
#    target_network_update_freq=500
#    target_network_update_freq=100
#    target_network_update_freq=10
    target_network_update_freq=1
    learning_alpha = 0.2
    
    batch_size=32
#    batch_size=64
#    batch_size=1024
    train_freq=1

#    obsShape = (8,8,1)
    obsShape = env.observation_space.shape
#    deicticShape = (3,3,2)
#    deicticShape = (3,3,4)
#    deicticShape = (4,4,2)
#    deicticShape = (4,4,4)
    deicticShape = (5,5,2)
#    deicticShape = (6,6,2)
#    deicticShape = (8,8,2)
#    num_deictic_patches = 36
#    num_deictic_patches = 25
    num_deictic_patches = 16
#    num_deictic_patches = 9
#    num_deictic_patches = 1

    num_cascade = 5
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu=16
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Dictionary-based value function
    q_func = {}
    
    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def getTabularKeys(obsDeictic):
        obsDeicticTiled = np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
        obsBits = np.packbits(obsDeicticTiled,1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the type cast below (UINT64) must be large enough to support the size of obsBits
            # if it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i])
        return obsKeys
    
    def getTabular(obsDeictic):
        keys = getTabularKeys(obsDeictic)
        return np.array([q_func[x] if x in q_func else 1000*np.ones([num_cascade,num_actions]) for x in keys])
    
    def trainTabular(obsDeictic,qCurrTargets):
        keys = getTabularKeys(obsDeictic)
        alpha=0.5
        for i in range(len(keys)):
            if keys[i] in q_func:
                q_func[keys[i]] = (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func[keys[i]] = qCurrTargets[i]


    sess = U.make_session(num_cpu)
    sess.__enter__()

    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
    
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current observations
        obsDeictic = getDeic([obs])
        qCurr = getTabular(obsDeictic)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        # Get next observations
        obsNextDeictic = getDeic([new_obs])
        qNext = getTabular(obsNextDeictic)

        # Calculate TD target
        qNextmax = np.max(qNext[:,-1,:],1) # USE CASCADE
        targets = rew + (1-done) * gamma * qNextmax

        # Update dictionary value function
        qCurrTargets = np.copy(qCurr)

        # Copy into cascade with pruning.
        qCurrTargets[:,0,action] = targets
        for i in range(num_cascade-1):
            mask = targets < qCurr[:,i,action]
            qCurrTargets[:,i+1,action] = \
                mask*targets + \
                (1-mask)*qCurr[:,i+1,action]
        
#        qCurrTargets[:,action] = np.minimum(targets,qCurrTargets[:,action])
        
        
        trainTabular(obsDeictic,qCurrTargets)

#        # no-op debugging hook
#        if t > 3000:
#            obsDeictic

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
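Example #3 stores the value function in a plain Python dictionary keyed by an integer built from each binary deictic patch: the patch is flattened, packed eight cells per byte with np.packbits, and the bytes are combined into one integer. A minimal sketch of that keying on a made-up patch; as the comment in getTabularKeys warns, the accumulator must be wide enough for the packed patch (a 64-bit key covers at most 64 binary cells, which suffices for the (5,5,2) patches used here):

import numpy as np

def tabular_key(patch):
    """Pack a binary patch into a single integer key (fits in 64 bits for <= 64 cells)."""
    bits = np.packbits(patch.reshape(1, -1), axis=1)   # 8 cells per byte
    key = np.uint64(0)
    for i in range(bits.shape[1]):
        key = key + np.uint64(256) ** np.uint64(i) * np.uint64(bits[0, i])
    return key

patch = np.zeros((5, 5, 2), dtype=np.uint8)            # made-up binary deictic patch
patch[2, 2, 0] = 1
print(tabular_key(patch))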
Example #4

def main():

    env = envstandalone.MultiGhostEvade()
    #    env = envstandalone.GhostEvade()
    #    env = envstandalone.BallCatch()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    exploration_fraction=0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    #    target_network_update_freq=500
    #    target_network_update_freq=100
    #    target_network_update_freq=10
    target_network_update_freq = 1
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    obsShape = (8,8,2)
    #    deicticShape = (3,3,2)
    #    deicticShape = (3,3,4)
    #    deicticShape = (4,4,2)
    #    deicticShape = (4,4,4)
    deicticShape = (8, 8, 2)
    #    num_deictic_patches = 36
    #    num_deictic_patches = 25
    num_deictic_patches = 1

    #    num_actions = 4
    #    num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #    model = models.cnn_to_mlp_2pathways(
        #        convs=[(16,3,1)],
        convs=[(32, 3, 1)],
        #        convs=[(32,4,1)],
        #        convs=[(16,4,1)],
        hiddens=[16],
        dueling=True)

    # MLP version
    #    model = models.mlp([8, 16])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([16, 32])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([32, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):
        return U.BatchInput(deicticShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)
#        return U.BatchInput([num_cascade,num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq_DQN(make_obs_ph=make_obsDeic_ph,
                          q_func=q_func,
                          num_actions=num_actions)

    targetTrain = build_targetTrain_DQN(
        make_obs_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    get_2channelobs = build_get_2channelobs(make_obs_ph=make_obs_ph)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obs2channel = get_2channelobs([obs])

        # CNN version
        #        qCurr = getq(np.array([obs]))
        qCurr = getq(np.array(obs2channel))

        #        # MLP version
        #        qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            actions = np.int32(np.reshape(actions, [
                batch_size,
            ]))

            obses_t_deic = get_2channelobs(obses_t)
            obses_tp1_deic = get_2channelobs(obses_tp1)

            #            # Put observations in deictic form
            #            obses_t_deic = getDeic(obses_t)
            #            obses_tp1_deic = getDeic(obses_tp1)
            #            obses_t_deic = getDeic(obses_t)[:,:,:,0:2]
            #            obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2]
            #
            #            # Reshape everything to (1152,) form
            #            donesTiled = np.repeat(dones,num_deictic_patches)
            #            rewardsTiled = np.repeat(rewards,num_deictic_patches)
            #            actionsTiled = np.repeat(actions,num_deictic_patches)

            # Get curr, next values: CNN version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)
            #            qNext = getq(obses_tp1)
            #            qCurr = getq(obses_t)

            #            # Get curr, next values: MLP version
            #            qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            #            qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext, 1)  # standard
            #            actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q
            #            qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            #            targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax
            targets = rewards + (1 - dones) * gamma * qNextmax

            #            # Take min over targets in same group
            #            obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
            #            unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
            #            for i in range(np.shape(uniqueCounts)[0]):
            #                targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])

            #            qCurrTargets = np.copy(qCurr)
            #            qCurrTargets[:,np.int32(actions)] = targets
            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (
                    1 - myActions) * qCurr[:, i]

#            # Copy into cascade with pruning.
#            qCurrTargets[range(batch_size*num_deictic_patches),0,actionsTiled] = targets
#            for i in range(num_cascade-1):
#                mask = targets < qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled]
#                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
#                    mask*targets + \
#                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]

            # CNN version
            td_error_out = targetTrain(obses_t_deic, qCurrTargets)

#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
#                    qCurrTargets
#                    )

#        # Update target network periodically.
#        if t > learning_starts and t % target_network_update_freq == 0:
#            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
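All four examples share the same action-selection pattern: a linearly annealed exploration rate plus a tiny noise term added to the q-values so that argmax breaks ties randomly (the deictic versions additionally max over patches first). A self-contained sketch of that pattern; linear_eps mirrors what the scripts get from baselines' LinearSchedule and is an assumption about that helper rather than a copy of it:

import numpy as np

def linear_eps(t, schedule_timesteps, initial_p=1.0, final_p=0.02):
    """Linearly anneal exploration from initial_p to final_p over schedule_timesteps."""
    frac = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + frac * (final_p - initial_p)

def select_action(qCurr, t, num_actions, schedule_timesteps, rng):
    qNoise = qCurr + rng.random(qCurr.shape) * 0.01    # break ties randomly
    action = int(np.argmax(qNoise))
    if rng.random() < linear_eps(t, schedule_timesteps):
        action = int(rng.integers(num_actions))
    return action

rng = np.random.default_rng(0)
q = np.zeros(4)                                        # made-up q-values, 4 actions
print([select_action(q, t, 4, 16000, rng) for t in (0, 8000, 16000)])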